XPath路径表达式
Xpath使用路径表达式来选取XML文档(或是HTML文档)中的节点或节点集
路径表达式:
/div:从根节点开始选取div节点
//a:选取文档中所有的a节点而不考虑其位置
@class:选取名为class的属性
. :选取当前节点
.. :选取当前节点的父节点
ctrl+shift+x 打开Xpath
/div/a :从根节点开始选取div节点下的a节点
/div/a[@class='header-wrapper' ]
请求头
etree.HTML()用来把字符串解析成HTML文档,并自动补全缺失的html和body标签
# Sample HTML fragment used by the XPath exercises below.
data = """ <div> <ul> <li class="item-0"><a href="link1">first</a></li> <li class="item-in"><a href="link2"><span class="bold">second</span></a></li> <li class="item-0"><a href="link3">third</a></li> </ul> </div> """
# Parse the fragment into an element tree that supports .xpath() queries.
html = etree.HTML(data)
2.获取第一个li的a标签
1.print(html.xpath("//li/a[@href='link1']")) 2.print(html.xpath("//li[1]/a[1]"))
3.获取第二个li的span标签
1.#因为span是li的后代元素,所以要用两个// 2.print(html.xpath("//li//span"))
4.获取最后一个li里面的href(last的使用)
print(html.xpath("//li[last()]/a/@href"))
设置代理——隐藏请求
第二遍没太听懂
百度的返回头如何设置 cookies
关闭校验:verify=False
指定证书:verify='xxx'(某个路径)
session信息
Requests模块构造URL
PUT方法
HTTP POST方法用于提交数据(如表单)
Head方法请求部分信息
HTTP方法GET
requests是一个Python第三方库
import pymongo

# Connect a client to the local MongoDB server.
pymongo_client = pymongo.MongoClient("mongodb://localhost:27017")
# Select the database by name.
pymongo_db = pymongo_client["imooc"]
# Select the collection (the counterpart of a table).
pymongo_collection = pymongo_db["pymongo_test"]

# A single sample document.
data = {
    "name": "imooc",
    "flag": 1,
    "url": "https://www.imooc.com"
}

# Several sample documents for bulk insertion.
mylist = [
    {"name": "imooc", "flag": "123", "url": "https://www.imooc.com"},
    {"name": "taobao", "flag": "456", "url": "https://www.imooc.com"},
    {"name": "qq", "flag": "761", "url": "https://www.imooc.com"},
    {"name": "知乎", "flag": "354", "url": "https://www.imooc.com"},
    {"name": "微博", "flag": "954", "url": "https://www.imooc.com"}
]

# Insert examples (disabled):
# pymongo_collection.insert_one(data)
# pymongo_collection.insert_many(mylist)

# Query examples (disabled); the second argument is the field projection:
# result = pymongo_collection.find({}, {"_id": 0, "name": 1, "url": 1, "flag": 1})
# result = pymongo_collection.find_one()
# print(result)
# for item in result:
#     print(item)

# Filter: flag greater than 100.
# NOTE(review): the flags in mylist are strings while this filter compares
# against the number 100 -- confirm it matches the documents you expect.
# find_result = pymongo_collection.find({"flag": {"$gt": 100}})
# Filter: name starting with "t".
# find_result = pymongo_collection.find({"name": {"$regex": "^t"}})
# for item in find_result:
#     print(item)

# Update / delete examples (disabled):
# pymongo_collection.update_one({"name": {"$regex": "^t"}}, {"$set": {"name": "baidu"}})
# pymongo_collection.delete_one({"name": "微博"})

# Remove every document from the collection (empty filter matches all).
pymongo_collection.delete_many({})
import requests
from lxml import etree
from pymongo.collection import Collection
import pymongo

print("开始0")


class Dangdang(object):
    """Scrape the Dangdang 24-hour bestseller ranking and store the entries.

    The MongoDB client and database handle are class attributes, so they are
    created once when the class body runs and shared by all instances.
    """

    mongo_client = pymongo.MongoClient(host="localhost", port=27017)
    dangdang_db = mongo_client["dangdang_db"]

    def __init__(self):
        """Prepare the request headers and the target MongoDB collection."""
        # Headers sent with every ranking-page request.
        self.header = {
            "Host": "bang.dangdang.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Referer": "http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-2",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.dangdang = Collection(Dangdang.dangdang_db, "dangdang")

    def get_dangdang(self, page):
        """Request one ranking page and return its book ``<li>`` elements.

        Args:
            page: Page number substituted into the ranking URL.

        Returns:
            A list of ``<li>`` elements found under the ranking ``<ul>``, or
            an empty list when the response is not OK, so callers can always
            iterate over the result.
        """
        url = "http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%s" % page
        response = requests.get(url=url, headers=self.header)
        # A Response is falsy for error status codes; previously this path
        # fell through and returned None, which broke parse_item().
        if not response:
            return []
        # Build an element tree from the raw response bytes.
        html1 = etree.HTML(response.content)
        return html1.xpath("//ul[@class='bang_list clearfix bang_list_mode']/li")

    def join_list(self, item):
        """Concatenate a list of strings into a single string."""
        return "".join(item)

    def parse_item(self, items):
        """Extract title, comment, author and price from each list entry.

        Args:
            items: Iterable of ``<li>`` elements from ``get_dangdang``;
                ``None`` is treated as empty.

        Returns:
            A list of dicts with the keys ``title``, ``comment``, ``author``
            and ``price``; each value is the joined text of its XPath matches.
        """
        # Documents collected for a later MongoDB insert.
        result_list = []
        for item in items or []:
            # Book title.
            title = item.xpath(".//div[@class='name']/a/@title")
            # Comment text shown beside the star rating.
            comment = item.xpath(".//div[@class='star']/a/text()")
            # Author information.
            author = item.xpath(".//div[@class='publisher_info'][1]/a[1]/@title")
            # Price.
            price = item.xpath(".//div[@class='price']/p[1]/span[1]/text()")
            result_list.append(
                {
                    "title": self.join_list(title),
                    "comment": self.join_list(comment),
                    "author": self.join_list(author),
                    "price": self.join_list(price)
                }
            )
        return result_list

    def insert_data(self, result_list):
        """Insert the parsed documents into the ``dangdang`` collection.

        An empty ``result_list`` is skipped, because ``insert_many`` rejects
        an empty document list.
        """
        if not result_list:
            return
        self.dangdang.insert_many(result_list)


def main():
    """Fetch ranking pages 1-25 and print the parsed entries of each page."""
    d = Dangdang()
    print("开始")
    for page in range(1, 26):
        items = d.get_dangdang(page=page)
        result = d.parse_item(items=items)
        print(result)
        # Persisting is disabled; enable the next line to store the results.
        # d.insert_data(result)


if __name__ == '__main__':
    main()
注意单引号双引号的问题
孙子标签用//
获取最后一个li标签的a标签的href的值
xpath
//div[@class="result c-container new-pmd"]
//dd[@class="lemmaWgt-lemmaTitle-title J-lemma-title"]/a[3]/text()
//div[not(contains(@class,"para"))]
//div[last()-2]
遇到网络问题: ConnectionError异常
如果HTTP请求返回了不成功的状态码: HTTPError异常
若请求超时,则抛出一个Timeout异常
若请求超过了设定的最大重定向次数,则会抛出一个 TooManyRedirects异常
关闭SSL校验:
# verify=False turns off TLS certificate verification for this request.
response = requests.get(url="https://www.baina.org/", verify=False)
print(response.text)
cookie
# Scratch practice.
print(1)