做笔记了1
基础不牢,地动山摇
创建数据库连接
开启多进程
#-*-coding:utf-8-*-
import json
import time
import multiprocessing

import requests
from lxml import etree


class HandelLaGou(object):
    """Crawl "python" job postings from lagou.com, one city at a time.

    A single ``requests`` session is reused so that the cookies set by the
    HTML listing page are sent along with the follow-up Ajax requests.
    """

    def __init__(self):
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.18 Safari/525.19'
        }
        # Filled in by handle_city().
        self.city_list = []

    def handle_city(self):
        """Fetch the "all cities" page and store the city names in ``city_list``."""
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        city_names = city_result.xpath('//ul[contains(@class, "city_list")]/li/a/text()')
        # lxml returns "smart" strings that keep a reference to their parent
        # element; convert them to plain str so the names (and this object)
        # can be handed to worker processes.
        # NOTE(review): pickling failure of smart strings depends on the lxml
        # build -- plain str is safe either way.
        self.city_list = [str(name) for name in city_names]
        # Start the per-city crawl with a clean cookie jar.
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Print every "python" job posting listed for ``city``.

        The HTML listing page is requested first (this also seeds the session
        cookies), the total page count is read from it, and then each result
        page is fetched from the Ajax endpoint.

        Returns None. Returns early when the listing page shows no page count.
        """
        first_request_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s" % city
        first_response = self.handle_request(method="GET", url=first_request_url)
        if first_response is None:
            # etree.HTML() produced no document to query.
            return
        total_page = first_response.xpath('//span[contains(@class, "span totalNum")]/text()')
        print(total_page)
        if not total_page:
            # No page counter on the page: nothing to crawl for this city.
            return

        # Both URLs depend only on the city, so build them once.
        page_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false" % city
        referer_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s" % city
        # Sent as UTF-8 bytes -- presumably because city names are non-ASCII
        # and cannot go into a header as a plain str.
        self.header['Referer'] = referer_url.encode('utf-8')

        for page_number in range(1, int(total_page[0]) + 1):
            data = {
                "pn": page_number,
                "kd": "python"
            }
            print(page_number)
            response = self.handle_request(method="POST", url=page_url, data=data, info=city)
            lagou_data = json.loads(response)
            job_list = lagou_data['content']['positionResult']['result']
            for job in job_list:
                print(job)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the shared session, retrying while throttled.

        Args:
            method: "GET" or "POST".
            url: Target URL.
            data: Form payload for POST requests.
            info: City name; when given, it is used to re-request the listing
                page (to obtain fresh cookies) after a throttled response.

        Returns:
            The parsed lxml HTML tree for GET, the raw response text for POST.

        Raises:
            ValueError: If ``method`` is neither "GET" nor "POST".
        """
        if method not in ("GET", "POST"):
            raise ValueError("unsupported method: %r" % (method,))

        # Retries are unbounded: the loop only ends on a non-throttled response.
        while True:
            if method == "GET":
                response = self.lagou_session.get(url=url, headers=self.header)
            else:
                response = self.lagou_session.post(url=url, headers=self.header, data=data)

            # '频繁' ("frequent") presumably marks the site's rate-limit notice.
            if '频繁' in response.text:
                self.lagou_session.cookies.clear()
                if info is not None:
                    # Re-seed the cookies with a plain GET. This deliberately
                    # does not go through handle_request(), so a throttled
                    # refresh cannot recurse.
                    first_request_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s" % info
                    self.lagou_session.get(url=first_request_url, headers=self.header)
                time.sleep(15)
                continue

            if method == "GET":
                return etree.HTML(response.text)
            return response.text


if __name__ == '__main__':
    lagou = HandelLaGou()
    lagou.handle_city()
    pool = multiprocessing.Pool(2)
    pending = [
        (city, pool.apply_async(lagou.handle_city_job, args=(city,)))
        for city in lagou.city_list
    ]
    pool.close()
    pool.join()
    # apply_async() stores worker exceptions in the result object; fetch them
    # so that a failed city is reported instead of disappearing silently.
    for city, result in pending:
        try:
            result.get()
        except Exception as exc:
            print("city %s failed: %r" % (city, exc))
import requests
from lxml import etree


class HandelLaGou(object):
    """First step of the lagou.com crawler: collect the list of cities."""

    def __init__(self):
        # The session keeps cookies between requests.
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.18 Safari/525.19'
        }
        # Filled in by handle_city().
        self.city_list = []

    def handle_city(self):
        """Fetch the "all cities" page and store the city names in ``city_list``."""
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        city_names = city_result.xpath('//ul[contains(@class, "city_list")]/li/a/text()')
        self.city_list = list(city_names)

    def handle_city_job(self, city):
        """Placeholder for the per-city job crawl; not implemented in this step."""

    def handle_request(self, method, url, data=None, info=None):
        """Request ``url`` through the shared session and return the parsed HTML.

        Only "GET" is supported in this step; ``data`` and ``info`` are accepted
        but not used.

        Raises:
            ValueError: If ``method`` is not "GET".
        """
        if method != "GET":
            raise ValueError("unsupported method: %r" % (method,))
        response = self.lagou_session.get(url=url, headers=self.header)
        return etree.HTML(response.text)


if __name__ == '__main__':
    lagou = HandelLaGou()
    lagou.handle_city()
    print(lagou.city_list)
import re

import requests
# Suppress the warning that urllib3 emits for unverified HTTPS requests.
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Alternative approach taken from another developer, kept for reference only.
# It is more involved than needed here, so the simple call above is used.
# requests.packages.urllib3.disable_warnings()
# requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'
# try:
#     requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += 'HIGH:!DH:!aNULL'
# except AttributeError:
#     # no pyopenssl support used / needed / available
#     pass


class Handle_Lagou(object):
    """Collect the list of cities from lagou.com using a regular expression."""

    def __init__(self):
        # The session keeps the cookies between requests.
        self.lagou_session = requests.session()
        self.header = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # Filled in by handle_city().
        self.city_list = []

    def handle_city(self):
        """Fetch the "all cities" page and store every city name in ``city_list``."""
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)

    def handle_request(self, method, url, data=None, info=None):
        """Request ``url`` through the shared session and return the body text.

        Only "GET" is supported; ``data`` and ``info`` are accepted but not used.

        Raises:
            ValueError: If ``method`` is not "GET".
        """
        if method != "GET":
            raise ValueError("unsupported method: %r" % (method,))
        # NOTE(review): verify=False turns off TLS certificate verification,
        # so the server's identity is not checked. Kept as in the original;
        # confirm this is acceptable before reusing the code elsewhere.
        response = self.lagou_session.get(url=url, headers=self.header, verify=False)
        return response.text


if __name__ == '__main__':
    lagou = Handle_Lagou()
    lagou.handle_city()
    print(lagou.city_list)