# coding=utf-8
"""Crawl Python job postings from lagou.com, one city at a time.

Every request is sent through an HTTP proxy tunnel.  The script first
downloads the list of cities, then hands each city to a worker process that
pages through the job-search results and prints every job record.
"""
import json
import multiprocessing
import os
import re
import time

import requests

CITY_LIST_URL = 'https://www.lagou.com/jobs/allCity.html'
LISTING_URL = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s'
POSITION_AJAX_URL = ('https://www.lagou.com/jobs/positionAjax.json'
                     '?px=default&city=%s&needAddtionalResult=false')

# Seconds to wait for a single HTTP request.
REQUEST_TIMEOUT = 6
# Seconds to pause before retrying after a failed or rejected request.
RETRY_DELAY = 10

# A response containing this text is retried instead of being returned.
# It must be a unicode literal: on Python 2 a byte string would be decoded as
# ASCII when compared with the unicode response body and raise
# UnicodeDecodeError.
RATE_LIMIT_MARKER = u'频繁'

CITY_PATTERN = re.compile(r'zhaopin/">(.*?)</a>')
TOTAL_PAGE_PATTERN = re.compile(r'class="span\stotalNum">(\d+)</span>')


def _build_proxies():
    """Return the ``proxies`` mapping for the Abuyun proxy tunnel.

    NOTE(security): the fallback credentials below are hard-coded in the
    source.  Each value can be overridden through the environment
    (``ABUYUN_PROXY_HOST``/``_PORT``/``_USER``/``_PASS``); prefer that and
    remove the literals once the deployment supplies them.
    """
    proxy_meta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
        'host': os.environ.get('ABUYUN_PROXY_HOST', 'http-dyn.abuyun.com'),
        'port': os.environ.get('ABUYUN_PROXY_PORT', '9020'),
        'user': os.environ.get('ABUYUN_PROXY_USER', 'H6451437A9W24E7D'),
        'pass': os.environ.get('ABUYUN_PROXY_PASS', 'A86CD1F6AF3AD760'),
    }
    return {'http': proxy_meta, 'https': proxy_meta}


# Built once; the tunnel settings do not change between requests.
PROXIES = _build_proxies()


class HandleLaGou(object):
    """Crawler holding one HTTP session and the headers sent with it."""

    def __init__(self):
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        # Filled by handle_city() with the city names found on the page.
        self.city_list = []

    def handle_city(self):
        """Download the all-cities page and store the names in ``city_list``.

        The session cookies are cleared afterwards.
        """
        city_result = self.handle_request(method='GET', url=CITY_LIST_URL)
        # Extract the city names with a regular expression.
        self.city_list = CITY_PATTERN.findall(city_result)
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Print every job record of every result page for ``city``.

        The listing page is requested first; the total page count is read
        from it.  If the count cannot be found the method returns without
        doing anything else.

        Raises:
            ValueError, KeyError: if a result page is not the expected JSON
                document.
        """
        listing_url = LISTING_URL % city
        first_response = self.handle_request(method='GET', url=listing_url)

        match = TOTAL_PAGE_PATTERN.search(first_response)
        if match is None:
            return

        page_url = POSITION_AJAX_URL % city
        # The Referer is sent as UTF-8 bytes.  The codec is named explicitly
        # because the Python 2 default (ASCII) fails on Chinese city names;
        # a value that is already bytes is left untouched.
        referer = listing_url
        if not isinstance(referer, bytes):
            referer = referer.encode('utf-8')
        self.header['Referer'] = referer

        for page in range(1, int(match.group(1)) + 1):
            data = {
                'pn': page,
                'kd': 'python'
            }
            response = self.handle_request(
                method='POST', url=page_url, data=data, info=city)
            print(response)
            lagou_data = json.loads(response)
            for job in lagou_data['content']['positionResult']['result']:
                print(job)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the proxy, retrying until it is accepted.

        Args:
            method: ``'GET'`` or ``'POST'``.
            url: address to request.
            data: form data, used for ``'POST'`` only.
            info: city name used to build the listing URL that is revisited
                before a retry.  When omitted, ``None`` is formatted into
                that URL as-is.

        Returns:
            The response body decoded as UTF-8 text.

        Raises:
            ValueError: if ``method`` is neither ``'GET'`` nor ``'POST'``.

        The loop has no retry limit: it only ends once a response arrives
        that does not contain ``RATE_LIMIT_MARKER``.
        """
        if method not in ('GET', 'POST'):
            raise ValueError('unsupported HTTP method: %r' % (method,))

        while True:
            try:
                response = self._send(method, url, data)
            except requests.RequestException:
                self._recover(info)
                continue

            response.encoding = 'utf-8'
            if RATE_LIMIT_MARKER in response.text:
                print(response.text)
                self._recover(info)
                continue
            return response.text

    def _send(self, method, url, data=None):
        """Issue one GET or POST on the session through the proxy tunnel."""
        if method == 'POST':
            return self.lagou_session.post(
                url=url,
                headers=self.header,
                data=data,
                proxies=PROXIES,
                timeout=REQUEST_TIMEOUT
            )
        return self.lagou_session.get(
            url=url,
            headers=self.header,
            proxies=PROXIES,
            timeout=REQUEST_TIMEOUT
        )

    def _recover(self, info):
        """Drop the cookies, revisit the listing page once, then pause."""
        self.lagou_session.cookies.clear()
        try:
            # Single best-effort attempt.  It deliberately does not go
            # through handle_request(): recursing from the failure path
            # could nest without bound while the network is down.
            self._send('GET', LISTING_URL % info)
        except requests.RequestException:
            pass  # the caller's loop retries the real request anyway
        time.sleep(RETRY_DELAY)


def _crawl_city(city):
    """Pool worker: crawl ``city`` with a crawler created in the worker.

    A module-level function can be pickled on every Python version, whereas
    a bound method such as ``lagou.handle_city_job`` cannot on Python 2.
    """
    HandleLaGou().handle_city_job(city)


def main():
    """Fetch the city list and crawl each city in a worker process."""
    lagou = HandleLaGou()
    lagou.handle_city()

    # Use multiprocessing for the per-city crawl.
    pool = multiprocessing.Pool(1)
    pending = [
        # args must be a one-element tuple: handle_city_job takes only the
        # city, so an extra positional argument makes every task fail.
        (city, pool.apply_async(_crawl_city, args=(city,)))
        for city in lagou.city_list
    ]
    pool.close()
    pool.join()

    # apply_async stores a task's exception in its result object; unless
    # get() is called the failure is invisible and the script exits with
    # status 0 without having printed anything.
    for city, result in pending:
        try:
            result.get()
        except Exception as exc:
            print('crawl failed for %r: %r' % (city, exc))


if __name__ == '__main__':
    main()
执行结果
/usr/local/bin/python2.7 /Users/imooc_lagou/handle_crawl_lagou.py
Process finished with exit code 0
pool.apply_async(lagou.handle_city_job, args={city})