Micksun
2019-12-13 00:14
# coding=utf-8
import json
import re
import time
import multiprocessing
import requests
class HandleLaGou(object):
    """Crawl Python job postings from lagou.com, one city at a time.

    All HTTP traffic goes through a single ``requests`` session routed via
    the Abuyun proxy tunnel configured below. Results are printed to stdout.
    """

    # Abuyun proxy tunnel: server address and tunnel authentication.
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    _PROXY_HOST = "http-dyn.abuyun.com"
    _PROXY_PORT = "9020"
    _PROXY_USER = "H6451437A9W24E7D"
    _PROXY_PASS = "A86CD1F6AF3AD760"
    _PROXY_META = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": _PROXY_HOST,
        "port": _PROXY_PORT,
        "user": _PROXY_USER,
        "pass": _PROXY_PASS,
    }
    # Built once instead of on every retry iteration.
    _PROXIES = {
        "http": _PROXY_META,
        "https": _PROXY_META,
    }

    _TIMEOUT = 6        # seconds, per request
    _RETRY_DELAY = 10   # seconds to wait before retrying a failed request

    _CITY_URL = 'https://www.lagou.com/jobs/allCity.html'
    _LIST_URL = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s'
    _PAGE_URL = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false'

    _CITY_PATTERN = re.compile(r'zhaopin/">(.*?)</a>')
    _TOTAL_PAGE_PATTERN = re.compile(r'class="span\stotalNum">(\d+)</span>')

    # Text whose presence in a response triggers a cookie reset and retry
    # (the word means "frequent"; presumably part of the site's rate-limit
    # message -- confirm against a real response).
    # Unicode literal so the membership test against ``response.text`` does
    # not force an implicit ASCII decode on Python 2.
    _THROTTLE_MARKER = u'频繁'

    def __init__(self):
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        # Filled by handle_city(); a list so its type matches findall()'s.
        self.city_list = []

    def handle_city(self):
        """Fetch the all-cities page and store the city names in ``city_list``."""
        city_result = self.handle_request(method='GET', url=self._CITY_URL)
        # Extract the city names with the regular expression.
        self.city_list = self._CITY_PATTERN.findall(city_result)
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Print every Python job posting listed for ``city``.

        Loads the city's listing page to read the total page count, then
        POSTs to the paging endpoint once per page and prints the raw
        response followed by each job entry. Returns ``None`` silently when
        the listing page exposes no page count.
        """
        list_url = self._LIST_URL % city
        first_response = self.handle_request(method='GET', url=list_url)
        total_page = self._TOTAL_PAGE_PATTERN.search(first_response)
        if total_page is None:
            return

        # Loop-invariant: same endpoint and Referer for every page.
        page_url = self._PAGE_URL % city
        # Explicit UTF-8: the default codec is ASCII on Python 2 and would
        # fail on non-ASCII city names.
        self.header['Referer'] = list_url.encode('utf-8')

        for page_number in range(1, int(total_page.group(1)) + 1):
            data = {
                'pn': page_number,
                'kd': 'python'
            }
            response = self.handle_request(method='POST', url=page_url, data=data, info=city)
            print(response)
            lagou_data = json.loads(response)
            job_list = lagou_data['content']['positionResult']['result']
            for job in job_list:
                print(job)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the proxy and return the response text.

        Retries indefinitely: whenever the request raises a ``requests``
        error or the response contains the throttle marker, the session
        cookies are reset, the method waits, and the request is sent again.

        :param method: ``'GET'`` or ``'POST'``.
        :param url: target URL.
        :param data: form payload for POST requests.
        :param info: city name; when given, its listing page is re-visited
            after a cookie reset before the retry.
        :returns: the response body decoded as UTF-8.
        :raises ValueError: if ``method`` is neither ``'GET'`` nor ``'POST'``.
        """
        if method not in ('GET', 'POST'):
            raise ValueError('unsupported HTTP method: %r' % (method,))

        while True:
            try:
                if method == 'GET':
                    response = self.lagou_session.get(
                        url=url,
                        headers=self.header,
                        proxies=self._PROXIES,
                        timeout=self._TIMEOUT
                    )
                else:
                    response = self.lagou_session.post(
                        url=url,
                        headers=self.header,
                        data=data,
                        proxies=self._PROXIES,
                        timeout=self._TIMEOUT
                    )
            except requests.exceptions.RequestException:
                # Only network-level failures are retried; KeyboardInterrupt
                # and programming errors propagate to the caller.
                self._reset_session(info)
                continue

            response.encoding = 'utf-8'
            if self._THROTTLE_MARKER in response.text:
                print(response.text)
                self._reset_session(info)
                continue
            return response.text

    def _reset_session(self, city):
        """Clear cookies, re-visit ``city``'s listing page if known, then wait."""
        self.lagou_session.cookies.clear()
        if city is not None:
            try:
                self.lagou_session.get(
                    url=self._LIST_URL % city,
                    headers=self.header,
                    proxies=self._PROXIES,
                    timeout=self._TIMEOUT
                )
            except requests.exceptions.RequestException:
                # Best effort: the caller's retry loop runs again regardless.
                pass
        time.sleep(self._RETRY_DELAY)
if __name__ == '__main__':
    # Entry point: fetch the city list, then crawl each city in a worker pool.
    lagou = HandleLaGou()
    lagou.handle_city()

    # Use a process pool for the per-city crawls.
    pool = multiprocessing.Pool(1)
    # handle_city_job takes exactly one argument, so args must be the
    # one-element tuple (city,). Keep each AsyncResult so that failures can
    # be reported below instead of being silently discarded.
    pending = [
        (city, pool.apply_async(lagou.handle_city_job, args=(city,)))
        for city in lagou.city_list
    ]
    pool.close()
    pool.join()

    # AsyncResult.get() re-raises whatever the worker raised; without this
    # the script exits with code 0 and no output when every task fails.
    # NOTE(review): on Python 2.7 bound methods are not picklable by default,
    # so these tasks may fail with a pickling error there -- confirm on the
    # target interpreter.
    for city, result in pending:
        try:
            result.get()
        except Exception as exc:
            print('%s failed: %r' % (city, exc))
# Execution result (as reported in the original post):
/usr/local/bin/python2.7 /Users/imooc_lagou/handle_crawl_lagou.py
Process finished with exit code 0
pool.apply_async(lagou.handle_city_job, args={city})
Python爬虫实战数据可视化分析
4276 学习 · 29 问题
相似问题