做笔记了1

基础不牢,地动山摇

创建数据库连接
开启多进程
#-*-coding:utf-8-*-
import json
import time
import multiprocessing
import requests
from lxml import etree
class HandelLaGou(object):
    """Crawl Python job postings from lagou.com, city by city.

    A single ``requests`` session is reused so cookies set by the listing
    page are sent along with the subsequent JSON requests.
    """

    def __init__(self):
        # One session for all requests so cookies persist between calls.
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.18 Safari/525.19'
        }
        # Filled by handle_city(); starts as an empty list so the attribute
        # always has the same type.
        self.city_list = []

    def handle_city(self):
        """Fetch the all-cities page and store the city names in ``city_list``.

        The session cookies are cleared afterwards, so each city crawl
        starts without the cookies set by this request.
        """
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        if city_result is None:
            # Nothing could be parsed from the response body.
            self.city_list = []
        else:
            city_names = city_result.xpath('//ul[contains(@class, "city_list")]/li/a/text()')
            self.city_list = list(city_names)
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Print every Python job posting listed for ``city``.

        Loads the listing page to read the total page count, then POSTs to
        the JSON endpoint once per page and prints each job record.

        Args:
            city: City name as it appears on the all-cities page.

        Returns:
            None. Returns early when the page count cannot be determined.
        """
        listing_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s" % city
        first_response = self.handle_request(method="GET", url=listing_url, info=city)
        if first_response is None:
            return
        total_page = first_response.xpath('//span[contains(@class, "span totalNum")]/text()')
        print(total_page)
        try:
            page_count = int(total_page[0])
        except (IndexError, ValueError):
            # No page counter on the page, or it is not a number.
            return

        # Loop-invariant values are built once instead of once per page.
        page_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false" % city
        # The Referer is sent as UTF-8 bytes; presumably because the city
        # name may contain non-ASCII characters -- TODO confirm.
        self.header['Referer'] = listing_url.encode('utf-8')

        for page_number in range(1, page_count + 1):
            data = {
                "pn": page_number,
                "kd": "python"
            }
            print(page_number)
            response = self.handle_request(method="POST", url=page_url, data=data, info=city)
            try:
                job_list = json.loads(response)['content']['positionResult']['result']
            except (ValueError, KeyError, TypeError) as exc:
                # The body was not the expected JSON document; skip this
                # page rather than abort the whole city.
                print("skipping page %s of %s: %r" % (page_number, city, exc))
                continue
            for job in job_list:
                print(job)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the shared session, retrying while throttled.

        If the response body contains '频繁', the cookies are cleared, the
        listing page for ``info`` is requested again (when ``info`` is
        given), and the request is retried after a 15 second pause. There
        is no retry limit.

        Args:
            method: "GET" or "POST".
            url: Target URL.
            data: Form data for POST requests.
            info: City name used to build the listing URL that is
                re-requested after a throttled response.

        Returns:
            For GET, the result of ``etree.HTML`` on the response text;
            for POST, the response text.

        Raises:
            ValueError: If ``method`` is neither "GET" nor "POST".
        """
        while True:
            if method == "GET":
                response = self.lagou_session.get(url=url, headers=self.header)
            elif method == "POST":
                response = self.lagou_session.post(url=url, headers=self.header, data=data)
            else:
                raise ValueError("unsupported HTTP method: %r" % (method,))

            body = response.text
            if '频繁' in body:
                self.lagou_session.cookies.clear()
                if info is not None:
                    # Re-request the listing page for this city before
                    # retrying; skipped when no city is known, instead of
                    # requesting a URL with "city=None".
                    first_request_url = "https://www.lagou.com/jobs/list_python?&px=default&city=%s" % info
                    self.handle_request(method="GET", url=first_request_url, info=info)
                time.sleep(15)
                continue

            # Parse only the response that is actually returned.
            return etree.HTML(body) if method == "GET" else body
if __name__ == '__main__':
    # Fetch the city list once in the parent, then crawl cities in a pool
    # of two worker processes.
    lagou = HandelLaGou()
    lagou.handle_city()
    pool = multiprocessing.Pool(2)
    # Keep each AsyncResult: apply_async stores a worker's exception in it,
    # and dropping the result would hide the failure.
    pending = [
        (city, pool.apply_async(lagou.handle_city_job, args=(city,)))
        for city in lagou.city_list
    ]
    pool.close()
    pool.join()
    # All tasks are finished after join(); get() returns immediately or
    # re-raises the exception raised in the worker.
    for city, result in pending:
        try:
            result.get()
        except Exception as exc:
            print("crawl failed for %s: %r" % (city, exc))
import requests
from lxml import etree
class HandelLaGou(object):
    """Early version of the lagou.com crawler: fetches the city list only."""

    def __init__(self):
        # One session for all requests so cookies persist between calls.
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.18 Safari/525.19'
        }
        # Alias for the original, misspelt attribute name; refers to the
        # same dict as ``header``.
        self.hander = self.header
        # Filled by handle_city().
        self.city_list = []

    def handle_city(self):
        """Fetch the all-cities page and store the city names in ``city_list``."""
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        city_names = city_result.xpath('//ul[contains(@class, "city_list")]/li/a/text()')
        self.city_list = list(city_names)

    def handle_city_job(self, city):
        """Placeholder for the per-city crawl; it has no implementation yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("handle_city_job is not implemented in this version")

    def handle_request(self, method, url, data=None, info=None):
        """Send a GET request through the shared session and parse the body.

        Args:
            method: Must be "GET".
            url: Target URL.
            data: Unused here.
            info: Unused here.

        Returns:
            The result of ``etree.HTML`` on the response text.

        Raises:
            ValueError: If ``method`` is not "GET".
        """
        if method != "GET":
            raise ValueError("unsupported HTTP method: %r" % (method,))
        response = self.lagou_session.get(url=url, headers=self.header)
        return etree.HTML(response.text)
if __name__ == '__main__':
    # Build the crawler, load the city names, and print them.
    crawler = HandelLaGou()
    crawler.handle_city()
    print(crawler.city_list)
import re
import requests
# Disable urllib3's InsecureRequestWarning here.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Another author's approach, kept for reference. It is more complicated and
# I did not understand it, so the simple version above is used instead.
# requests.packages.urllib3.disable_warnings()
# requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'
# try:
# requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += 'HIGH:!DH:!aNULL'
# except AttributeError:
# # no pyopenssl support used / needed / available
# pass
class Handle_Lagou(object):
    """Fetch the list of cities from lagou.com using a regular expression."""

    def __init__(self):
        # Use a session so cookies are kept between requests.
        self.lagou_session = requests.session()
        self.header = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # Filled by handle_city(); starts as an empty list so the attribute
        # always has the same type.
        self.city_list = []

    def handle_city(self):
        """Download the all-cities page and store the city names in ``city_list``."""
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the shared session and return the body text.

        Args:
            method: "GET" or "POST".
            url: Target URL.
            data: Form data for POST requests.
            info: Unused here.

        Returns:
            The response body as text.

        Raises:
            ValueError: If ``method`` is neither "GET" nor "POST".
        """
        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the original sets it deliberately; confirm it
        # is still wanted.
        if method == "GET":
            response = self.lagou_session.get(url=url, headers=self.header, verify=False)
        elif method == "POST":
            response = self.lagou_session.post(url=url, headers=self.header, data=data, verify=False)
        else:
            # Fail here instead of handing back nothing and breaking later
            # in the caller.
            raise ValueError("unsupported HTTP method: %r" % (method,))
        return response.text
if __name__ == '__main__':
    # Build the crawler, load the city names, and print them.
    crawler = Handle_Lagou()
    crawler.handle_city()
    print(crawler.city_list)