Q&A details
From: 2-3 Adding proxy logic to hide the crawler / Using multiprocessing to speed up scraping

The function passed to apply_async does not execute

# coding=utf-8
import json
import re
import time
import multiprocessing

import requests


class HandleLaGou(object):
    def __init__(self):
        self.lagou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        self.city_list = ""

    # Get the list of all cities nationwide
    def handle_city(self):
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = 'https://www.lagou.com/jobs/allCity.html'
        city_result = self.handle_request(method='GET', url=city_url)
        # Extract the city list with the regular expression
        self.city_list = city_search.findall(city_result)
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        first_request_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
        first_response = self.handle_request(method='GET', url=first_request_url)
        total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
        try:
            total_page = total_page_search.search(first_response).group(1)
        except:
            return
        else:
            for i in range(1, int(total_page) + 1):
                data = {
                    'pn': i,
                    'kd': 'python'
                }
                page_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false' % city
                referer_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
                self.header['Referer'] = referer_url.encode()
                response = self.handle_request(method='POST', url=page_url, data=data, info=city)
                print response
                lagou_data = json.loads(response)
                job_list = lagou_data['content']['positionResult']['result']
                for job in job_list:
                    print job

    def handle_request(self, method, url, data=None, info=None):
        global response
        while True:
            # Abuyun proxy
            # Proxy server
            proxyHost = "http-dyn.abuyun.com"
            proxyPort = "9020"

            # Proxy tunnel authentication credentials
            proxyUser = "H6451437A9W24E7D"
            proxyPass = "A86CD1F6AF3AD760"

            proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
                "host": proxyHost,
                "port": proxyPort,
                "user": proxyUser,
                "pass": proxyPass,
            }

            proxies = {
                "http": proxyMeta,
                "https": proxyMeta,
            }
            try:
                if method == 'GET':
                    response = self.lagou_session.get(
                        url=url,
                        headers=self.header,
                        proxies=proxies,
                        timeout=6
                    )
                elif method == 'POST':
                    response = self.lagou_session.post(
                        url=url,
                        headers=self.header,
                        data=data,
                        proxies=proxies,
                        timeout=6
                    )
            except:
                self.lagou_session.cookies.clear()
                first_request_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % info
                self.handle_request(method='GET', url=first_request_url)
                time.sleep(10)
                continue
            response.encoding = 'utf-8'
            if '频繁' in response.text:
                print response.text
                self.lagou_session.cookies.clear()
                first_request_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % info
                self.handle_request(method='GET', url=first_request_url)
                time.sleep(10)
                continue
            return response.text


if __name__ == '__main__':
    lagou = HandleLaGou()
    lagou.handle_city()
    # Use multiprocessing
    pool = multiprocessing.Pool(1)
    for city in lagou.city_list:
        pool.apply_async(lagou.handle_city_job, args=(city,1))
    pool.close()
    pool.join()

Execution result

/usr/local/bin/python2.7 /Users/imooc_lagou/handle_crawl_lagou.py

Process finished with exit code 0


Asked by: Micksun 2019-12-13 00:14

1 answer

  • qq_慕妹9439293
    2020-03-14 17:58:02

    pool.apply_async(lagou.handle_city_job, args={city})
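
    Why the original call produces no output: apply_async returns an AsyncResult immediately, and any exception raised while dispatching or running the task is stored in that result rather than printed, so the main process exits with code 0 without ever seeing the error. In the question's code, handle_city_job accepts a single city argument, so args=(city, 1) passes one argument too many and raises a TypeError inside the pool. (Under Python 2.7, which the output shows, the pool may also fail to pickle the bound method lagou.handle_city_job, and that failure is swallowed the same way.) A minimal sketch, reusing the HandleLaGou class from the question, that surfaces such errors by keeping each AsyncResult and calling .get() on it:

    if __name__ == '__main__':
        lagou = HandleLaGou()
        lagou.handle_city()
        pool = multiprocessing.Pool(1)
        results = []
        for city in lagou.city_list:
            # handle_city_job takes exactly one argument, so pass a 1-tuple
            results.append(pool.apply_async(lagou.handle_city_job, args=(city,)))
        pool.close()
        pool.join()
        for r in results:
            # get() re-raises any exception the task hit, e.g. the TypeError
            # caused by args=(city, 1), instead of letting it pass silently
            r.get()

    The answer's args={city} also works because a one-element set is still an iterable of length 1, but the conventional spelling is the one-element tuple args=(city,).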