CrawlSpider爬取拉勾被重定向,我已经用selenium拿到了cookies,求教大神,帮帮我吧

我的代码如下:

# -*- coding: utf-8 -*-

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..utils.common import login_lagou
from scrapy.http import Request

class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True),                            # 招聘的Rule
        Rule(LinkExtractor(allow=("gongsi/\d+.html",)), follow=True),                       # 公司的Rule
        Rule(LinkExtractor(allow=(r'jobs/\d+.html',)), callback='parse_job', follow=True),  # 具体职位的Rule
    )

    headers = {
        "Host": 'passport.lagou.com',
        "Origin": 'https://passport.lagou.com',
        "Referer": 'https://passport.lagou.com/login/login.html',
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.87 Safari/537.36',
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": 'application/x-www-form-urlencoded;charset=UTF-8'
    }

    def start_requests(self):
        self.cookies = login_lagou()
        print(self.cookies)
        self.headers.update({
            "Cookie": self.cookies
        })

        print(self.headers)
        yield Request(url=self.start_urls[0],
                      cookies=self.cookies,
                      headers=self.headers,
                      callback=self.parse,
                      dont_filter=True)


    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        # 解析拉钩网的职位
        i = {}
        print(response)
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        return i

setting.py的配置文件

HTTPERROR_ALLOWED_CODES = [302]
# HTTPERROR_ALLOWED_CODES = [400]

COOKIES_ENABLED = False
REDIRECT_ENABLED = False   # 禁止重定向

DOWNLOAD_DELAY = 6      # 设置时间间隔为6s,防止被禁
DOWNLOAD_TIMEOUT = 10   # 设置超时时间
RETRY_ENABLED = True    # 设置开启重试
RETRY_TIMES = 3         # 设置重试次数


Mr1011
浏览 2021回答 1
1回答
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Python