我的代码如下:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..utils.common import login_lagou
from scrapy.http import Request
class LagouSpider(CrawlSpider):
    """Crawl spider for lagou.com job postings.

    Logs in via ``login_lagou()`` before issuing the seed request, then
    follows listing ("zhaopin") and company ("gongsi") pages and hands
    job-detail ("jobs") pages to :meth:`parse_job`.
    """
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        # Job-listing pages: follow links only.
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        # Company pages: follow links only.  Raw string + escaped dot: the
        # original non-raw "gongsi/\d+.html" used an invalid "\d" escape and
        # let "." match any character before "html".
        Rule(LinkExtractor(allow=(r'gongsi/\d+\.html',)), follow=True),
        # Concrete job-detail pages: parsed by parse_job.
        Rule(LinkExtractor(allow=(r'jobs/\d+\.html',)), callback='parse_job', follow=True),
    )

    # Headers attached to the seed request.
    # NOTE(review): Host/Origin/Referer point at passport.lagou.com (the
    # login host) even though the seed request goes to www.lagou.com --
    # confirm this mismatch is intentional.
    headers = {
        "Host": 'passport.lagou.com',
        "Origin": 'https://passport.lagou.com',
        "Referer": 'https://passport.lagou.com/login/login.html',
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.87 Safari/537.36',
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": 'application/x-www-form-urlencoded;charset=UTF-8'
    }

    def start_requests(self):
        """Log in, then yield the seed request carrying the session cookies.

        Overrides CrawlSpider.start_requests so the crawl starts with an
        authenticated session; the callback stays ``self.parse`` so the
        CrawlSpider rule machinery still runs.
        """
        self.cookies = login_lagou()
        self.logger.debug('lagou cookies: %s', self.cookies)
        # Update a copy instead of mutating the class-level dict in place.
        self.headers = dict(self.headers, Cookie=self.cookies)
        self.logger.debug('request headers: %s', self.headers)
        yield Request(url=self.start_urls[0],
                      cookies=self.cookies,
                      headers=self.headers,
                      callback=self.parse,
                      dont_filter=True)

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        """Parse a lagou.com job-detail page.

        Currently a stub: logs the response and returns an empty item dict.
        The commented xpaths below are template placeholders for the fields
        to extract.
        """
        i = {}
        self.logger.debug('parse_job got %s', response)
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
settings.py 的配置文件:

HTTPERROR_ALLOWED_CODES = [302]  # HTTPERROR_ALLOWED_CODES = [400]
COOKIES_ENABLED = False
REDIRECT_ENABLED = False  # 禁止重定向
DOWNLOAD_DELAY = 6  # 设置时间间隔为6s,防止被禁
DOWNLOAD_TIMEOUT = 10  # 设置超时时间
RETRY_ENABLED = True  # 设置开启重试
RETRY_TIMES = 3  # 设置重试次数
Mr1011
随时随地看视频慕课网APP
相关分类