# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Spider that walks the Douban Top 250 list and yields one item per movie."""

    # Spider name
    name = 'douban_spider'
    # Must match the host used in start_urls and in the next-page requests;
    # requests to hosts outside this list are filtered out as off-site.
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    @staticmethod
    def _clean_introduce(fragments):
        """Strip all whitespace from each text fragment and join the non-empty ones.

        ``fragments`` is the list of text nodes of the description paragraph.
        Whitespace inside a fragment is removed entirely; the cleaned fragments
        are joined with a single space so separate lines stay distinguishable.
        Returns an empty string when there is nothing to join.
        """
        cleaned = ("".join(fragment.split()) for fragment in fragments)
        return " ".join(part for part in cleaned if part)

    def parse(self, response):
        """Yield a ``DoubanItem`` for every movie on the page, then follow the next page.

        ``response`` is the downloaded list page. Fields whose node is missing
        are stored as ``None`` (the result of ``extract_first()``), except
        ``introduce``, which is stored as an empty string.
        """
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movic_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
            # extract() returns every text node of the paragraph; extract_first()
            # would return a single string, and looping over it would walk characters.
            fragments = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            douban_item['introduce'] = self._clean_introduce(fragments)
            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']//span[4]/text()").extract_first()
            douban_item["des"] = i_item.xpath(
                ".//p[@class='quote']/span/text()").extract_first()
            yield douban_item

        # The href is appended to the list URL as-is; no link means this was the last page.
        next_link = response.xpath("//span[@class='next']/link/@href").extract_first()
        if next_link:
            yield scrapy.Request("https://movie.douban.com/top250" + next_link, callback=self.parse)
这是因为 Scrapy 在发出回调请求时,该请求被过滤掉了(请求的域名不在 allowed_domains 中)。解决方法有两种:
1. 在 allowed_domains 中加入该 url 的域名;
2. 在 scrapy.Request() 函数中将参数 dont_filter 设置为 True。
修改这一行代码如下,我个人采取第二种方式 dont_filter=True:
yield scrapy.Request("https://movie.douban.com/top250"+next_link, callback=self.parse, dont_filter=True)