慕田峪2088838
2019-10-28 16:59
# -*- coding: utf-8 -*-
import scrapy

from ..items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Spider that walks the Douban Top 250 movie list page by page.

    For every movie entry on a page it yields one ``DoubanItem`` and then
    follows the "next page" link, if there is one, with the same callback.
    """

    # Spider name
    name = 'douban_spider'
    # Allowed domains
    allowed_domains = ['movie.douban.com']
    # Entry URL, handed to the scheduler
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Default parse callback: extract movie items and follow pagination.

        Args:
            response: The downloaded list page.

        Yields:
            ``DoubanItem`` objects, one per ``<li>`` in the movie list,
            followed by at most one ``scrapy.Request`` for the next page.
        """
        # Iterate over the movie entries of the current page.
        movie_list = response.xpath(
            "//div[@class='article']//ol[@class='grid_view']/li"
        )
        for i_item in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()"
            ).extract_first()
            douban_item['movie_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
            ).extract_first()

            # The first <p> is extracted as a list of text fragments. Strip
            # all whitespace inside each fragment, drop the ones that end up
            # empty, and keep ALL of them. Assigning inside a loop over the
            # fragments would overwrite the field each time and keep only
            # the last fragment.
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()"
            ).extract()
            fragments = ("".join(i_content.split()) for i_content in content)
            douban_item['introduce'] = " ".join(
                fragment for fragment in fragments if fragment
            )

            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()"
            ).extract_first()
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']//span[4]/text()"
            ).extract_first()
            douban_item['describe'] = i_item.xpath(
                ".//p[@class='quote']/span/text()"
            ).extract_first()

            # Yield the populated item so it reaches the item pipeline.
            yield douban_item

        # Pagination: take the href of the "next" link. It is relative, so
        # resolve it against the URL of the page that was just parsed rather
        # than concatenating onto a hard-coded base string.
        next_link = response.xpath(
            "//span[@class='next']/link/@href"
        ).extract_first()
        if next_link:
            yield scrapy.Request(
                response.urljoin(next_link), callback=self.parse
            )
使用了循环
Python最火爬虫框架Scrapy入门与实践
67418 学习 · 223 问题
相似问题