问答详情
源自:2-8 保存数据

生成的CSV文件里的数据都是双份的,,,不知道为啥

# -*- coding: utf-8 -*-
import scrapy

from ..items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    #爬虫名
    name = 'douban_spider'
    #允许的域名
    allowed_domains = ['movie.douban.com']
    #入口URL,扔到调度器里面
    start_urls = ['https://movie.douban.com/top250']

    #默认解析方法
    def parse(self, response):
        #print(response.text)
        #循环电影的条目
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            # print(i_item)
            #item文件导进来
            douban_item = DoubanItem()
            douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
            content = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            #数据的处理
            for i_content in content:
                content_s="".join(i_content.split())
                douban_item['introduce']=content_s
                douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first()
                douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first()
                douban_item['describe'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first()
                #将数据yield到pipeline
                yield douban_item
            #解析下一页规则,取得后页得xpath
            next_link = response.xpath("//span[@class='next']/link/@href").extract()
            if next_link:
                next_link = next_link[0]
                yield scrapy.Request("https://movie.douban.com/top250"+next_link,callback=self.parse)


提问者:慕田峪2088838 2019-10-28 16:59

个回答

  • xiupi酱
    2019-10-29 07:22:06

    使用了循环