spider
import scrapy

from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Spider that reads the movie list on the Douban Top 250 page.

    For every ``<li>`` entry in the ``grid_view`` list, a ``DoubanItem`` is
    created, its ``serial_number`` field is filled from the entry's ``<em>``
    text, and the item is yielded.
    """

    name = 'douban_spider'
    # allowed_domains takes bare host names; a value with a scheme
    # ('https://...') is not a valid domain entry for Scrapy.
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Yield one ``DoubanItem`` per movie entry found in ``response``."""
        movie_list = response.xpath(
            "//div[@class='article']//ol[@class='grid_view']//li"
        )
        for i_item in movie_list:
            douban_item = DoubanItem()
            # The XPath is relative to the current <li>, so it must only
            # describe what is *inside* that <li>. Repeating the page-level
            # 'article'/'grid_view' path here matches nothing.
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()"
            ).extract_first()
            print(douban_item)
            # Hand the item to Scrapy so pipelines/feed exports receive it.
            yield douban_item
items
import scrapy


class DoubanItem(scrapy.Item):
    """Item holding the fields collected for one movie entry."""

    # define the fields for your item here like:
    # name = scrapy.Field()

    # Serial number
    serial_number = scrapy.Field()
    # Name
    movie_name = scrapy.Field()
    # Introduction
    introduce = scrapy.Field()
    # Star rating
    star = scrapy.Field()
    # Number of reviews
    evaluate = scrapy.Field()
    # Description
    # NOTE(review): the original text was cut off at `scrapy.Fi`; completed to
    # `scrapy.Field()` to match the other fields — confirm no further fields
    # were lost after this one.
    describe = scrapy.Field()
settings
# Scrapy project settings for the 'douban' bot.
BOT_NAME = 'douban'

# Package where new spiders are generated; the same package is the only
# entry in the list of modules searched for spiders.
NEWSPIDER_MODULE = 'douban.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# User-agent string sent with requests (a desktop Chrome 76 UA).
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/76.0.3809.100 Safari/537.36'
)

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False
main
from scrapy import cmdline

# Launch the crawl from Python; the list is the argv form of the shell
# command `scrapy crawl douban_spider`.
cmdline.execute(['scrapy', 'crawl', 'douban_spider'])
返回 200 状态码就说明请求是正常的。