这里是代码:
items.py # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html importscrapy classZhihuItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() name =scrapy.Field() url_token =scrapy.Field() headline =scrapy.Field() follower_count =scrapy.Field() answer_count =scrapy.Field() articles_count =scrapy.Field() uid =scrapy.Field() gender =scrapy.Field() type=scrapy.Field() spider.py # -*- coding: utf-8 -*- importscrapy importjson importre fromzhihu.items importZhihuItem fromscrapy_redis.spiders importRedisCrawlSpider classUserinforSpider(RedisCrawlSpider): name ="userinfor" redis_key ='myspider:start_urls' allowed_domains =["zhihu.com"] # start_urls = ['https://www.zhihu.com/api/v4/members/liuyu-43-97/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20'] defparse(self, response): temp_data =json.loads(response.body.decode("utf-8"))["data"] count =len(temp_data) ifcount < 20: pass else: page_offset =int(re.findall("&offset=(.*?)&", response.url)[0]) new_page_offset =page_offset +20 next_page_url =response.url.replace("&offset="+str(page_offset) +"&", "&offset="+str(new_page_offset) +"&") yieldscrapy.Request(url=next_page_url, callback=self.parse) foreve_user intemp_data: item =ZhihuItem() item['name'] =eve_user['name'] item['url_token'] =eve_user['url_token'] item['headline'] =eve_user['headline'] item['follower_count'] =eve_user['follower_count'] item['answer_count'] =eve_user['answer_count'] item['articles_count'] =eve_user['articles_count'] item['uid'] =eve_user['id'] item['gender'] =eve_user['gender'] item['type'] =eve_user['type'] with open("userinfor.txt") as f: user_list =f.read() ifeve_user['url_token'] notinuser_list: with open("userinfor.txt", "a") as f: f.write(eve_user['url_token'] +"----") yielditem new_url ="https://www.zhihu.com/api/v4/members/"+eve_user['url_token'] +"/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20" yieldscrapy.Request(url=new_url, callback=self.parse) settings.py # Scrapy settings for zhihu project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # http://doc.scrapy.org/en/latest/topics/settings.html # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html BOT_NAME ='zhihu' SPIDER_MODULES =['zhihu.spiders'] NEWSPIDER_MODULE ='zhihu.spiders' MONGODB_HOST ='127.0.0.1' MONGODB_PORT =27017 MONGODB_DBNAME ='zhihu' MONGODB_DOCNAME ='userinfor' # Crawl responsibly by identifying yourself (and your website) on the user-agent USER_AGENT ='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36' # Obey robots.txt rules ROBOTSTXT_OBEY =False # Configure maximum concurrent requests performed by Scrapy (default: 16) # CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: # CONCURRENT_REQUESTS_PER_DOMAIN = 16 # CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) # COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) # TELNETCONSOLE_ENABLED = False # Override the default request headers: DEFAULT_REQUEST_HEADERS ={ "accept": 'application/json, text/plain, */*', "Accept-Language": 'zh-CN,zh;q=0.9,en;q=0.8', "authorization": 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20', "Cache-Control": 'no-cache', "Connection": 'keep-alive', "Cookie": '_zap=74a1bc89-5927-46f1-a903-26b9d4f9906c; q_c1=679bbf981bc54edaa36a718a757d7110|1506423740000|1502849291000; d_c0="AFBCMYYIigyPTn-w9gPOx5CNrckgcsQKrhk=|1508201688"; q_c1=f3521e394ce8459094ba76547cddd3e5|1517535767000|1502849291000; aliyungf_tc=AQAAACykS2tz0ggA5KAxJPLJJw8rf8SF; _xsrf=c8e59c5f-190a-4b71-ad56-1425517c7a9b; r_cap_id="Yjc3Y2Y1ODkxYzcxNGZkOGFhMDUwZjBhNjFhZTEyYjI=|1519810983|a19b0558ddd2a119ed7581c8fd59427ab2298d03"; cap_id="ZDM1Y2UzZTBhZTQ2NDc3OWIzYmE3YzczYmY0YmVlNzE=|1519810983|4c6504306036f99443b659ce4f8ea2723ebb6a96"; l_cap_id="NDcyOGU5YzUyNTdmNDc1OTlhMGU1Mzc3MjQ4NDY5YjI=|1519810983|ed1d25b9a6905ad1891a94984d8cecd51b8a96e0"; __utma=51854390.1002977338.1508201688.1516880301.1519810987.10; __utmc=51854390; __utmz=51854390.1519810987.10.10.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/liuyu-43-97/activities; __utmv=51854390.000--|2=registration_date=20160118=1^3=entry_date=20170816=1; capsion_ticket="2|1:0|10:1519878553|14:capsion_ticket|44:N2NhNTJmNGQ5M2EyNDUzODk1MzIxYjgzNjFkM2FiZmY=|6b0b25b31dbdc0c80f49a9db073ec4953c5c4f6edd1bb1978bcee89c9b64f0b9"', "Host": 'www.zhihu.com', "Pragma": 'no-cache', "Referer": 'https://www.zhihu.com/', "X-UDID": 'AFBCMYYIigyPTn-w9gPOx5CNrckgcsQKrhk=', } # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # SPIDER_MIDDLEWARES = { # 'zhihu.middlewares.ZhihuSpiderMiddleware': 543, # } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES ={ # 'zhihu.middlewares.MyCustomDownloaderMiddleware': 543, 'zhihu.middlewares.ChangeProxy': 543, } # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html # EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, # } # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES ={ 'zhihu.pipelines.ZhihuPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html # AUTOTHROTTLE_ENABLED = True # The initial download delay # AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies # AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: # AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # HTTPCACHE_ENABLED = True # HTTPCACHE_EXPIRATION_SECS = 0 # HTTPCACHE_DIR = 'httpcache' # HTTPCACHE_IGNORE_HTTP_CODES = [] # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' SCHEDULER ="scrapy_redis.scheduler.Scheduler" DUPEFILTER_CLASS ="scrapy_redis.dupefilter.RFPDupeFilter" SCHEDULER_PERSIST =True SCHEDULER_QUEUE_CLASS ='scrapy_redis.queue.SpiderQueue' middlewares.py # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html fromscrapy importsignals importrequests importjson classZhihuSpiderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod deffrom_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s =cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) returns defprocess_spider_input(response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. returnNone defprocess_spider_output(response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. fori inresult: yieldi defprocess_spider_exception(response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Response, dict # or Item objects. pass defprocess_start_requests(start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). forr instart_requests: yieldr defspider_opened(self, spider): spider.logger.info('Spider opened: %s'%spider.name) classChangeProxy(object): ''' 需要思考的几个问题: 1)什么时候需要切换IP 本身的IP被ban,被拉黑了,无法继续使用该IP请求目标网站了 2)切换ip是否需要支出 (一般需要购买)免费的IP,不需要花钱,不免费的IP,需要花钱,但是,大部分绝大部分很大一部分的免费IP是不能用 3)如何更优秀的切换IP A)代理IP给我们的API,是有一个请求限制的,例如有的限制3S,有的限制5S,还有的限制1S B)可能我们的一个代理IP获得之后,很快就会失效了,所以,一般情况下,代理IP都是先验证,后使用 C)很有可能一个代理IP,我们可以访问网页多次,才会被ban I)我们一次获得多少代理IP? 小批量多次获取 II)我们一个代理IP用多少次,再切换 完善代理IP切换功能要考虑的几个问题: 1)IP是否可用 2)IP用几次清除掉 3)每次获得多少IP 1 2 3(不可用) 4 5 6 7 8 9 10 ''' def__init__(self): ''' 初始化变量 get_url 是请求的api temp_url 是验证的地址 ip_list 是ip ''' self.get_url ="http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=e805d7cd7baa41ef87dfc5cec0ed9614&orderno=YZ20173293618kDUVrD&returnType=2&count=10" self.temp_url ="http://ip.chinaz.com/getip.aspx" self.ip_list =[] # 用来记录使用ip的个数,或者是目前正在使用的是第几个IP,本程序,我一次性获得了10个ip,所以count最大默认为9 self.count =0 # 用来记录每个IP的使用次数,此程序,我设置为最大使用4次换下一个ip self.evecount =0 defgetIPData(self): ''' 这部分是获得ip,并放入ip池(先清空原有的ip池) :return: ''' temp_data =requests.get(url=self.get_url).text self.ip_list.clear() foreve_ip injson.loads(temp_data)["RESULT"]: print(eve_ip) self.ip_list.append({ "ip":eve_ip["ip"], "port":eve_ip["port"] }) defchangeProxy(self,request): ''' 修改代理ip :param request: 对象 :return: ''' request.meta["proxy"] ="http://"+str(self.ip_list[self.count-1]["ip"]) +":"+str(self.ip_list[self.count-1]["port"]) defyanzheng(self): ''' 验证代理ip是否可用,默认超时5s :return: ''' requests.get(url=self.temp_url,proxies={"http":str(self.ip_list[self.count-1]["ip"]) +":"+str(self.ip_list[self.count-1]["port"])},timeout=5) defifUsed(self,request): ''' 切换代理ip的跳板 :param request:对象 :return: ''' try: self.changeProxy(request) self.yanzheng() except: ifself.count ==0orself.count ==10: self.getIPData() self.count =1 self.evecount =0 self.count =self.count +1 self.ifUsed(request) defprocess_request(self, request, spider): ifself.count ==0orself.count==10: self.getIPData() self.count =1 ifself.evecount ==80: self.count =self.count +1 self.evecount =0 else: self.evecount =self.evecount +1 self.ifUsed(request) pipelines.py # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html fromscrapy.conf importsettings importpymongo classZhihuPipeline(object): def__init__(self): host =settings['MONGODB_HOST'] port =settings['MONGODB_PORT'] dbName =settings['MONGODB_DBNAME'] client =pymongo.MongoClient(host=host,port=port) tdb =client[dbName] self.post =tdb[settings['MONGODB_DOCNAME']] defprocess_item(self, item, spider): zhihu =dict(item) self.post.insert(zhihu) returnitem
作者:醋留香
链接:https://www.jianshu.com/p/437ce9bf560a