runfile('C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py', wdir='C:/Users/Administrator/Desktop/新建文件夹')
Reloaded modules: html_downloader, html_outputer, html_parser, url_manager
Traceback (most recent call last):
  File "<ipython-input-21-acc2b5e5b102>", line 1, in <module>
    runfile('C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py', wdir='C:/Users/Administrator/Desktop/新建文件夹')
  File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
    execfile(filename, namespace)
  File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py", line 56, in <module>
    obj_spider.craw(root_url)
  File "C:/Users/Administrator/Desktop/新建文件夹/SpiderMain.py", line 27, in craw
    self.urls.add_new_url(root_url)
AttributeError: 'SpiderMain' object has no attribute 'urls'

The error is shown above; the code follows below.

import html_downloader
import html_outputer
import html_parser
import url_manager


class SpiderMain(object):
    def _init_(self):
        # initialize the URL manager
        self.urls = url_manager.UrlManager()
        # initialize the URL downloader
        self.downloader = html_downloader.HtmlDownloader()
        # initialize the URL parser
        self.parser = html_parser.HtmlParser()
        # initialize the URL outputer
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        # add the root url to the URL manager
        self.urls.add_new_url(root_url)
        # keep crawling while there are new urls
        while self.urls.has_new_url():
            try:
                # get a new url
                new_url = self.urls.get_new_url()
                print('craw %d:%s' % (count, new_url))
                # download the page for the new url
                html_cont = self.downloader.download(new_url)
                # parse the new urls and the data out of the page
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # add the new urls to the URL manager
                self.urls.add_new_urls(new_urls)
                # collect the parsed data for output
                self.outputer.collect_data(new_data)
                if count == 1000:
                    print("finished")
                    break
                count = count + 1
                print(count)
            except:
                print("craw failed!")
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
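The likely cause is the constructor name: Python only calls a method automatically when it is spelled __init__ with two leading and two trailing underscores, while _init_ with single underscores is just an ordinary method that never runs. As a result, self.urls (and the other attributes) are never created, and craw() fails on line 27 with the AttributeError shown above. A minimal sketch of the corrected constructor, assuming the same four helper modules as in the posted code:

class SpiderMain(object):
    def __init__(self):  # double underscores so Python runs this when SpiderMain() is created
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

With that one change, the attributes exist before craw() touches self.urls. It is also worth replacing the bare except: with something like except Exception as e: print('craw failed:', e), so that errors raised inside the loop are printed instead of being silently swallowed.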