#
# coding: utf-8
# NOTE: This is Python 2 tutorial code being run under Python 3; after partial
# porting it still fails on the final run — original author was asking for help.
from baike_spider import url_manager, html_outputer, html_downloader,\
html_parser
# Debug print of the seed URL (module-level side effect: runs on import).
print ('http://baike.baidu.com/view/21087.htm')
class SpiderMain(object):
    """Coordinator for the Baidu Baike crawler.

    Wires together the URL manager, downloader, parser and outputer from the
    baike_spider package, and drives a breadth-first crawl of up to 100 pages.
    """

    def __init__(self):
        # BUG FIX: was misspelled "__int__", so Python never called it as the
        # constructor and none of these attributes existed — craw() then died
        # with AttributeError on its first attribute access.
        self.urls = url_manager.ulrmanager()  # URL manager: tracks new vs. crawled URLs
        # BUG FIX: was assigned to "self.dowload" but craw() reads
        # "self.downloader"; the names must match.
        self.downloader = html_downloader.htmldownloader()
        # NOTE(review): class names ulrmanager/htmlpaser look misspelled —
        # verify they match the actual definitions in the baike_spider modules.
        self.parser = html_parser.htmlpaser()
        self.outputer = html_outputer.htmloutputer()

    def craw(self, url):
        """Crawl starting from the seed *url*, stopping after 100 pages,
        then write the collected data out as an HTML report."""
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_urls():
            try:
                new_url = self.urls.get_new_url()  # take the next pending URL
                print('craw %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)  # fetch the page
                new_urls, new_data = self.parser.paser(new_url, html_cont)
                self.urls.add_new_urls(new_urls)  # enqueue newly discovered links
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                count = count + 1
            except Exception as e:
                # Was a bare "except:" printing only 'erro 1', which hid the
                # real cause of every failure. Report the failing URL and error
                # instead (still best-effort: one bad page doesn't stop the crawl).
                print('craw failed for %s: %s' % (new_url, e))
        self.outputer.output_html()
# Entry point: seed the crawler with the target Baidu Baike article URL.
if __name__ == "__main__":
    root_url = 'http://baike.baidu.com/view/21087.htm'
    SpiderMain().craw(root_url)
# 安小暖  (pasted author signature — commented out: as a bare name it raised NameError)