# coding:utf8
# scheduler module (entry point of the crawler)
from taigong import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    # initialize the four components in the constructor
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    # main scheduling loop of the crawler
    def craw(self, root_url):
        count = 1  # index of the URL currently being crawled
        self.urls.add_new_url(root_url)  # seed the URL manager with the entry URL
        while self.urls.has_new_url():  # while there are URLs left to crawl
            try:
                new_url = self.urls.get_new_url()  # take the next URL to crawl
                print 'craw %d:%s' % (count, new_url)
                # download the page and keep its content
                html_cont = self.downloader.download(new_url)
                # parse the page into new URLs and extracted data
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                # collect the data
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://jwc.tit.edu.cn/"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)  # start the crawler


# coding:utf8
# taigong/url_manager.py


class UrlManager(object):
    # keeps two sets: URLs waiting to be crawled and URLs already crawled
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):  # add a single new URL to the manager
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:  # neither pending nor already crawled
            self.new_urls.add(url)  # add it to the pending set

    def add_new_urls(self, urls):  # add a batch of URLs to the manager
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):  # check whether any URL is still waiting to be crawled
        return len(self.new_urls) != 0

    def get_new_url(self):  # take one pending URL and mark it as crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


# coding:utf8
'''
Created on 2016-05-08

@author: de
'''
# taigong/html_downloader.py
import urllib2


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()


# coding:utf8
'''
Created on 2016-05-08

@author: de
'''
# taigong/html_parser.py
from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()  # collect every new_full_url in this set
        # grab all links of the form /info/view/content/123.htm
        links = soup.find_all('a', href=re.compile(r"/info/view/content/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # join the partial link with page_url to build the full URL
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):  # extract the data of interest
        res_data = {}  # holds the extracted data
        # url
        res_data['url'] = page_url
        # <td class="titlestyle1129870260_54099" align="center">
        #     关于组织2016年院级教学改革研究项目立项答辩的通知   (sample title text on the page)
        # </td>
        title_node = soup.find('td', class_="titlestyle1129870260_54099")
        res_data['title'] = title_node.get_text()
        # <div id="vsb_content">
        summary_node = soup.find('div', id="vsb_content")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):  # returns the new URL set and the extracted data
        # validate the arguments first
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data


# coding:utf8
# taigong/html_outputer.py


class HtmlOutputer(object):
    # maintain a list of the collected data
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # open the output file ('w' means write mode)
        fout = open('output.html', 'w')
        fout.write("<HTML>")
        fout.write("<BODY>")
        fout.write("<TABLE>")
        # the default string encoding is ascii, so encode the unicode fields as utf-8
        for data in self.datas:
            fout.write("<TR>")
            fout.write("<TD>%s</TD>" % data['url'])
            fout.write("<TD>%s</TD>" % data['title'].encode('utf-8'))
            fout.write("<TD>%s</TD>" % data['summary'].encode('utf-8'))
            fout.write("</TR>")
        fout.write("</TABLE>")
        fout.write("</BODY>")
        fout.write("</HTML>")
        fout.close()
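A minimal usage sketch of the URL manager on its own, assuming the five modules above live in a package named taigong as the import in the scheduler suggests; the URLs are illustrative only. It shows how add_new_url and add_new_urls deduplicate links and how get_new_url moves a link from the pending set to the crawled set:

# coding:utf8
from taigong import url_manager

manager = url_manager.UrlManager()
manager.add_new_url("http://jwc.tit.edu.cn/info/view/content/1.htm")
manager.add_new_urls([
    "http://jwc.tit.edu.cn/info/view/content/1.htm",  # duplicate, silently ignored
    "http://jwc.tit.edu.cn/info/view/content/2.htm",
])
print manager.has_new_url()    # True: two distinct URLs are pending
url = manager.get_new_url()    # pops one pending URL and marks it as crawled
print url in manager.old_urls  # True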