import urlparse,urllib2,re
def download(url,num_retries=2):
print 'Downloading:',url
try:
html=urllib2.urlopen(url).read()
except urllib2.URLError as e:
print 'Download error:',e.reason
html=None
if num_retries>0:
if hasattr(e,'code') and 500<=e.code<600:
return download(url,num_retries-1)
return html
def get_links(html):
webpage_regex=re.compile(r'<a[^>]+href=["\'](.*?)["\']',re.IGNORECASE)
return webpage_regex.findall(html)
def link_crawler(seed_url,link_regex):
crawl_queue=[seed_url]
while crawl_queue:
url=crawl_queue.pop()
html=download(url)
for link in get_links(html):
if re.match(link_regex,link):
link=urlparse.urljoin(seed_url,link)
crawl_queue.append(link)
运行link_crawler('http://example.webscraping.com/','/(index|view)')
Downloading: http://example.webscraping.com/
Traceback (most recent call last):
File "<pyshell#12>", line 1, in <module>
link_crawler('http://example.webscraping.com/','/(index|view)')
File "C:/Python27/lianxi/pachong4.py", line 23, in link_crawler
for link in get_links(html):
File "C:/Python27/lianxi/pachong4.py", line 16, in get_links
return webpage_regex.findall(html)
TypeError: expected string or buffer
什么问题呀?!!小白
慕瓜9220888
慕桂英4516509
聆听轩辕
相关分类