Level 1
The main task is to build the next URL and keep fetching the page until it returns something other than a number. Two solutions are given below; the idea is the same in both, relying mainly on regular expressions and requests to fetch and parse the pages. I'm using Python 3.
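Before the full scripts, here is the core step in isolation: pull the digits out of the `<h3>` tag with a regex and append them to the base URL. The `<h3>` snippet below is made up for illustration; the wording on the real page may differ.

```python
import re

# Made-up page fragment; the real page wording may differ slightly.
html = "<h3>你需要在网址后输入数字 10000</h3>"

number = re.findall(r'<h3>.*?(\d+).*?</h3>', html)
if number:
    next_url = "http://www.heibanke.com/lesson/crawler_ex00/" + number[0]
    print(next_url)  # -> http://www.heibanke.com/lesson/crawler_ex00/10000
```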
one.py
```python
import requests
import re


def get_Html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36"
    }
    req = requests.get(url, headers=headers, timeout=20)
    return req.content.decode('utf-8')


def next():
    # Fetch the start page and pull the first number out of the <h3> tag.
    html = get_Html(url)
    number = re.findall(r'<h3>.*?(\d+)</h3>', html)
    # Keep appending the number to the URL and fetching again until the page
    # no longer contains a number.
    while number:
        next_url = "http://www.heibanke.com/lesson/crawler_ex00/%s" % number[0]
        print(next_url)
        html = requests.get(next_url).content.decode('utf-8')
        number = re.findall(r'<h3>.*?(\d+)\.', html)
    # The final page links to the next level.
    res = re.findall(r'<a href="(.*?)" class', html)
    print("Link to the next level: http://www.heibanke.com%s" % res[0])


if __name__ == '__main__':
    url = "http://www.heibanke.com/lesson/crawler_ex00/"
    next()
```
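A regex is the quickest way to grab the number, but the same extraction can also be done with XPath via lxml. A minimal sketch, assuming the digits sit in the page's first `<h3>` element (`extract_number` is just an illustrative helper, not part of the scripts here):

```python
import re
from lxml import etree

def extract_number(html):
    # Parse the page and read the text of the first <h3>; return the digits
    # found there, or None when the page no longer shows a number.
    tree = etree.HTML(html)
    h3_texts = tree.xpath('//h3/text()')
    if not h3_texts:
        return None
    digits = re.findall(r'\d+', h3_texts[0])
    return digits[0] if digits else None
```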
The second solution takes the same approach but also times how long the whole run takes:

```python
import requests
import re
import datetime

if __name__ == '__main__':
    begin_time = datetime.datetime.now()
    url = 'http://www.heibanke.com/lesson/crawler_ex00/'
    new_url = url
    num_re = re.compile(r'<h3>[^\d<]*?(\d+)[^\d<]*?</h3')
    while True:
        print('Fetching', new_url)
        html = requests.get(new_url).text
        num = num_re.findall(html)
        if len(num) == 0:
            # No number on the page any more: grab the link to the next level and stop.
            new_url = 'http://www.heibanke.com' + re.findall(r'<a href="(.*?)" class', html)[0]
            break
        else:
            new_url = url + num[0]
    print('Final URL: %s, elapsed time: %s' % (new_url, datetime.datetime.now() - begin_time))
```
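Because every request goes to the same host, reusing one keep-alive connection with `requests.Session` usually makes the loop a bit faster than calling `requests.get` each time. A minimal sketch of the same loop with a session (same regexes and URLs as above):

```python
import re
import requests

session = requests.Session()  # keep-alive: one connection reused for every request
url = 'http://www.heibanke.com/lesson/crawler_ex00/'
new_url = url
num_re = re.compile(r'<h3>[^\d<]*?(\d+)[^\d<]*?</h3')

while True:
    html = session.get(new_url, timeout=20).text
    num = num_re.findall(html)
    if not num:
        new_url = 'http://www.heibanke.com' + re.findall(r'<a href="(.*?)" class', html)[0]
        break
    new_url = url + num[0]

print('Final URL:', new_url)
```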
- For more code details, see my GitHub.