第三关相比之前的关卡增加了登录这一块:需要先进行模拟登录,再获取cookie里面用于登录验证的csrf令牌,然后才能进行数据获取。这里其实用selenium会比较简单,但是这个东西小编也用得少,感兴趣的同学可以去了解一下;这里我们还是用基础的re和requests实现一下
流程
- 按照常规思路直接进行内容获取应该会返回一个csrf的403错误,说明少了和身份验证相关的东西
- 然后我们注册账号进去,发现如下页面:
- 仔细观察请求头信息,可以找到关于csrftoken的这一块(也可以直接页面搜索),估计这就是我们需要的令牌了
- 然后就是在代码块里面构建登录信息进行模拟登录,然后在新页面获取数据了
- 注意:
- 模拟登录的页面和请求数据的页面是不同的,
- 模拟登录时表单里提交的认证参数名(csrfmiddlewaretoken)和cookie中的参数名(csrftoken)不同
这里提供三种方案供大家参考,原理都一样,主要是页面抓取的时候方法不一样。里面用到的诸如xpath或者etree,如果有问题可以参考我之前的文章
one.py
import re
import requests
if __name__ == '__main__':
    # Level page: GET it to refresh cookies, POST a guess to it.
    url = "http://www.heibanke.com/lesson/crawler_ex02/"
    # Account login page; the csrftoken cookie is read right after GETting it.
    url_login = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/"
    # requests has no default timeout; bound every call so a stalled
    # server cannot hang the script forever.
    timeout = 30
    # Text shown by the level page when a guess is rejected.
    wrong_password = '您输入的密码错误, 请重新输入'
    # Captures the relative URL of the "next level" button on the success page.
    next_level = re.compile('<a href="(.*?)" class="btn btn-primary">下一关</a>')

    # A single Session keeps the cookies of every response for later requests.
    session = requests.Session()

    # Step 1: log in to the site account.
    session.get(url_login, timeout=timeout)
    token = session.cookies['csrftoken']
    session.post(
        url_login,
        data={"username": "ncjnyzmhsz", "password": "aaaaaa", "csrfmiddlewaretoken": token},
        timeout=timeout,
    )

    # Step 2: try the passwords 1..30 on the level form.
    for number in range(1, 31):
        # Re-read the token before each guess in case the cookie changed.
        session.get(url, timeout=timeout)
        token = session.cookies['csrftoken']
        html = session.post(
            url,
            data={"username": "test", "password": number, "csrfmiddlewaretoken": token},
            timeout=timeout,
        ).text

        if wrong_password in html:
            print("密码%s错误" % number)
            continue

        links = next_level.findall(html)
        if not links:
            # Neither the rejection text nor the success link: do not report
            # a success (and do not crash with a bare IndexError).
            raise SystemExit(
                "Unexpected response for password %s: no wrong-password message "
                "and no next-level link (did the login succeed?)" % number
            )
        print('用户test闯关成功,下一关网址是:http://www.heibanke.com' + links[0])
        print("密码是:%s" % number)
        break
import requests
import re
def main():
    """Log in to heibanke.com, then try the passwords 0..30 on crawler_ex02.

    Prints one line per rejected password. For the first response that does
    not contain the rejection text, prints the page title, the <h1>/<h3>
    headings that are present and the next-level URL, then stops.

    Raises:
        RuntimeError: if a response contains neither the rejection text nor
            the next-level link.
    """
    # requests has no default timeout; bound every call.
    timeout = 30
    login_data = {'username': 'user', 'password': 'password'}
    url = 'http://www.heibanke.com/lesson/crawler_ex02/'
    login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/'

    # GET the login page first: its cookies carry the csrftoken that must be
    # echoed back in the form field "csrfmiddlewaretoken".
    r2 = requests.get(login_url, timeout=timeout)
    c2 = r2.cookies
    login_data['csrfmiddlewaretoken'] = c2['csrftoken']

    # The redirect is not followed, so r3 is the login response itself;
    # presumably this is what makes its cookies available as r3.cookies.
    r3 = requests.post(
        login_url,
        data=login_data,
        allow_redirects=False,
        cookies=c2,
        timeout=timeout,
    )
    c3 = r3.cookies

    pass_data = {'username': 'user', 'csrfmiddlewaretoken': c3['csrftoken']}
    for passwd in range(31):
        pass_data['password'] = passwd
        text = requests.post(url, pass_data, cookies=c3, timeout=timeout).text

        if u'密码错误' in text:
            print("%s密码错误" % passwd)
            continue

        next_link = re.search('<a href="(.*?)" class="btn btn-primary">下一关</a>', text)
        if next_link is None:
            # Not the rejection page, but not the success page either.
            raise RuntimeError(
                "unexpected response for password %s: no next-level link found" % passwd
            )

        print("%s密码正确" % passwd)
        # Collect whichever of title / h1 / h3 the page provides instead of
        # indexing blindly into possibly empty match lists.
        summary = []
        for pattern in ("<title>(.*?)</title>", "<h1>(.*?)</h1>", "<h3>(.*?)</h3>"):
            found = re.search(pattern, text)
            if found is not None:
                summary.append(found.group(1))
        summary.append('下一关地址是')
        summary.append('http://www.heibanke.com' + next_link.group(1))
        print('\n'.join(summary))
        break


if __name__ == '__main__':
    main()
import requests
from lxml import etree
# One session for the whole run so cookies persist between requests.
se = requests.session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
}
# NOTE(review): unlike the other two variants this URL has no trailing slash;
# if the server answers the POST with a redirect the form data may be lost - confirm.
login_url = "http://www.heibanke.com/accounts/login"
url = 'http://www.heibanke.com/lesson/crawler_ex02/'
username = "lichen123"
password = "123kingstone"

# Step 1: GET the login page only for its cookies, then log in with the
# csrftoken cookie echoed back as the "csrfmiddlewaretoken" form field.
se.get(url=login_url, headers=headers, timeout=30)
login_form = {
    "csrfmiddlewaretoken": se.cookies['csrftoken'],
    "username": username,
    "password": password
}
se.post(url=login_url, headers=headers, data=login_form, timeout=30)

# Step 2: open the level page and read the csrf token once for all guesses.
se.get(url, headers=headers, timeout=30)
csrf = se.cookies['csrftoken']

for pwd in range(1, 31):
    guess_form = {
        "csrfmiddlewaretoken": csrf,
        "username": "lichen",
        "password": str(pwd)
    }
    res = se.post(url, headers=headers, data=guess_form, timeout=30).text
    tree = etree.HTML(res)

    # The <h3> carries the result message; the <a> next to it the next level.
    h3_nodes = tree.xpath('/html/body/div/div/div[2]/h3/text()')
    if not h3_nodes:
        # Page layout is not the expected one; stop with a clear message
        # instead of a bare IndexError.
        raise SystemExit("Unexpected response for password %s: no <h3> message found" % pwd)
    h3 = h3_nodes[0]

    if u'错误' in h3:
        print('密码{}错误'.format(pwd))
        continue

    print(h3)
    print("密码是:%s" % pwd)
    hre = tree.xpath('/html/body/div/div/div[2]/a/@href')
    if not hre:
        raise SystemExit("Password %s accepted but no next-level link found" % pwd)
    print("下一关地址:http://www.heibanke.com%s" % hre[0])
    break
- 更多代码详情参考我的Github