手记

黑板课爬虫闯关 - 第三关

    第三关相对之前的增加了登录这一块,首先需要先进行模拟登录,然后获取到cookie里面的登录验证csrf才能进行数据获取,这里其实可以用selenium比较简单,但是这个东西小编也用得少,感兴趣的同学可以去了解一下,这里我们还是用基础的re和requests实现一下

流程

  1. 按照常规思路直接进行内容获取应该会返回一个csrf的403错误,说明少了和身份验证相关的东西
  2. 然后我们注册账号进去,发现如下页面:
  3. 仔细观察请求头信息,可以找到关于csrftoken的这一块(也可以直接页面搜索),估计这就是我们需要的令牌了
  4. 然后就是在代码块里面构建登录信息进行模拟登录,然后在新页面获取数据了
  5. 注意:
  • 模拟登录的页面和请求数据的页面是不同的,
  • 模拟登录时提交的认证参数名(表单字段csrfmiddlewaretoken)和cookie中保存的参数名(csrftoken)不同

    这里提供三种方案供大家参考,原理都一样,主要是页面抓取的时候方法不一样,里面用到的诸如xpath或者etree等,如果有问题可以参考我之前的文章
one.py

import re
import requests

if __name__ == '__main__':
    # Script one: sign in with a registered account, then try the passwords
    # 1..30 for user "test", sending the CSRF token back with every form.
    url = "http://www.heibanke.com/lesson/crawler_ex02/"
    url_login = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/"

    session = requests.Session()

    # Loading the login page puts the "csrftoken" cookie into the session;
    # the form expects the same value under the name "csrfmiddlewaretoken".
    session.get(url_login)
    login_form = {
        "username": "ncjnyzmhsz",
        "password": "aaaaaa",
        "csrfmiddlewaretoken": session.cookies['csrftoken'],
    }
    session.post(url_login, data=login_form)

    for candidate in range(1, 31):
        # Reload the challenge page and re-read the token before each guess.
        session.get(url)
        guess_form = {
            "username": "test",
            "password": candidate,
            "csrfmiddlewaretoken": session.cookies['csrftoken'],
        }
        page = session.post(url, data=guess_form).text

        # The failure page contains this exact sentence.
        if '您输入的密码错误, 请重新输入' in page:
            print("密码%s错误" % candidate)
            continue

        # Any other page is treated as success: pull the "next level" link.
        next_path = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', page)[0]
        print('用户test闯关成功,下一关网址是:http://www.heibanke.com' + next_path)
        print("密码是:%s" % candidate)
        break
import requests
import re


def main():
    """Log in to heibanke.com and brute-force the level-3 password.

    Fetches the login page to obtain the ``csrftoken`` cookie, posts the
    login form with that token, then posts the challenge form once per
    candidate password in ``range(31)`` until the response no longer
    contains the "wrong password" text. On success the page title, the
    first ``<h1>`` and ``<h3>`` texts and the next-level URL are printed.

    Raises:
        KeyError: if a response does not carry a ``csrftoken`` cookie.
        IndexError: if the success page lacks one of the expected elements.
    """
    login_data = {'username': 'user', 'password': 'password'}
    url = 'http://www.heibanke.com/lesson/crawler_ex02/'
    login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/'

    # Step 1: a plain GET of the login page yields the initial csrftoken cookie.
    r2 = requests.get(login_url)
    c2 = r2.cookies
    login_data['csrfmiddlewaretoken'] = c2['csrftoken']

    # Step 2: log in. Redirects are not followed, so r3 is the login response
    # itself; its cookies are reused for every later request.
    # NOTE(review): presumably the session cookie is only set on this
    # response and would be missed after a redirect -- confirm.
    r3 = requests.post(login_url, data=login_data, allow_redirects=False, cookies=c2)
    c3 = r3.cookies

    # Step 3: try each candidate password with the post-login token.
    pass_data = {'username': 'user', 'csrfmiddlewaretoken': c3['csrftoken']}
    for passwd in range(31):
        pass_data['password'] = passwd
        text = requests.post(url, pass_data, cookies=c3).text

        if u'密码错误' in text:
            print("%s密码错误" % passwd)
            continue

        print("%s密码正确" % passwd)
        title = re.findall("<title>(.*?)</title>", text)
        word = re.findall("<h1>(.*?)</h1>", text)
        word2 = re.findall("<h3>(.*?)</h3>", text)
        html = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', text)
        print('\n'.join([title[0], word[0], word2[0], '下一关地址是', 'http://www.heibanke.com' + html[0]]))
        break


# Entry point for script two: run the brute-force only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()

import requests
from lxml import etree

# Script three: same login-then-guess flow, but the result page is parsed
# with lxml/XPath instead of regular expressions.
se = requests.session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
}
login_url = "http://www.heibanke.com/accounts/login"
url = 'http://www.heibanke.com/lesson/crawler_ex02/'
username = "lichen123"
password = "123kingstone"

# Load the login page so the session receives the csrftoken cookie, then
# submit the credentials together with that token.
se.get(url=login_url, headers=headers, timeout=30)
csrf = se.cookies['csrftoken']
login_form = {
    "csrfmiddlewaretoken": csrf,
    "username": username,
    "password": password,
}
se.post(url=login_url, headers=headers, data=login_form, timeout=30)

# Visit the challenge page once after login and read the token to send
# with every guess.
se.get(url, headers=headers, timeout=30)
csrf = se.cookies['csrftoken']

for guess in range(1, 31):
    guess_form = {
        "csrfmiddlewaretoken": csrf,
        "username": "lichen",
        "password": str(guess),
    }
    page = se.post(url, headers=headers, data=guess_form, timeout=30).text

    doc = etree.HTML(page)
    # The first <h3> at this path holds the result message.
    message = doc.xpath('/html/body/div/div/div[2]/h3/text()')[0]

    if u'错误' in message:
        print('密码{}错误'.format(guess))
        continue

    # No error text: report the message, the password and the next link.
    links = doc.xpath('/html/body/div/div/div[2]/a/@href')
    print(message)
    print("密码是:%s" % guess)
    print("下一关地址:http://www.heibanke.com%s" % links[0])
    break
  • 更多代码详情参考我的Github
0人推荐
随时随地看视频
慕课网APP