接下来很容易就能解析出电影的 FTP 下载链接和磁力链接:
image
理论部分讲解完成后,接下来的Python实现代码如下:
# -*- coding:utf-8 -*-import urllibimport urllib2import reimport requestsimport timeimport requestsimport requests_cache# User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64)# AppleWebKit/537.36 (KHTML, like Gecko)# Chrome/65.0.3325.181 Safari/537.36 OPR/52.0.2871.64requests_cache.install_cache('demo_cache')global fp url = 'https://www.dy2018.com/html/gndy/dyzz/index.html'# url = 'https://www.dy2018.com/i/99901.html'user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'headers = {'User-Agent': user_agent}try: r = requests.get(url) print type(r) print r.status_code print r.encoding html = requests.get(url, headers=headers).text html = html.encode(r.encoding) html = html.decode("gbk") content = html # print content fp = open(unicode("temp_pachong.txt", 'utf-8'), 'w') # 文件名不乱码 fp.write(content.encode('utf-8')) fp.close() # <a href="/i/99901.html" class="ulink" title="2018年美国7.6分恐怖片《遗传厄运》BD中英双字">2018年美国7.6分恐怖片《遗传厄运》BD中英双字</a> pattern = re.compile('<b>.*?<a href="/i/(.*?).html" class="ulink" title="(.*?)">.*?</a>.*?</b>', re.S) items = re.findall(pattern, content) fp = open(unicode("电影天堂爬虫.txt",'utf-8'),'w') # 文件名不乱码 localtime=time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime(time.time())) count=0 fp.write("********************" + localtime +"********************".encode('utf-8') + '\n') print '本页总资源数为:' + str(len(items)) for item in items: count=count+1 temp=str(count) + ": " + item[1] print temp fp.write(temp.encode('utf-8')+ '\n') temp='https://www.dy2018.com/i/' + item[0] + '.html' print temp #获取下载链接 url = temp r = requests.get(url) user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' headers = {'User-Agent': user_agent} html = requests.get(url, headers=headers).text html = html.encode(r.encoding) html = html.decode("gbk") content = html # print content link_temp = re.compile('<td ><a href="(.*?)">.*?</a></td>', re.S) link = re.findall(link_temp, content) print link[0] fp.write(link[0].encode('utf-8') + '\n') fp.write("********************" + localtime 
+"********************".encode('utf-8')) fp.close()except urllib2.URLError, e: if hasattr(e, "code"): print e.code if hasattr(e, "reason"): print e.reason fp.close()
实际效果如下:
view-source_https____i_99901.html.png
作者:看星星的天空
链接:https://www.jianshu.com/p/e9b5518bcdae