网址为,奇书网
以前下载小说是逐章爬取、提取正文,再拼成整本小说,这样太慢:一本小说有几千个章节,要很久才能下载完一本。现在找到一个提供 TXT 下载的网站,可以直接下载整本小说的文件了。
准备按照首页的栏目,下载玄幻奇幻、武侠仙侠、女频言情、现代都市、历史军事、游戏竞技、科幻灵异、美文同人、剧本教程、名著杂志这 10 个专题的小说。
整个过程用到了不少东西
html=requests.get(url,headers=headers,verify=False).text
soup=BeautifulSoup(html,'lxml')
#按照标签查找
tag_p=soup.p
#获取属性
name = tag_p.name
title = tag_p.attrs.get('title')
title = tag_p.get('title')
title = tag_p['title']
使用 find 和 find_all 查找
soup.find('a')
soup.find('a',title='hhh')
soup.find('a',id='')
soup.find('a',class_='')
去掉开头,结尾的字符,用strip()函数
分割字符串使用 split() 函数;往 list 中追加单个元素使用 append() 函数;如果需要把一个 list 的元素逐一追加到另一个 list 中,需要使用 extend() 函数
html=get_html(url[i]).text
#title=re.find(re0,html)[0]
href=re.findall(re1,html)
#print(href)
infor=re.split(",",href[0])
get_inf=[]
#print(infor)
for j in range(len(infor)):
te=infor[j].strip("'").strip("'")
get_inf.append(te)
# 这里采用正则表达式,也可以使用 title=cont[i].get("title")
text=re.findall(re0,str(cont[i]))
title.extend(text)
注意,这里是坑,被坑了好一会
Python3 中 dictionary 的 keys()、values()、items() 返回的都是视图对象(view),而不是列表,不能用下标访问;如果需要像 Python2 一样得到列表,只要传给 list 就行了:
novel_path=list(novel_inf.keys())
novel_url=list(novel_inf.values())
下载小说,准备使用线程池(multiprocessing.dummy 中的 Pool)的 map 方法实现多线程下载,加快速度。
全局变量要在函数外、文件最开始的地方定义;在函数中想给它重新赋值(改变其值),必须先用 global 声明,否则赋值只会创建一个同名的局部变量,把全局变量屏蔽掉。
参考文献
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 16 22:36:52 2018
@author: dflx
"""
import requests
import os
import re
from bs4 import BeautifulSoup
import time
from multiprocessing.dummy import Pool as ThreadPool
def get_html(url, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The ``requests.Response`` with its text encoding set from the
        detected content encoding, or ``None`` when the request fails or the
        server answers with an HTTP error status. Callers must check for
        ``None`` before touching ``.text`` / ``.content``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
    }
    try:
        # NOTE: verify=False turns off TLS certificate verification; kept
        # from the original, but it makes the download spoofable.
        response = requests.get(
            url, headers=headers, verify=False, timeout=timeout
        )
        print(response.status_code)
        # raise_for_status is a method: without the call parentheses the
        # statement did nothing and 4xx/5xx pages were treated as success.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response
    except requests.RequestException as e:
        print("has exception:", e)
        return None
# Category names gathered by get_url(); entries are appended in the same
# order as the links that function returns.
name = []


def get_url(url):
    """Return absolute links for the anchors at positions 2..11 of the page.

    The page at *url* is downloaded and parsed, and every ``<a>`` tag is
    collected. For each of the ten anchors at indexes 2 through 11 the
    anchor text is appended to the module-level ``name`` list and its
    ``href`` is prefixed with the site root.

    Returns:
        list of the ten absolute link strings, in page order.
    """
    global name
    page_source = get_html(url).text
    title_pattern = r'title="(.*?)">'
    anchors = BeautifulSoup(page_source, "lxml").find_all('a')
    titles = []
    links = []
    print("length=", len(anchors))
    print(type(anchors[0]))
    for position in range(2, 12):
        anchor = anchors[position]
        print(anchor)
        name.append(anchor.string)
        # The title is extracted with a regular expression here;
        # anchor.get("title") would be an alternative.
        titles.extend(re.findall(title_pattern, str(anchor)))
        links.append("https://www.qisuu.la" + anchor.get("href"))
    print("title length=", len(titles), type(titles[0]), titles[0])
    print("name length=", len(name), type(name[0]), name[0])
    print("href length", len(links), type(links[0]), links[0])
    return links
def mkdir(name, path):
    """Create one directory per entry of *name* underneath *path*.

    Args:
        name: Iterable of directory names. ``None`` entries are skipped
            (an anchor's ``.string`` can be ``None``), instead of failing
            on the string concatenation.
        path: Prefix prepended verbatim to every name, so it should end
            with a path separator.
    """
    print(name)
    for i in name:
        if i is None:
            continue
        file = path + i
        if not os.path.exists(file):
            # exist_ok covers the window between the existence check and
            # the creation (e.g. another worker creating the same folder).
            os.makedirs(file, exist_ok=True)
            print("makdie successful")
def getChapterUrl(url, page):
    """Build the listing-page addresses for every category link.

    Each link returned by ``get_url(url)`` is expanded into
    ``<link>index_1.html`` ... ``<link>index_<page>.html``.

    Args:
        url: Page handed to ``get_url`` to discover the category links.
        page: Number of listing pages generated per category.

    Returns:
        list of listing-page URLs, grouped by category in link order.
    """
    category_links = get_url(url)
    chapturl = [
        "{}index_{}.html".format(category, number)
        for category in category_links
        for number in range(1, page + 1)
    ]
    print("down page length=", len(chapturl), chapturl[0])
    return chapturl
def getdownloadUrl(url, page):
    """Collect the novel page links found on every listing page.

    Args:
        url: Page handed on to ``getChapterUrl``.
        page: Number of listing pages per category.

    Returns:
        list of absolute URLs, in listing order.
    """
    site_root = "https://www.qisuu.la"
    novel_href = []
    for listing_url in getChapterUrl(url, page):
        listing = BeautifulSoup(get_html(listing_url).text, "lxml")
        # The sixth <ul> on the listing page is the one read for links.
        novel_list = listing.find_all("ul")[5]
        anchors = BeautifulSoup(str(novel_list), "lxml").find_all('a')
        absolute = [site_root + anchor.get("href") for anchor in anchors]
        # Only every second anchor is kept, starting with the first.
        # NOTE(review): presumably each novel contributes two anchors - confirm.
        novel_href.extend(absolute[0::2])
    print("the length novrl-href ", len(novel_href), novel_href[0])
    return novel_href
def downAllUrl(url, page, path="/home/dflx/下载/novel/", per_page=15):
    """Map every novel's target file path to its TXT download address.

    Each novel page returned by ``getdownloadUrl`` is fetched and the
    argument list of its inline ``get_down_url(...)`` script call is split
    on commas. The second argument is stored as the download address and
    the third is used as the file name.

    Args:
        url: Page handed on to ``getdownloadUrl``.
        page: Number of listing pages per category.
        path: Directory prefix for the generated file paths; should end
            with a path separator.
        per_page: Number of novels assumed on each listing page. A novel's
            position divided by ``per_page * page`` selects its category in
            the module-level ``name`` list.

    Returns:
        dict of ``{target file path: download URL}``. Novels whose page
        cannot be fetched or parsed are reported and left out.
    """
    detail_urls = getdownloadUrl(url, page)
    print("----lenth=", len(detail_urls))
    print(name)
    novel_dict = {}
    down_call = re.compile(
        r'<script type="text/javascript">get_down_url\((.*?)\);</script>'
    )
    for i, detail_url in enumerate(detail_urls):
        # Best effort per novel: any failure skips just this entry.
        try:
            html = get_html(detail_url).text
            href = down_call.findall(html)
            get_inf = [part.strip("'") for part in href[0].split(",")]
            print(get_inf)
            # Get the novel's title (printed for progress only).
            soup = BeautifulSoup(html, "lxml")
            title = soup.find("h1").string
            print(title)
            index = i // (per_page * page)
            print(name[index])
            road = path + name[index] + "/" + get_inf[2] + ".txt"
            print(type(road), road)
            print(type(get_inf[1]), get_inf[1])
            # The former "download spend" timer was dropped: it wrapped only
            # this dict insert and printed t1 - t2, a meaningless negative.
            novel_dict[road] = get_inf[1]
        except Exception as e:
            print("has excepton continue ", e)
            continue
    print(len(novel_dict))
    return novel_dict
# Shared {target file path: download URL} table; written by threadDownload()
# and read by download() in the worker threads.
novel_inf = {}


def threadDownload(url, page):
    """Download every discovered novel using a pool of two threads.

    The module-level ``novel_inf`` is replaced with the result of
    ``downAllUrl(url, page)`` and ``download`` is run once per target path.

    Args:
        url: Page handed on to ``downAllUrl``.
        page: Number of listing pages per category.
    """
    global novel_inf
    novel_inf = downAllUrl(url, page)
    novel_path = list(novel_inf.keys())
    print(type(novel_path))
    pool = ThreadPool(2)
    try:
        # thread download novel
        t1 = time.time()
        pool.map(download, novel_path)
        t2 = time.time()
        # End minus start; the original subtraction was reversed and
        # reported a negative duration.
        print("download spend " + str(t2 - t1))
    finally:
        # Release the worker threads even when a download raised.
        pool.close()
        pool.join()
def download(ur):
    """Download one novel and save it to the file path *ur*.

    Args:
        ur: Target file path; also the key under which the download address
            is stored in the module-level ``novel_inf`` dict.

    A failed request or an HTTP error response is reported and skipped, so
    that one bad link neither aborts the whole thread-pool run nor leaves
    an error page saved as a novel.
    """
    url = novel_inf[ur]
    html = get_html(url)
    if html is None or not html.ok:
        print("download " + ur + " failed")
        return
    # The category folder may not exist yet; create it instead of failing
    # in open().
    parent = os.path.dirname(ur)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(ur, 'wb') as f:
        f.write(html.content)
    print("download "+ur+"success")
def main():
    """Download two listing pages of novels per category from the site."""
    url0 = "https://www.qisuu.la/"
    threadDownload(url0, 2)


# Nothing in the visible file ever called main(); run it only when the file
# is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
参考文献
BS4使用方法
Python-去除字符串中不想要的字符
Python 字典(Dictionary)操作详解
Python 字典(Dictionary)
一行 Python 实现并行化 -- 日常多线程操作的新思路
python中map()函数的用法讲解
Python map() 函数
Python3中如何实现dict.keys()的功能?