继续浏览精彩内容
慕课网APP
程序员的梦工厂
打开
继续
感谢您的支持,我会继续努力的
赞赏金额会直接到老师账户
将二维码发送给自己后长按识别
微信支付
支付宝支付

python下载奇书网的小说

东风冷雪
关注TA
已关注
手记 62
粉丝 73
获赞 369

网址为:奇书网
以前下载小说是爬取所有的章节,提取文章,构成小说,那样太慢,一本小说几千章节,好久才能下载一本, 现在找到一个有TXT下载的地方,直接读写文章了.
图片.png
准备按照首页,那边的栏目,下载玄幻奇幻 武侠仙侠 女频言情 现代都市 历史军事 游戏竞技 科幻灵异 美文同人 剧本教程 名著杂志10个专题的小说
整个过程用到了不少东西

# Fetch the page and parse it with BeautifulSoup
html=requests.get(url,headers=headers,verify=False).text
soup=BeautifulSoup(html,'lxml')
# Look up the first <p> tag by name.
# BUG FIX: the original bound the tag to ``tag_a`` but every later line
# used ``tag_p``, which would raise NameError.
tag_p=soup.p
# Read the tag name and its attributes (three equivalent ways)
name = tag_p.name
title = tag_p.attrs.get('title')
title = tag_p.get('title')
title = tag_p['title']

find&&find_all查找

# find() returns only the first matching tag (find_all() returns every match).
soup.find('a')
# Keyword arguments filter on attributes, e.g. title="hhh" or a given id.
soup.find('a',title='hhh')
soup.find('a',id='')
# ``class`` is a Python keyword, so BeautifulSoup spells the filter ``class_``.
soup.find('a',class_='')

去掉开头,结尾的字符,用strip()函数
分割字符串使用 split()函数,追加元素到list中使用append()函数,如果需要把另外一个list的元素一一追加到另外一个list需要使用extend函数

            html=get_html(url[i]).text
            #title=re.find(re0,html)[0]
            href=re.findall(re1,html)
            #print(href)
            infor=re.split(",",href[0])
            get_inf=[]
            #print(infor)
            for j in range(len(infor)):
                te=infor[j].strip("'").strip("'")
                get_inf.append(te)

    # 这里采用正则表达式,也可以使用 title=cont[i].get("title")
        text=re.findall(re0,str(cont[i]))
        title.extend(text)

注意,这里是坑,被坑了好一会
Python3的dictionary的keys(), values(), items()返回的都是迭代器,如果需要像Python2一样返回列表,只要传给list就行了:

   novel_path=list(novel_inf.keys())
   novel_url=list(novel_inf.values())

下载小说,准备使用map函数实现多线程,加快速度,
全局变量要在函数外、文件最开始的地方定义;在函数中想改变其值,必须先用 global 声明,不然函数内的赋值会创建同名局部变量,屏蔽全局变量
参考文献

相关code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 16 22:36:52 2018

@author: dflx
"""

import requests
import os
import re
from bs4 import BeautifulSoup
import time
from multiprocessing.dummy import Pool as ThreadPool

def get_html(url):
    """Fetch *url* and return the requests.Response, or None on failure.

    Sends a desktop-browser User-Agent and disables TLS verification
    (the target site's certificate is apparently broken), then guesses
    the real encoding from the body since the site mis-declares charset.
    """
    try:
        headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',}
        response=requests.get(url,headers=headers,verify=False)
        print(response.status_code)
        # BUG FIX: raise_for_status is a method; the original referenced it
        # without calling it, so HTTP 4xx/5xx errors were silently ignored.
        response.raise_for_status()
        response.encoding=response.apparent_encoding
        return response
    except Exception as e:
        print("has exception:",e)
        return None

# Module-level accumulator for the topic names scraped from the front page.
name=[]
def get_url(url):
    """Scrape the ten topic links from the front page and return their URLs.

    Side effect: appends each topic's anchor text to the global ``name``
    list, which later functions use to build per-topic directories.
    """
    global name
    re0=r'title="(.*?)">'
    soup=BeautifulSoup(get_html(url).text,"lxml")
    anchors=soup.find_all('a')
    title=[]
    href=[]
    print("length=",len(anchors))
    print(type(anchors[0]))
    # Anchors at positions 2..11 are the ten topic links on the front page.
    for tag in anchors[2:12]:
        print(tag)
        name.append(tag.string)
        # Regex pull of the title attribute; tag.get("title") would also work.
        title.extend(re.findall(re0,str(tag)))
        href.append("https://www.qisuu.la"+tag.get("href"))

    print("title length=",len(title),type(title[0]),title[0])
    print("name length=",len(name),type(name[0]),name[0])
    print("href length",len(href),type(href[0]),href[0])
    return href

def mkdir(name,path):
    """Create one sub-directory of *path* per entry in *name*.

    Existing directories are left untouched. Callers pass *path* with a
    trailing separator; os.path.join copes either way.
    """
    print(name)
    for sub in name:
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists + os.makedirs pair.
        os.makedirs(os.path.join(path,sub),exist_ok=True)
    print("mkdir successful")   # fixed typo: original printed "makdie successful"

def getChapterUrl(url,page):
    """Expand every topic URL into its first *page* listing pages.

    Each topic href gets listing pages index_1.html .. index_<page>.html
    appended; returns the flat list of listing-page URLs.
    """
    listing_pages=[
        topic+"index_"+str(num)+".html"
        for topic in get_url(url)
        for num in range(1,page+1)
    ]
    print("down page length=",len(listing_pages),listing_pages[0])
    return listing_pages

def getdownloadUrl(url,page):
    """Collect the per-novel detail-page URLs from every listing page.

    On each listing page the 6th <ul> holds the novel list; its anchors
    alternate (novel link, other link), so only every second anchor
    (positions 0, 2, 4, ...) is kept.
    """
    novel_href=[]
    for page_url in getChapterUrl(url,page):
        page_soup=BeautifulSoup(get_html(page_url).text,"lxml")
        novel_list=page_soup.find_all("ul")[5]
        # Re-parse the <ul> in isolation to search only inside it,
        # exactly as the original did.
        anchors=BeautifulSoup(str(novel_list),"lxml").find_all('a')
        for tag in anchors[::2]:
            novel_href.append("https://www.qisuu.la"+tag.get("href"))
    print("the length novrl-href ",len(novel_href),novel_href[0])
    return novel_href

def downAllUrl(url,page):
    """Visit every novel detail page and build {save_path: txt_url}.

    Returns a dict mapping the local .txt path (grouped under the topic
    name from the global ``name`` list) to the novel's direct download URL.
    Failures on individual pages are logged and skipped.
    """
    url=getdownloadUrl(url,page)
    print("----lenth=",len(url))
    print(name)
    novel_dict={}
    path="/home/dflx/下载/novel/"
    # The download link is passed to get_down_url(...) in an inline script tag.
    re1=r'<script type="text/javascript">get_down_url\((.*?)\);</script>'
    for i in range(len(url)):
        try:
            html=get_html(url[i]).text
            href=re.findall(re1,html)
            # get_down_url's arguments: quoted strings separated by commas;
            # strip the surrounding single quotes from each.
            infor=re.split(",",href[0])
            get_inf=[arg.strip("'") for arg in infor]
            print(get_inf)

            # The novel's display name lives in the page's <h1>.
            soup=BeautifulSoup(html,"lxml")
            title=soup.find("h1").string
            print(title)

            # Maps novel index back to its topic; assumes 15 novels per
            # listing page — TODO confirm against the site's layout.
            index=i//(15*page)
            print(name[index])
            road=path+name[index]+"/"+get_inf[2]+".txt"
            print(type(road),road)
            print(type(get_inf[1]),get_inf[1])
            t1=time.time()
            novel_dict[road]=get_inf[1]
            t2=time.time()
            # BUG FIX: the original printed t1-t2, a negative duration.
            print("download spend "+str(t2-t1))
        except Exception as e:
            print("has excepton continue ",e)
            continue
    print(len(novel_dict))
    return novel_dict

# Global {save_path: download_url} map shared with the download() worker,
# because pool.map passes only one argument per call.
novel_inf={}
def threadDownload(url,page):
    """Discover all novels, then download them with a 2-thread pool.

    Stores the {path: url} map in the global ``novel_inf`` so the
    ``download`` worker can look up each URL by its destination path.
    """
    global novel_inf
    novel_inf=downAllUrl(url,page)
    novel_path=list(novel_inf.keys())
    print(type(novel_path))

    pool=ThreadPool(2)
    t1=time.time()
    pool.map(download,novel_path)
    t2=time.time()
    # BUG FIX: the original printed t1-t2, a negative elapsed time.
    print("download spend "+str(t2-t1))
    pool.close()
    pool.join()

def download(ur):
    """Thread-pool worker: fetch one novel's .txt and write it to *ur*.

    *ur* is the destination file path; the source URL is looked up in
    the global ``novel_inf`` dict populated by threadDownload().
    """
    url=novel_inf[ur]
    response=get_html(url)
    if response is None:
        # get_html already logged the failure; skip this novel instead of
        # crashing the worker on None.content (original raised AttributeError).
        print("download "+ur+" failed")
        return
    with open(ur,'wb') as f:
        f.write(response.content)
    print("download "+ur+" success")   # fixed missing space before "success"

def main():
    """Entry point: crawl the front page and thread-download two listing
    pages per topic."""
    sample_txt="https://dzs.qisuu.la/txt/22617.txt"   # sample direct .txt URL
    front_page="https://www.qisuu.la/"
    novel_dir="/home/dflx/下载/novel/"
    # Earlier manual steps, kept for reference:
    #getdownloadUrl(front_page,1)
    #get_url(front_page)
    #mkdir(name,novel_dir)
    #downAllUrl(front_page,3)
    threadDownload(front_page,2)

小说专题

专题里面的小说情况

参考文献

BS4使用方法
Python-去除字符串中不想要的字符
Python 字典(Dictionary)操作详解
Python 字典(Dictionary)
一行 Python 实现并行化 -- 日常多线程操作的新思路
python中map()函数的用法讲解
Python map() 函数
Python3中如何实现dict.keys()的功能?

打开App,阅读手记
7人推荐
发表评论
随时随地看视频慕课网APP