手记

node.js爬虫 爬取知乎图片存储到本地

前言:

一直对node.js有兴趣,但因为时间关系,却一直没有坚持下去,平时都是拿node.js打包做一些杀鸡用牛刀的事情,最近终于下狠心挤出打游戏的时间来学习node.js。干学不动手没个意思,看网上爬虫那么火,也来蹭一把热度,既是分享,也是记录一下自己学习node.js的经历。

一.目标

目标网站:知乎
说明:爬取知乎某个问题下除用户头像以外的所有图片,存储到本地文件夹。

二.准备工作

想根据本文学习node爬虫的童鞋注意:本文需要一点es6基础,因为这个node爬虫是基于es6写的。

三.依赖包

cheerio:cheerio的用法在网上有很多,cheerio可以让我们很方便的处理爬虫爬取的数据,简单来说你会用jQuery就会用cheerio。给个地址:https://cnodejs.org/topic/5203a71844e76d216a727d2e
superagent:一个http请求库,网上也有很多教程,基本用法都能找得到。给个地址:https://cnodejs.org/topic/5378720ed6e2d16149fa16bd
主要是依赖以上两个包,基本用法懂就可以了。
给出package.json:

{
  "name": "Crawler",
  "version": "0.0.0",
  "private": true,
  "scripts": {
    "start": "node ./app"
  },
  "dependencies": {
    "babel": "^6.23.0",
    "babel-cli": "^6.24.1",
    "babel-preset-stage-3": "^6.22.0",
    "babel-register": "^6.24.0",
    "babel-core": "^6.25.0",
    "babel-preset-es2015": "^6.24.1",
    "body-parser": "~1.16.0",
    "cookie-parser": "~1.4.3",
    "debug": "~2.6.0",
    "ejs": "~2.5.5",
    "express": "~4.14.1",
    "morgan": "~1.7.0",
    "nodemailer": "^4.0.1",
    "nodemailer-smtp-transport": "^2.7.4",
    "serve-favicon": "~2.3.2",
    "async": "^2.0.0-rc.6",
    "cheerio": "^0.20.0",
    "eventproxy": "^0.3.4",
    "superagent": "^2.0.0"
  },
  "devDependencies": {
    "babel-plugin-transform-async-to-generator": "^6.24.1",
    "babel-plugin-transform-es2015-classes": "^6.24.1",
    "babel-plugin-transform-es2015-modules-commonjs": "^6.24.1",
    "babel-plugin-transform-export-extensions": "^6.22.0",
    "iconv-lite": "^0.4.19",
    "jsonwebtoken": "^7.3.0",
    "mongoose": "^4.9.5"
  }
}

四.了解目标网站

想要爬取网站,首先你得了解该网站的结构。
知乎需要登录验证身份,每个问题下的答案都是ajax请求。
我们只需要找到这个ajax,然后再用superagent发起请求就行了,听起来很简单吧。
了解目标网站,我说的有些简单了(主要是偷懒),给出个python爬取知乎分析网站的案例:https://zhuanlan.zhihu.com/p/28190133 我也是看了该文后,才想用node来爬取知乎的。网上也找了下没有node爬知乎图片的案例,只有自己捣鼓一个。需注意该案例与现在知乎网站的结构有一点不一样了。

五.爬取主要代码分析

// Fetch one page of answers for `question_number` from Zhihu's v4 API.
// `offset` is a zero-based page index; each page holds up to 20 answers.
// The response body text is stored in the outer `result` variable and the
// enclosing promise is settled through `resolve` (both defined outside this excerpt).
let api = 'https://www.zhihu.com/api/v4/questions/' + question_number + '/answers';
            let params = {
                'offset': offset * 20,
                'limit': 20,
                'sort_by': 'default',
                'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics'
            };
            superagent
                .get(api)
                // GET parameters belong in the query string; `.send()` would put them
                // in a request body, which servers are free to ignore for GET.
                .query(params)
                .set({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.8',
                    // The HTTP header name is spelled "Referer" (single "r" in the middle).
                    'Referer': 'https://www.zhihu.com/question/' + question_number,
                    // NOTE(review): hard-coded session cookie. It will expire and it exposes
                    // account credentials in source; consider loading it from config
                    // the same way `config.authorization` is.
                    'Cookie': '_zap=a9ab675a-86b5-4321-a470-6a6fa719c592; d_c0="ABAChSwEuwuPTsVi6z1Dhz74xjjcELKItwY=|1494309012"; q_c1=84247641386f42da89d380baa324b695|1507791590000|1490256815000; r_cap_id="ZTM3ZTdhZjgwYmI3NDFjZmE1NDY3ZTJkMjgyYTVhNGM=|1512982040|832117bc35fa9640624b84b90d1d5e62403e8874"; cap_id="ZjM2OTRlMjY4NzE2NDEwYmEzN2NkNjlmNzlmZDQwMzc=|1512982040|cb2ae0375d7bcdfe314f4377a455d4b678f90275"; z_c0=Mi4xWGJIUkFnQUFBQUFBRUFLRkxBUzdDeGNBQUFCaEFsVk5JSlFiV3dBMUxmSE5mSmZiWGgyUEdTTnVrdmhuQWt1aEtB|1512982048|61da95df7ad7e07c46f4178364a720f31a0fb705; _xsrf=305d9f2d912732fb436abc754ae69baf; q_c1=84247641386f42da89d380baa324b695|1514164200000|1490256815000; __utma=51854390.575502078.1506408307.1514445619.1514533066.7; __utmz=51854390.1514533066.7.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.100--|2=registration_date=20160401=1^3=entry_date=20160401=1; aliyungf_tc=AQAAAI7TeTyWOwwAfi343K/lCrMz7Bgx; _xsrf=305d9f2d912732fb436abc754ae69baf',
                    'Host': 'www.zhihu.com',
                    'Connection': 'keep-alive',
                    'Accept': 'application/json, text/plain, */*',
                    'authorization': config.authorization,
                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                    'X-UDID': 'ABAChSwEuwuPTsVi6z1Dhz74xjjcELKItwY=',
                })
                .end((err, res) => {
                    // Surface request failures instead of dropping them silently.
                    if (err) console.error(err);
                    // Store the body before resolving; `result` keeps its previous
                    // value when no response object is available.
                    if (res) result = res.text;
                    resolve();
                });

该代码就是用superagent发起的请求,然后用cheerio分析爬取的数据,遍历出图片:

 // Parse the answer's HTML and collect the `original` data value of every
 // <img> element; images without that value are skipped.
 const content = item.content;
 const $ = cheerio.load(content);
 $('img').each((position, img) => {
   const original = $(img).data('original') || '';
   if (original) {
     photo_url_arr.push(original);
   }
 });

去重之后下载就可以了

// Download `url` into `config.storage_path`, naming the file after the last
// path segment of the URL. `callback` is invoked exactly once, after the file
// has been fully written or the download has failed. Failures are logged and
// reported to `callback` as success, so one bad image does not abort the batch.
request.head(url, (err, res, body) => {
                    let finished = false;
                    // Guard against double invocation: several stream events can signal the end.
                    const done = () => {
                        if (finished) return;
                        finished = true;
                        callback(null, result);
                    };
                    const fail = () => {
                        this._log('失败:下载图片=>' + url + '时');
                        done();
                    };
                    // The HEAD probe failed, so skip the download but still notify the caller.
                    if (err) return fail();
                    try {
                        const startTime = Date.now();
                        const target = config.storage_path + url.substring(url.lastIndexOf('/') + 1);
                        request(url)
                            .on('response', () => {
                                const endTime = Date.now();
                                this._log('一共%s=>下载...%s.. %s, 耗时: %ss', [arr_url.length, arr_url.indexOf(url) + 1, url, (endTime - startTime) / 1000]);
                            })
                            // Network errors and file-system errors arrive as events,
                            // so a surrounding try/catch alone cannot see them.
                            .on('error', fail)
                            .pipe(fs.createWriteStream(target))
                            .on('error', fail)
                            .on('finish', done);
                    } catch (e) {
                        // Synchronous failures (e.g. while building the streams) must also
                        // reach the caller, otherwise it waits forever.
                        fail();
                    }
                });

如果想要遍历问题下所有答案下的图片,就需要做个循环。

// Walk the answer pages of `options.question_number` one by one, starting at
// page `offset` (zero-based, 20 answers per page), and append the `original`
// data value of every <img> found in each answer to `photo_url_arr`.
// Stops when `options.count_number` pages have been passed, when the API
// returns no usable data, or after the last page has been processed.
let loop = async (offset = 0) => {
            if (options.count_number && options.count_number < offset) return this._log('执行成功=>');
            let result = await this._fetchHtmlByApi(options.question_number, offset);
            // Parse once. The body may be missing or not JSON (e.g. a failed or
            // unauthorized request), which must not crash the whole crawl.
            let parsed;
            try {
                parsed = JSON.parse(result) || {};
            } catch (err) {
                return this._log('失败:解析接口返回数据=>当前页' + offset);
            }
            let data = parsed.data || [];
            // `paging` is absent on error responses; treat that as "no answers".
            let totals = (parsed.paging && parsed.paging.totals) || 0;
            if (!photo_url_arr.length && totals) this._log('该问题回答数=>' + totals + '个');
            if (!data.length || !totals) return this._log('执行成功=>没有更多答案了');
            data.forEach((answer) => {
                let $ = cheerio.load(answer.content);
                $('img').each((index, img) => {
                    let photo_url = $(img).data('original') || '';
                    photo_url && photo_url_arr.push(photo_url);
                });
            });
            // Check for the last page only AFTER its answers have been collected,
            // otherwise the final page's images are silently dropped.
            if ((offset + 1) * 20 >= totals) return this._log('执行成功=>没有更多答案了');
            return loop(offset + 1);
        };
        await loop();

六.最后

最后我爬取了知乎29814297问题下所有的图片,得到了1920个图片


本项目已上传到github,有兴趣的童鞋可以下载下来运行下。
https://github.com/Woshiajuana/Crawler

如果对你有所帮助或者觉得应用不错,欢迎给星。如果有什么疑问或建议,也欢迎随时找我。o(∩_∩)o

好啦,溜了溜了~~~

16人推荐
随时随地看视频
慕课网APP

热门评论

兄dei,好像跑不了

程序启动=>  2018-4-19 09:35:17
即将:解析页面=>当前页0  2018-4-19 09:35:17
成功:解析页面=>当前页0  2018-4-19 09:35:17
(node:38424) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 1): TypeError: Cannot read property 'totals' of undefined


查看全部评论