手记

HTTP 小爬虫(在原视频代码的基础上加以修改和完善)

运行效果图:

代码

/**
 * @file http小爬虫
 * @author <a href="www.llms.com">hwj</a>
 * @version 1.0
 */

/**
 * Node's built-in HTTP client, used to download the page.
 */
var http = require('http');
/**
 * cheerio, used to query the downloaded HTML.
 * Resolved by package name so Node's normal module lookup applies instead of
 * a hard-coded path into node_modules.
 */
var cheerio = require('cheerio');
/**
 * The page to scrape.
 */
var url = 'http://www.imooc.com/learn/348';

/**
 * Extracts the chapter/video outline from the HTML of a course page.
 *
 * Every `.chapter` element becomes one entry; every `li` directly under a
 * `.video` element inside that chapter becomes one video of the entry.
 *
 * @param {string} html Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, time: string, id: string}>}>}
 *     One object per chapter, in document order. `time` and `id` are empty
 *     strings when the page does not provide them.
 */
function filterChapters(html) {
    // DOM nodeType value for text nodes.
    var TEXT_NODE = 3;

    var $ = cheerio.load(html);
    var courseData = [];

    // Returns the trimmed text of the selection's own text nodes only,
    // ignoring any text that lives inside child elements.
    function ownText(selection) {
        return selection.contents().filter(function () {
            return this.nodeType === TEXT_NODE;
        }).text().trim();
    }

    $('.chapter').each(function () {
        var chapter = $(this);
        var chapterData = {
            "chapterTitle": ownText(chapter.find('strong')),
            "videos": []
        };

        chapter.find('.video').children('li').each(function () {
            var video = $(this).find('.J-media-item');

            // The link's own text holds the video title and its duration,
            // separated by a newline.
            var lines = ownText(video).split('\n');

            // An <li> without a link (or a link without "video/") yields an
            // empty id instead of throwing.
            var href = video.attr('href') || '';

            chapterData.videos.push({
                "title": lines[0].trim(),
                "time": (lines[1] || '').trim(),
                "id": href.split('video/')[1] || ''
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}

/**
 * Writes the course outline to the console with a single console.log call.
 *
 * Each chapter is rendered as its title on one line, followed by one
 * "[id] title time:duration" line per video and a trailing blank line.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, time: string, id: string}>}>} courseData
 *     Chapters to print, in display order.
 */
function printCoursrInfo(courseData) {
    var chapterBlocks = courseData.map(function (chapter) {
        var videoLines = chapter.videos.map(function (video) {
            return '[' + video.id + '] ' + video.title + ' time:' + video.time + '\n';
        });

        return chapter.chapterTitle + '\n' + videoLines.join('') + '\n';
    });

    console.log(chapterBlocks.join(''));
}

// Download the course page, then parse it and print the outline.
http.get(url, function (res) {
    // Only a 200 response carries the course page; anything else (redirect,
    // error page) would otherwise be parsed as if it were the course.
    if (res.statusCode !== 200) {
        console.log('爬取失败!!!!', 'HTTP ' + res.statusCode);
        // Discard the body so the connection is released.
        res.resume();
        return;
    }

    // Decode at the stream level: converting each Buffer chunk to a string
    // on its own corrupts multi-byte UTF-8 characters that straddle two chunks.
    res.setEncoding('utf8');

    var html = '';

    res.on('data', function (chunk) {
        html += chunk;
    });

    res.on('end', function () {
        var courseData = filterChapters(html);
        printCoursrInfo(courseData);
    });

}).on('error', function (err) {
    // Report the underlying cause alongside the failure message.
    console.log('爬取失败!!!!', err.message);
});
19人推荐
随时随地看视频
慕课网APP

热门评论

老哥写的真心不错,我基本照抄还分析不好dom。

查看全部评论