手记

Promise重构http爬虫代码

var http = require('http');
var cheerio = require('./node_modules/cheerio');
// Endpoint prefixes on imooc.com; a course id is appended to each one
// when a course is fetched.
var baseUrl = {
    htmlUrl: 'http://www.imooc.com/learn/',
    numberUrl: 'http://www.imooc.com/course/AjaxCourseMembers?ids='
};
// Ids of the courses to crawl.
var videoIds = ['728', '637', '348', '259', '197', '134', '75'];
// Address of a single course page (course 348).
var url = baseUrl.htmlUrl + videoIds[2];
/**
 * Extract the course title and the chapter/video listing from a course page.
 *
 * The course title is read from `.hd h2`. For every `.chapter` element the
 * chapter title is taken from the direct text of its `strong` element, and one
 * video entry is produced per `li` child of `.video`.
 *
 * @param {string} html Raw HTML of a course page.
 * @returns {{title: string, number: string, chapters: Array<{chapterTitle: string, videos: Array<{title: string, time: string, id: (string|undefined)}>}>}}
 *     The parsed course. `number` is always returned as '' here; the caller
 *     fills it in.
 */
function filterChapters(html) {
    var $ = cheerio.load(html);

    var coursesData = {
        title: $('.hd h2').text().trim(),
        number: '',
        chapters: []
    };

    // Trimmed text of the selection's own text nodes (nodeType 3), skipping
    // any text that lives inside child elements.
    function ownText(selection) {
        return selection.contents().filter(function () {
            return this.nodeType === 3;
        }).text().trim();
    }

    $('.chapter').each(function () {
        var chapter = $(this);
        var chapterData = {
            chapterTitle: ownText(chapter.find('strong')),
            videos: []
        };

        chapter.find('.video').children('li').each(function () {
            var video = $(this).find('.J-media-item');
            // The link text holds the video title and its duration,
            // separated by a newline.
            var parts = ownText(video).split('\n');
            var href = video.attr('href');

            chapterData.videos.push({
                title: parts[0].trim(),
                // Fall back to '' when the text has no second line instead of
                // throwing on undefined.
                time: (parts[1] || '').trim(),
                // The id is whatever follows "video/" in the link; undefined
                // when the link or that segment is missing.
                id: href ? href.split('video/')[1] : undefined
            });
        });

        coursesData.chapters.push(chapterData);
    });

    return coursesData;
}

/**
 * Print a plain-text summary of every course with a single console.log call.
 *
 * For each course the output contains a header line with its title and
 * learner count, then each chapter title followed by one line per video
 * (id, title and duration), then two blank lines.
 *
 * @param {Array<{title: string, number: *, chapters: Array<{chapterTitle: string, videos: Array<{title: string, time: string, id: string}>}>}>} coursesData
 *     Courses to print, in the order they should appear.
 * @returns {void}
 */
function printCoursrInfo(coursesData) {
    var courseMessage = '';

    coursesData.forEach(function (course) {
        courseMessage += course.title + ' 学习人数:' + course.number + '\n';

        // Read the chapters straight off the course; assigning them to an
        // undeclared variable would leak a global (and throw in strict mode).
        course.chapters.forEach(function (chapter) {
            courseMessage += '\n' + chapter.chapterTitle + '\n';

            chapter.videos.forEach(function (video) {
                courseMessage += '[' + video.id + '] ' + video.title + ' time:' + video.time + '\n';
            });
        });

        courseMessage += '\n\n';
    });

    console.log(courseMessage);
}

/**
 * Asynchronously fetch one course: first its HTML page, then its learner count.
 *
 * The two requests run one after the other. The learner count is read from
 * `data[0].numbers` of the JSON returned by `url.numberUrl`.
 *
 * @param {{htmlUrl: string, numberUrl: string}} url Addresses of the course
 *     page and of the learner-count endpoint.
 * @returns {Promise<{html: string, number: *}>} Resolves with the page HTML and
 *     the learner count as found in the JSON response. Rejects when either
 *     request fails or when the learner-count response cannot be parsed.
 */
function getPageAsync(url) {
    // Download `address` and resolve with the complete response body as text.
    function fetchText(address) {
        return new Promise(function (resolve, reject) {
            http.get(address, function (res) {
                var body = '';
                // Decode at the stream level so a multi-byte character split
                // across two chunks is not corrupted by string concatenation.
                res.setEncoding('utf8');
                res.on('data', function (chunk) {
                    body += chunk;
                });
                res.on('end', function () {
                    resolve(body);
                });
                res.on('error', reject);
            }).on('error', reject);
        });
    }

    return new Promise(function (resolve) {
        // Kept inside an executor so that a bad `url` argument surfaces as a
        // rejection rather than a synchronous throw.
        console.log('正在爬取网页的内容: ' + url.htmlUrl + '\n');
        resolve(fetchText(url.htmlUrl));
    }).then(function (html) {
        console.log('html获取完毕,开始获取学习人数......');

        return fetchText(url.numberUrl).then(function (resData) {
            // Parsing happens inside the promise chain, so malformed JSON or
            // an unexpected shape rejects instead of crashing the process.
            var number = JSON.parse(resData).data[0].numbers;
            console.log('获取学习人数成功');

            return {
                "html": html,
                "number": number
            };
        }).catch(function (e) {
            console.log('获取人数失败: ' + e.message);
            throw e;
        });
    }, function (e) {
        // Second argument of then(): handles only the HTML request, so a
        // learner-count failure is not reported twice.
        console.log('获取html失败: ' + e.message);
        throw e;
    });
}

// Start one fetch per course id. All requests are issued up front and the
// resulting promises are collected for Promise.all below.
var fetchCourseArray = videoIds.map(function (id) {
    return getPageAsync({
        "htmlUrl": baseUrl.htmlUrl + id,
        "numberUrl": baseUrl.numberUrl + id
    });
});

Promise
    .all(fetchCourseArray)
    .then(function (pages) {
        var coursesData = pages.map(function (content) {
            var course = filterChapters(content.html);
            course.number = parseInt(content.number, 10);
            return course;
        });

        // Most-studied course first. A sort comparator must return a
        // negative/zero/positive number; returning a boolean does not give a
        // reliable ordering.
        coursesData.sort(function (a, b) {
            return b.number - a.number;
        });

        printCoursrInfo(coursesData);
    })
    .catch(function (e) {
        // Promise.all rejects as soon as any single course fails; report it
        // instead of leaving an unhandled rejection.
        console.log('爬取课程失败: ' + e.message);
    });
2人推荐
随时随地看视频
慕课网APP