进击node.js(二) 使用promise对象的爬虫，个人代码-原创手记-慕课网

如果直接复制代码到sublime里变成了一行，请参考如下地址：
http://www.imooc.com/article/6981

几处解释：
1.步骤1-3都是针对单个课程页面编写的代码，标注“扩展”二字的步骤则是从单个课程扩展到多个课程时添加的代码。编写顺序也是先完成单课程页面的爬取，再扩展到多课程的爬取。每一个函数的作用也都尽可能做了注释。

2.mytext()这个函数的作用等同于text()，只不过去掉了子元素的文本（因为text()函数默认会把子元素的文本也包括进来）。

3.课程人数因为爬取到的html文件中并不包含，所以无法获取（解析出来的结果是NaN）。这个BUG我找了好久才发现，坑爹啊。

绝大部分地方我都尽量注释了，如果有不明白的可以询问。有不正确的地方请指正。
Promise对象是个难点，一定要先弄清楚它的用法再去看代码。

var http = require('http')
var cheerio = require('cheerio')
var basicUrl = 'http://www.imooc.com/learn/'
var Promise = require('bluebird')
var baseUrl = 'http://www.imooc.com/learn/637'
    //***扩展变量
// IDs of the courses to crawl; each one is appended to basicUrl to build the page URL.
var coursesIds = [348, 637, 259, 75, 197];
// Step 4 (multi-course extension): start one page download per course ID and keep
// the resulting promises in the same order as coursesIds.
// getPageAsync is a function declaration, so it is hoisted and callable here.
var pagesArr = coursesIds.map(function(courseId) {
    return getPageAsync(basicUrl + courseId)
})
    //1.获取界面html并返回准备传入解析函数,利用Promise对象进行Async的页面获取
/**
 * Downloads a page over HTTP and hands back its body as a string.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise} Resolves with the complete response body once the
 *     response ends; rejects with the error emitted by either the request
 *     or the response stream.
 */
function getPageAsync(url) {
    return new Promise(function(resolve, reject) {
        // Shared failure path for request-level and response-level errors.
        function fail(e) {
            reject(e)
            console.log('爬取页面信息失败')
        }

        http.get(url, function(res) {
            console.log('正在爬取' + url + '\n')
            var html = ''
            // Let the stream decode UTF-8 itself: appending raw Buffer chunks
            // converts each chunk separately and corrupts any multi-byte
            // character that happens to be split across two chunks.
            res.setEncoding('utf8')
            res.on('data', function(chunk) {
                html += chunk
            })
            res.on('end', function() {
                console.log('页面' + url + '爬取完毕')
                resolve(html)
            })
            // Without this listener a failure while the body is streaming
            // would never settle the promise.
            res.on('error', fail)
        }).on('error', fail)
    })
}
/**
 * Step 2 - parser: extracts the course title, learner count and the
 * chapter/video outline from the HTML of one course page.
 *
 * Shape of the returned object:
 *   {
 *     courseTitle: string,
 *     courseLearners: number,      // NaN when no number can be parsed
 *     chapters: [{
 *       chapterTitle: string,
 *       videos: [{ title: string, id: string|undefined }]
 *     }]
 *   }
 *
 * @param {string} html - Markup of a course page.
 * @returns {Object} The courseData object described above.
 */
function filterChapters(html) {
    var $ = cheerio.load(html)

    // Custom helper: same as text(), except that text belonging to child
    // elements is dropped (text() includes descendants by default). It works
    // on a clone, so the parsed document itself is not modified.
    $.fn.mytext = function() {
        return $(this).clone()
            .children()
            .remove()
            .end()
            .text()
    }

    var courseTitle = $('.course-infos').find('.hd').find('h2').mytext().trim()

    // The original author notes that the fetched HTML does not contain the
    // learner count, so this is expected to come out as NaN.
    var learnersText = $('.statics')
        .children('.static-item')
        .find('span[class="meta-value js-learn-num"]')
        .mytext()
    // Explicit radix so the text is always read as a decimal number.
    var courseLearners = parseInt(learnersText, 10)

    var courseData = {
        courseTitle: courseTitle,
        courseLearners: courseLearners,
        chapters: []
    }

    // Walk every chapter on the page.
    $('.chapter').each(function() {
        var $chapter = $(this)
        var chapterData = {
            chapterTitle: $chapter.find('strong').mytext().replace(/\s/g, ''),
            videos: []
        }

        // Walk every video entry listed under this chapter.
        $chapter.find('.video').children('li').each(function() {
            var $link = $(this).find('.J-media-item')
            // attr() yields undefined when the entry has no link or no href;
            // guard it so one malformed entry cannot throw and abort the
            // parsing of the whole page.
            var href = $link.attr('href')
            chapterData.videos.push({
                title: $link.mytext().replace(/\s/g, ''),
                // The video id is whatever follows "video/" in the link.
                id: typeof href === 'string' ? href.split('video/')[1] : undefined
            })
        })

        courseData.chapters.push(chapterData)
    })

    return courseData
}

/**
 * Step 3 - output: writes one course's data to the console in a fixed layout:
 * a banner with the course title, the learner count, then each chapter
 * followed by its videos, and finally a blank separator line.
 *
 * @param {Object} courseData - Object with courseTitle, courseLearners and
 *     a chapters array of { chapterTitle, videos: [{ id, title }] }.
 */
function printInfo(courseData) {
    // One line per video: "[id]title".
    function printVideo(video) {
        console.log('        >课程:[' + video.id + ']' + video.title)
    }

    // Chapter heading followed by all of the chapter's videos.
    function printChapter(chapter) {
        console.log('   @@@章节：' + chapter.chapterTitle + '@@@\n')
        chapter.videos.forEach(printVideo)
    }

    console.log('#####################课程:' + courseData.courseTitle + '#######################')
    console.log('                                                   ->该课程学习人数:' + courseData.courseLearners + '\n')
    courseData.chapters.forEach(printChapter)
    console.log('\n')
}

// //函数运行主体
// getPageAsync(baseUrl)
//     .then(function(html){
//         var courseData=filterChapters(html)//利用解析函数解析所爬取的页面并返回课程信息对象
//         printInfo(courseData)//输出最终整理完毕的课程信息对象
//     })

// Extended entry point: wait until every course page has been downloaded,
// then parse and print each page in the order the requests were created.
Promise
    .all(pagesArr)
    .then(function(pages) {
        pages.forEach(function(page) {
            var courseData = filterChapters(page)
            printInfo(courseData)
        })
    })
    // A failed download rejects the whole batch and a parse error throws inside
    // then(); report either one here instead of leaving the rejection unhandled.
    .catch(function(e) {
        console.error('课程信息爬取失败:', e)
    })