手记

基于 Node.js 的网站爬虫程序开发

webside_parser_node

基于 Node.js 的网站爬虫程序。

1. 安装依赖
yarn    # 或者使用 npm install
2. 文件目录结构
+-- src
|   +-- HtmlDownloader  // 网页下载器
|   +-- HtmlParser      // 网页解析器
|   +-- Outputer        // 内容输出
|   +-- UrlManager      // URL 管理
|   +-- main            // 主入口和调度
3. 主要实现
//main.js
/**
 * Drives the crawl: takes URLs from the UrlManager, downloads each page,
 * feeds the HTML to the parser, queues the discovered links and collects the
 * extracted content. When the queue is empty (or the page limit is reached)
 * the collected content is handed to the outputer.
 */
class ParserScheduler {
  constructor() {
    // Number of pages handed to the downloader so far.
    this.parseCount = 0;
    // Upper bound on pages to fetch; Infinity means "until the queue is empty".
    this.maxCount = Infinity;
    this.urls = new UrlManager();
    this.downloader = new HtmlDownloader();
    this.outputer = new Outputer();
    this.parser = new HtmlParser();
  }

  /**
   * Processes the next queued URL and then chains itself for the following
   * one, until no URL is left or `maxCount` pages have been fetched.
   *
   * @returns {Promise<void>} Settles once the crawl has finished and
   *   `outputer.output()` has been called.
   */
  parse() {
    const {urls, downloader, parser, outputer} = this;

    // Check the queue BEFORE dequeuing so an empty queue never produces a
    // download of `undefined`.
    if (!urls.hasNewUrl() || this.parseCount >= this.maxCount) {
      console.log('complete!');
      outputer.output();
      return Promise.resolve();
    }

    const newUrl = urls.getNewUrl();
    console.log('new url:' + newUrl);
    this.parseCount++;

    return downloader
      .download(newUrl)
      .then(html => {
        const [newUrls, content] = parser.parse(html);
        urls.addNewUrls(newUrls);
        outputer.collectData(content);
      })
      .catch(error => {
        // One broken page must not abort the crawl: report it and move on.
        console.error('failed to process ' + newUrl, error);
      })
      // Returning the next step keeps the whole crawl on a single promise
      // chain, so callers can wait for completion.
      .then(() => this.parse());
  }

  /**
   * Seeds the queue with `rootUrl` and starts crawling.
   *
   * @param {number} [count] Optional maximum number of pages to fetch.
   *   Omitted, non-numeric or non-positive values mean "no limit".
   * @returns {Promise<void>} The promise returned by `parse()`.
   */
  start(count) {
    this.maxCount = typeof count === 'number' && count > 0 ? count : Infinity;
    this.urls.addNewUrl(rootUrl);
    return this.parse();
  }
}

//parser
/**
 * Extracts follow-up links and course banner images from a page.
 */
class HtmlParser {
  /**
   * @param {string} html Page markup. `null`/`undefined` is treated as an
   *   empty document.
   * @returns {[string[], {src: string, alt: string}[]]} A pair of
   *   `[links, images]`: the `href` values of `.text-page-tag` elements that
   *   contain `/course/list`, and one `{src, alt}` entry per `.course-banner`
   *   element that has a `src` attribute.
   */
  parse(html) {
    const aLinks = [];
    const images = [];
    const $ = cheerio.load(html == null ? '' : html);

    $('.text-page-tag').each((index, item) => {
      const href = item.attribs.href;
      // Elements without an href attribute carry no link; skipping them
      // avoids a TypeError on `undefined.indexOf`.
      if (typeof href === 'string' && href.indexOf('/course/list') >= 0) {
        aLinks.push(href);
      }
    });

    $('.course-banner').each((index, item) => {
      const src = item.attribs.src;
      // The caption is read from the name element inside the enclosing card.
      const alt = $(item)
        .closest('.course-card-container')
        .find('.course-card-name')
        .text();
      if (src) {
        images.push({src, alt});
      }
    });

    return [aLinks, images];
  }
}

//downloader
/**
 * Fetches a page with `request` and exposes the result as a promise.
 */
class HtmlDownloader {
  /**
   * @param {string} url Address of the page to fetch.
   * @returns {Promise<*>} Resolves with the response body passed to the
   *   `request` callback; rejects with the error reported by `request`.
   */
  download(url) {
    const options = {
      headers: {
        'User-Agent': 'Mozilla/5.0',
      },
    };
    return new Promise((resolve, reject) => {
      request(url, options, (error, response, body) => {
        if (error) {
          reject(error);
          // Stop here so the promise is not also "resolved" after rejecting.
          return;
        }
        resolve(body);
      });
    });
  }
}

//urlmanager
/**
 * Keeps the crawl frontier: a FIFO queue of URLs still to visit (`newUrls`)
 * and the list of URLs already handed out (`oldUrls`).
 */
class UrlManager {
  /**
   * @param {string} [baseUrl] Origin prepended to site-relative links.
   */
  constructor(baseUrl = 'http://www.imooc.com') {
    this.baseUrl = baseUrl;
    this.newUrls = [];
    this.oldUrls = [];
  }

  /** @returns {boolean} Whether at least one URL is waiting in the queue. */
  hasNewUrl() {
    return this.newUrls.length !== 0;
  }

  /**
   * Removes the oldest queued URL and records it as visited.
   *
   * @returns {string|undefined} The URL, or `undefined` if the queue is empty.
   */
  getNewUrl() {
    // Without this guard an empty queue would record `undefined` as visited.
    if (!this.hasNewUrl()) {
      return undefined;
    }
    const url = this.newUrls.shift();
    if (!this.oldUrls.includes(url)) {
      this.oldUrls.push(url);
    }
    return url;
  }

  /**
   * Queues a URL unless it is already queued or visited. Absolute http(s)
   * URLs are kept as-is, protocol-relative ones (`//host/path`) get an
   * `http:` scheme, everything else is resolved against `baseUrl`.
   * Non-string and empty values are ignored.
   *
   * @param {string} url Absolute, protocol-relative or site-relative link.
   */
  addNewUrl(url) {
    if (typeof url !== 'string' || url === '') {
      return;
    }

    let absolute;
    if (/^https?:\/\//i.test(url)) {
      // Anchored test: a relative link such as "/go?to=http://x" merely
      // CONTAINS "http" and must still be resolved against the base.
      absolute = url;
    } else if (url.startsWith('//')) {
      absolute = 'http:' + url;
    } else {
      absolute = this.baseUrl + (url.startsWith('/') ? '' : '/') + url;
    }

    if (!this.newUrls.includes(absolute) && !this.oldUrls.includes(absolute)) {
      this.newUrls.push(absolute);
    }
  }

  /**
   * Queues every entry of `urls`; anything that is not an array is ignored.
   *
   * @param {string[]} urls Links to queue.
   */
  addNewUrls(urls) {
    if (!Array.isArray(urls)) {
      return;
    }
    for (const url of urls) {
      this.addNewUrl(url);
    }
  }
}
//outputer
/**
 * Collects `{src, alt}` image records during the crawl and, on `output()`,
 * downloads every image into the output directory.
 */
class Outputer {
  /**
   * @param {string} [outputDir] Directory prefix (including the trailing
   *   separator) that image files are written under.
   */
  constructor(outputDir = 'D:\\parser_pics\\') {
    this.data = [];
    // Both backslashes are escaped: in a string literal "\p" is not an
    // escape sequence, so "D:\parser_pics" would silently lose its separator.
    this.outputDir = outputDir;
  }

  /**
   * Streams one image to `<outputDir><encoded filename>.jpg`. Download and
   * write failures are reported asynchronously on the streams, so they are
   * logged here instead of being left as unhandled `error` events.
   *
   * @param {string} url Image address; scheme-less values get `http:` prepended.
   * @param {string} filename Base name of the file; URI-encoded before use.
   * @throws {TypeError} If `url` is not a non-empty string.
   */
  _getImage(url, filename) {
    console.log('写入图片文件:'+filename);
    if (typeof url !== 'string' || url === '') {
      throw new TypeError('image url must be a non-empty string');
    }
    // Anchored scheme test so "https://..." is not turned into "http:https://...".
    const imageUrl = /^https?:/i.test(url) ? url : 'http:' + url;
    const target = this.outputDir + encodeURIComponent(filename) + '.jpg';
    const opts = {
      headers: {
        'User-Agent': 'Mozilla/5.0',
      },
    };
    const logFailure = error => console.log(error);

    const file = fs.createWriteStream(target);
    file.on('error', logFailure);
    request(imageUrl, opts).on('error', logFailure).pipe(file);
  }

  /**
   * Appends the given records to the collected data; anything that is not an
   * array is ignored.
   *
   * @param {Array} datas Records to collect.
   */
  collectData(datas) {
    if (Array.isArray(datas)) {
      this.data.push(...datas);
    }
  }

  /**
   * Starts a download for every collected record. A record that fails
   * synchronously is logged and skipped so the rest are still processed.
   */
  output() {
    for (const item of this.data) {
      try {
        this._getImage(item.src, item.alt);
      } catch (e) {
        console.log(e);
      }
    }
  }
}
4. 运行
node main.js
5. 说明

目前支持的 Node.js 版本为 10.0.0。

0人推荐
随时随地看视频
慕课网APP