用node写小爬虫

    先前有用php写过小爬虫,其实本质就是获取网页中的Dom结构,然后分析里面的元素,提取出自己想要的东西。最近想找东西练练node,就想到用node做小爬虫。去看了慕课网scott老师讲的课程,感觉很棒,学到了不少东西。自己跟着做了一遍,爬了下慕课网的课程。明天准备再去写一个脚本爬自己的博客。下面是学到的东西的总结。感谢scott老师。

    最后结果大概是这样的。

    中间我们要去分析慕课网的地址,然后得到类似下面的结构:


    包模块的选择

    这里除了http核心模块外,还用到了bluebird和cheerio这两个模块。bluebird封装了promise,方便我们组织异步调用。cheerio则可以让我们像用jquery一样方便地操作Dom。我们的目的是分析dom结构,操作dom节点,获取我们想要的内容,并且拼接成下面这样的对象(每门课程一个,最后放进一个数组里):

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    // Shape of the record built for one course page (schematic: the
    // right-hand names stand for the values scraped from the page).
    var coursesData = {
        title:title,                    // course title
        number:number,                  // number of learners
        videos:[                        // one entry per chapter
            {
                chapterTitle:chapterTitle,
                chapterVideos:[         // one entry per video in the chapter
                    {
                        title:videoTitle,
                        id:id
                    }
                ]
            }
        ]
    }


    直接上代码

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    var http = require('http')
    var Promise = require('bluebird')
    var cheerio = require('cheerio')
    // Common prefix of every course page URL; a course id is appended to it
    // to build each request URL.
    var baseUrl = 'http://www.imooc.com/learn/'
    // Ids of the courses to crawl.
    var videoIds = [348,259,197,134,75]
    /**
     * Extract the course title, learner count and chapter/video outline from
     * the HTML of one course page.
     *
     * @param {string} data - HTML source of a course page.
     * @returns {{title: string, number: number, videos: Array}} One record per
     *   course; `videos` holds one `{chapterTitle, chapterVideos}` entry per
     *   chapter, and each `chapterVideos` item is `{title, id}`. `number` is
     *   NaN when the learner count cannot be parsed.
     */
    function filterChapter(data){
        var $ = cheerio.load(data)
        var title = $('#main .w .path span').text()
        // The learner count is the <strong> inside the third `.static-item`.
        var number = parseInt($($('.statics .static-item')[2]).find('strong').text().trim(),10)
        var courseData = {
            title:title,
            number:number,
            videos:[]
        }
        $('.chapter').each(function(){
            // Per-chapter state is declared inside the callback so that
            // nothing leaks from one chapter into the next.
            var chapter = $(this)
            var chapterCourses = {
                chapterTitle:chapter.find('strong').text(),
                chapterVideos:[]
            }
            chapter.find('.video').children('li').each(function(){
                var video = $(this).find('.studyvideo')
                // A list item without a link has no href; fall back to an
                // empty string instead of throwing on `undefined.split`.
                var href = video.attr('href') || ''
                // Use the last non-empty path segment as the id. Indexing
                // `split('/')[1]` returns 'video' for an absolute link such
                // as '/video/123'.
                // NOTE(review): assumes the id is the final path segment of
                // the link — verify against the live markup.
                var segments = href.split('/').filter(Boolean)
                chapterCourses.chapterVideos.push({
                    title:video.text(),
                    id:segments[segments.length - 1]
                })
            })
            courseData.videos.push(chapterCourses)
        })
        return courseData
    }
    /**
     * Print the crawled courses to the console: first one summary line per
     * course, then each course's chapter/video outline.
     *
     * @param {Array<Object>} coursesData - Course records shaped like the
     *   objects returned by filterChapter.
     */
    function printChapter(coursesData){
        // Pass 1: learner count and title of every course.
        for (const course of coursesData) {
            console.log(`${course.number}个人学过 ${course.title} \n`)
        }
        // Pass 2: full outline, course by course.
        for (const course of coursesData) {
            console.log(`${course.title}###\n`)
            for (const chapter of course.videos) {
                console.log(`${chapter.chapterTitle}\n`)
                for (const video of chapter.chapterVideos) {
                    console.log(` [${video.id}]${video.title}\n`)
                }
            }
        }
    }
    /**
     * Fetch a page with an HTTP GET and return a promise for its body.
     *
     * @param {string} url - Address of the page to download.
     * @returns {Promise<string>} Resolves with the whole response body decoded
     *   as UTF-8; rejects with the error emitted by the request or the
     *   response stream.
     */
    function getPageAsync(url){
        return new Promise(function(resolve,reject){
            http.get(url,function(res){
                // Keep the raw chunks and decode once at the end: decoding
                // chunk by chunk (`html += data`) corrupts any multi-byte
                // character that is split across two chunks.
                var chunks = []
                res.on('data',function(data){
                    chunks.push(Buffer.isBuffer(data) ? data : Buffer.from(data))
                })
                res.on('end',function(){
                    resolve(Buffer.concat(chunks).toString('utf8'))
                })
                // A failure while the body is streaming must also settle the
                // promise; otherwise the caller would wait forever.
                res.on('error',reject)
            }).on('error',function(e){
                reject(e)
                console.log('获取失败')
            })
        })
    }
    // Start one page request per course id. The requests are all issued up
    // front; the resulting promises are collected for Promise.all below.
    var fetchCourseArr = videoIds.map((id) => {
        const pageUrl = baseUrl + id
        console.log('开始爬取网站' + pageUrl)
        return getPageAsync(pageUrl)
    })
    // Once every page has been downloaded: parse each one, sort the courses
    // by learner count (ascending) and print the result.
    Promise.all(fetchCourseArr)
        .then(function(pages){
            var coursesData = pages.map(function(html){
                return filterChapter(html)
            })
            coursesData.sort(function(a,b){
                return a.number-b.number;
            })
            printChapter(coursesData)
        })
        .catch(function(err){
            // Promise.all rejects as soon as one request fails, and parsing
            // may throw too; report the failure instead of leaving the
            // rejection unhandled.
            console.error('爬取失败: ' + (err && err.message ? err.message : err))
        })

    中间收获

    用request发起请求

    如何用node伪造一个请求。比如就在本地提交评论到慕课网?

    注意这里的headers可以直接先去评论一个,去看看请求headers,然后复制过来就可以啦。

    中间遇到一个错误:模块名querystring全是小写,虽然官方文档介绍它的时候写的是queryString,这个地方要注意。然后可以用http.request方法发起一个请求,最后把数据post过去就可以了。Cookie那个地方要换成自己本地浏览器里的cookie。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    var http = require('http')
    var querystring = require('querystring')
    // Form-encoded body of the comment request.
    var postData = querystring.stringify({
        'content':'老师你真帅,测试',
        'cid':348
    })
    // Request options mirroring the headers the browser sends when a comment
    // is posted from the site.
    var options = {
        hostname:'www.imooc.com',
        port:80,
        path:'/course/docomment',
        method:'POST',
        headers:{
            'Accept':'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding':'gzip, deflate',
            'Accept-Language':'zh-CN,zh;q=0.8',
            'Connection':'keep-alive',
            // Must equal the body size in BYTES. A hard-coded number goes
            // stale as soon as the comment text changes, and the request
            // then fails with "socket hang up".
            'Content-Length':Buffer.byteLength(postData),
            'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
            // Placeholder: paste the cookie of a logged-in browser session.
            'Cookie':'......',
            'Host':'www.imooc.com',
            'Origin':'http://www.imooc.com',
            'Referer':'http://www.imooc.com/comment/348',
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36',
            'X-Requested-With':'XMLHttpRequest'
        }
    }
    // Send the comment and log what the server answers.
    var req = http.request(options,function(res){
        console.log('res statecode'+res.statusCode)
        console.log('res headers'+ JSON.stringify(res.headers))
        res.on('data',function(chunk){
            console.log(Buffer.isBuffer(chunk))
            console.log(typeof chunk)
        })
        res.on('end',function(){
            console.log('评论完毕')
        })
        res.on('error',function(e){
            // `message` is a string property, not a method.
            console.log('评论出现错误'+e.message)
        })
    })
    // Connection-level failures (for example "socket hang up") are emitted on
    // the request object, not on the response; without this listener they
    // would crash the process as an unhandled 'error' event.
    req.on('error',function(e){
        console.log('评论出现错误'+e.message)
    })
    req.write(postData)
    req.end()

    这中间又出了个错,就是headers里的content-length和post过去的数据长度不一致,导致socket hang up。解决方法就是把content-length改成数据的实际字节长度。


    关于promise

    promise的话,新版本的node已经原生内置了Promise,不需要额外引入;如果是旧版本,就可以用bluebird这类库。然后再看看promise好在哪里,代码或许比文字更能说明问题。下面是三个小球,我们让它们一个接着一个地产生动画。

    看下dom结构:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    <style>
    /* Shared shape of every ball: a 30px square rounded into a circle. */
    .box{
    width:30px;
    height:30px;
    -webkit-border-radius:50%;
    -moz-border-radius:50%;
    border-radius:50%;
    }
    /* One colour per ball. */
    .box1{
    background:red;
    }
    .box2{
    background:yellow;
    }
    .box3{
    background:blue;
    }
    </style>
    <!-- The three balls. margin-left is set inline because the animation
         scripts read and write element.style.marginLeft. -->
    <div class="box box1" style="margin-left:0px"></div>
    <div class="box box2" style="margin-left:0px"></div>
    <div class="box box3" style="margin-left:0px"></div>

    没用promise之前,callback嵌套。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    <script src="node_modules/bluebird/js/browser/bluebird.js"></script>
    <script>
    // Look up the three ball elements by their class names.
    var box1 = document.querySelector(".box1");
    var box2 = document.querySelector(".box2");
    var box3 = document.querySelector(".box3");
    /**
     * Move `ball` horizontally, one pixel every 13ms, until its inline
     * margin-left reaches `distance`, then invoke `callback`.
     *
     * @param {{style: {marginLeft: string}}} ball - Element to move.
     * @param {number} distance - Target margin-left in pixels.
     * @param {Function} [callback] - Called once the target is reached.
     */
    function animate(ball,distance,callback){
        setTimeout(function(){
            // An unset or unparseable margin-left parses to NaN, which could
            // never equal the target and would keep the timer chain running
            // forever; treat it as 0.
            var marginLeft = parseInt(ball.style.marginLeft,10) || 0;
            if(Math.abs(distance - marginLeft) < 1){
                // Less than one step away: land exactly on the target. This
                // also ends the animation for fractional targets, which
                // whole-pixel steps would otherwise overshoot back and forth.
                if(marginLeft !== distance){
                    ball.style.marginLeft = distance + "px";
                }
                callback && callback();
                return;
            }
            marginLeft += marginLeft < distance ? 1 : -1;
            console.log(marginLeft);
            ball.style.marginLeft = marginLeft + "px";
            animate(ball,distance,callback);
        },13)
    }
    // Callback-style sequencing: each animation is started from the
    // completion callback of the previous one, so the six moves run strictly
    // one after another — at the cost of one nesting level per step.
    animate(box1,100,() => {
        animate(box2,200,() => {
            animate(box3,300,() => {
                animate(box3,150,() => {
                    animate(box2,150,() => {
                        animate(box1,150,() => {
                            console.log("animate done");
                        });
                    });
                });
            });
        });
    });
    </script>

    用了promise后,也是异步调用,但是却可以像同步代码一样来编写。是不是看着很好?

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    <!DOCTYPE html>
    <script src="node_modules/bluebird/js/browser/bluebird.js"></script>
    <script>
    // Look up the three ball elements by their class names.
    var box1 = document.querySelector(".box1");
    var box2 = document.querySelector(".box2");
    var box3 = document.querySelector(".box3");
    // Use whatever Promise implementation is exposed on window.
    // NOTE(review): presumably the one installed by the bluebird script
    // included above — confirm.
    var Promise = window.Promise;
    /**
     * Move `ball` horizontally, one pixel every 5ms, until its inline
     * margin-left reaches `distance`.
     *
     * @param {{style: {marginLeft: string}}} ball - Element to move.
     * @param {number} distance - Target margin-left in pixels.
     * @returns {Promise<undefined>} Resolves once the target is reached; it
     *   never rejects.
     */
    function promiseAnimate(ball,distance){
        return new Promise(function(resolve){
            function step(){
                setTimeout(function(){
                    // An unset or unparseable margin-left parses to NaN,
                    // which could never equal the target and would leave the
                    // promise pending forever; treat it as 0.
                    var marginLeft = parseInt(ball.style.marginLeft,10) || 0;
                    if(Math.abs(distance - marginLeft) < 1){
                        // Less than one step away: land exactly on the
                        // target. This also settles the promise for
                        // fractional targets, which whole-pixel steps would
                        // otherwise overshoot back and forth.
                        if(marginLeft !== distance){
                            ball.style.marginLeft = distance + "px";
                        }
                        resolve();
                        return;
                    }
                    marginLeft += marginLeft < distance ? 1 : -1;
                    ball.style.marginLeft = marginLeft + "px";
                    step();
                },5)
            }
            step();
        })
    }
    // Promise-style sequencing: every `then` starts the next animation only
    // after the previous promise has resolved, so the six moves still run
    // one after another but the code stays flat.
    promiseAnimate(box1,100)
        .then(() => promiseAnimate(box2,200))
        .then(() => promiseAnimate(box3,300))
        .then(() => promiseAnimate(box3,150))
        .then(() => promiseAnimate(box2,150))
        .then(() => promiseAnimate(box1,150));
    </script>

    官网上关于promise有个例子,里面我看到了这个insertAdjacentHTML,觉得好棒。大部分浏览器都支持了。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    <script>
    'use strict';
    // Number of promises created so far; used to label the log lines.
    var promiseCount = 0;
    /**
     * Create one demo promise and append a line to the `#log` element at each
     * stage, showing which parts run synchronously and which run later.
     */
    function testPromise() {
        const seq = ++promiseCount;
        const logEl = document.getElementById('log');
        // Every message is appended at the end of the log element.
        const append = (html) => logEl.insertAdjacentHTML('beforeend', html);

        append(`${seq}) Started (<small>Sync code started</small>)<br/>`);

        // The executor runs immediately; the promise is fulfilled with this
        // call's sequence number after a random delay of 1 to 3 seconds.
        const pending = new Promise((resolve) => {
            append(`${seq}) Promise started (<small>Async code started</small>)<br/>`);
            const delay = Math.random() * 2000 + 1000;
            window.setTimeout(() => {
                resolve(seq);
            }, delay);
        });

        pending
            .then((val) => {
                // Fulfilment: log the value the promise resolved with.
                append(`${val}) Promise fulfilled (<small>Async code terminated</small>)<br/>`);
            })
            .catch((reason) => {
                // Rejection: report the reason on the console.
                console.log(`Handle rejected promise (${reason}) here.`);
            });

        append(`${seq}) Promise made (<small>Sync code terminated</small>)<br/>`);
    }
    </script>

    总结

    觉得学到挺多东西的。其实感觉node不知道怎么入手,书也不知道看啥好,有点迷茫。有看一些node书,感觉大部分都是讲语法和API,还是多做点东西,实践中学习,心里稍微踏实点。再次谢谢scott老师。