Node2.js - 秒客网

用 Node.js 写的一个简单爬虫，代码是跟着慕课网的课程抄的；网站页面有一点点改动，贴上来方便以后复习。

// Node core HTTP client, used below to download the course page
var http = require('http')

// HTML parser used to query the downloaded page with CSS selectors
var cheerio = require('cheerio')

// Course page whose chapter/video outline is scraped
var url = 'http://www.imooc.com/learn/348'

/**
 * Parse a course page and extract its chapter / video outline.
 *
 * Result shape:
 *   [{ chapterTitle: '', videos: [{ title: '', id: '' }] }]
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One entry per `.chapter` element, in document order. A video's `id` is
 *   undefined when its link is missing or its href contains no '/video/'.
 */
function filterChapters(html) {
    var $ = cheerio.load(html)

    // .text() carries over the indentation and newlines of the markup;
    // collapse every whitespace run to one space and trim the ends.
    function cleanText(text) {
        return text.replace(/\s+/g, ' ').trim()
    }

    var courseData = []

    // cheerio's .each() passes (index, element) to the callback
    $('.chapter').each(function (chapterIndex, chapterElement) {
        var chapter = $(chapterElement)

        var chapterData = {
            chapterTitle: cleanText(chapter.find('h3').text()),
            videos: []
        }

        chapter.find('.video').children('li').each(function (videoIndex, videoElement) {
            var video = $(videoElement).find('.J-media-item')

            // .attr() yields undefined when the <li> holds no matching link;
            // guard so one malformed entry does not abort the whole scrape.
            var href = video.attr('href')
            var id = typeof href === 'string' ? href.split('/video/')[1] : undefined

            chapterData.videos.push({
                title: cleanText(video.text()),
                id: id
            })
        })

        courseData.push(chapterData)
    })

    return courseData
}

/**
 * Print a course outline with console.log: each chapter title, followed by
 * one line per video in the form 【id】title.
 *
 * @param {Array<{chapterTitle: *, videos: Array<{id: *, title: *}>}>} courseData
 *   Chapter records, e.g. as produced by filterChapters.
 */
function printCourseInfo(courseData) {
    // Every message ends in '\n' on top of console.log's own line break,
    // so consecutive entries are separated by a blank line.
    function printVideo(entry) {
        console.log('【' + entry.id + '】' + entry.title + '\n')
    }

    function printChapter(chapter) {
        console.log(chapter.chapterTitle + '\n')
        chapter.videos.forEach(printVideo)
    }

    courseData.forEach(printChapter)
}

// Download the course page, then parse it and print the outline.
http.get(url, function (res) {
    // A non-2xx response (redirect, error page) is not the course page,
    // so report it instead of parsing it and silently printing nothing.
    if (res.statusCode < 200 || res.statusCode >= 300) {
        console.log('获取课程数据出错', 'HTTP ' + res.statusCode)
        res.resume() // discard the body so the connection is released
        return
    }

    // Decode at the stream level: appending raw Buffer chunks to a string
    // decodes each chunk on its own and corrupts any multi-byte UTF-8
    // character that straddles a chunk boundary.
    res.setEncoding('utf8')

    var html = ''

    res.on('data', function (chunk) {
        html += chunk
    })

    res.on('end', function () {
        printCourseInfo(filterChapters(html))
    })
}).on('error', function (err) {
    console.log('获取课程数据出错', err.message)
})

效果

Node2.js

数据还有一点没整理好，得日后再弄。

Node2.js

就是把不想要的内容也抓回来了，现在还不懂怎么把数据清洗干净，先记下来。