Node2.js

时间:2023-03-08 18:47:16

一个简单的 Node.js 爬虫,也是跟着慕课网上的课程抄的,网站有一点点改动,粘上来好复习嘛。

var http = require('http')
var cheerio = require('cheerio')
// Course page whose chapter/video listing is scraped.
var url = 'http://www.imooc.com/learn/348';

/**
 * Collapse every run of whitespace (including newlines) into a single space
 * and strip leading/trailing whitespace.
 *
 * @param {string} text - Raw text extracted from the page.
 * @returns {string} The normalized text.
 */
function cleanText(text) {
  return text.replace(/\s+/g, ' ').trim();
}

/**
 * Extract the chapter list from the course page markup.
 *
 * Result shape:
 *   [{
 *     chapterTitle: '',
 *     videos: [{ title: '', id: '' }]
 *   }]
 *
 * @param {string} html - HTML source of the course page.
 * @returns {Array} One entry per `.chapter` element, in document order.
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var chapters = $('.chapter');
  var courseData = [];

  chapters.each(function () {
    var chapter = $(this);
    var chapterData = {
      chapterTitle: cleanText(chapter.find('h3').text()),
      videos: []
    };

    chapter.find('.video').children('li').each(function () {
      var video = $(this).find('.J-media-item');
      // attr() returns undefined when the <li> has no matching link;
      // fall back to '' so split() cannot throw.
      var href = video.attr('href') || '';
      // The id is whatever follows '/video/' in the link target.
      var id = href.split('/video/')[1] || '';

      chapterData.videos.push({
        title: cleanText(video.text()),
        id: id
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}

/**
 * Print every chapter title followed by its videos to stdout.
 *
 * @param {Array} courseData - Value returned by filterChapters().
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    console.log(item.chapterTitle + '\n');

    item.videos.forEach(function (video) {
      console.log('【' + video.id + '】' + video.title + '\n');
    });
  });
}

// Fetch the course page, then parse and print it once the body is complete.
http.get(url, function (res) {
  if (res.statusCode !== 200) {
    console.error('获取课程数据出错: HTTP ' + res.statusCode);
    // Drain the response so the socket is released.
    res.resume();
    return;
  }

  // Decode as UTF-8 at the stream level; concatenating raw Buffer chunks can
  // split a multi-byte character across two chunks and corrupt it.
  res.setEncoding('utf8');

  var html = '';

  res.on('data', function (data) {
    html += data;
  });

  res.on('end', function () {
    var courseData = filterChapters(html);
    printCourseInfo(courseData);
  });
}).on('error', function (err) {
  console.error('获取课程数据出错: ' + err.message);
});

效果

Node2.js

数据还有一点没整理好,得日后再弄。

Node2.js

就是把不想要的也取回来了,现在还不懂怎么数据清洗干净,先记下来。