I'm trying to learn some web scraping with node.js. I chose a sample page, for example http://www.imdb.com/chart/top, and tried to scrape all the titles together with their ratings. I created a PhantomJS script (I need to use it because the page is dynamic, i.e. it relies on JavaScript running on the site). It's working, but I don't know how to write a loop that handles each title.
我正在尝试使用node.js学习网页抓取。我选择了一个示例页面,例如http://www.imdb.com/chart/top。然后我尝试抓取所有标题及其评分。我创建了一个PhantomJS脚本(我需要使用它,因为页面是动态的,它依赖网站上运行的JavaScript)。它可以正常工作,但我不知道如何为每个标题编写一个循环。
For example:
// Visit every row of the chart table and read its title and rating cells,
// stripping newlines from each. Note that both strings are computed and then
// discarded here: nothing is collected or returned yet.
$('.lister-list tr').each(function () {
  var row = $(this);
  row.find('.titleColumn').text().replace(/\n/g, '');
  row.find('.imdbRating').text().replace(/\n/g, '');
});
Then I put it all in a JSON file. At this moment I can only put data without loop. This is my script:
然后我把它全部放在一个JSON文件中。此时我只能在没有循环的情况下放入数据。这是我的脚本:
var phantom = require('phantom');
var fs = require('fs');
// Open the IMDb Top chart in PhantomJS, read the FIRST row's title and rating
// inside the page, and append them as one JSON line to java.json.
phantom.create(function (ph) {
  ph.createPage(function (page) {
    page.open("http://www.imdb.com/chart/top", function (status) {
      // Do not try to scrape a page that failed to load: release the
      // PhantomJS process and stop instead of writing meaningless data.
      if (status !== 'success') {
        console.error('Failed to open page, status: ' + status);
        ph.exit();
        return;
      }
      page.evaluate(function () {
        // Search the data with jQuery. This function is handed to
        // page.evaluate, so it relies on `$` being available in the page.
        var k_title = $('.lister-list tr .titleColumn').first().text().replace(/\n/g, '');
        var k_rating = $('.lister-list tr .imdbRating').first().text().replace(/\n/g, '');
        // Create the JSON data; a string is returned to the Node-side callback.
        var metadata = JSON.stringify({
          Title: k_title,
          Rating: k_rating
        });
        return metadata;
      }, function (result) {
        // Save the JSON data, one record per line.
        fs.appendFile('java.json', "\n" + result, function (err) {
          if (err) throw err;
          console.log('file is updated!');
        });
        // Display the data in the console.
        console.log('Result: ' + result);
        ph.exit();
      });
    });
  });
});
How can I make a loop that iterates over all the TR elements?
我如何能够循环遍历所有TR元素?
1 个解决方案
#1
2
What you need is an array. You could initialize an empty array and push new objects onto it like this:
你需要的是一个数组。您可以初始化一个空数组并将新对象推送到它上面,如下所示:
// Build an array with one { Title, Rating } object per table row and hand it
// back to Node as a JSON string.
page.evaluate(function () {
  var metadataList = [];
  $('.lister-list tr').each(function () {
    var row = $(this);
    // Newlines are stripped from the cell text; other whitespace is kept.
    metadataList.push({
      Title: row.find('.titleColumn').text().replace(/\n/g, ''),
      Rating: row.find('.imdbRating').text().replace(/\n/g, '')
    });
  });
  return JSON.stringify(metadataList);
}, function (result) {
  /* ... use the JSON string in `result` here ... */
});
You could also use the jQuery map() function to map each row to an object and create an array that way:
您还可以使用jQuery map()函数将每一行映射到一个对象,并创建一个数组:
// Same result as the each()/push() approach, but built with jQuery's map():
// every row is mapped to a { Title, Rating } object.
page.evaluate(function () {
  var metadataList = $('.lister-list tr').map(function () {
    var row = $(this);
    return {
      Title: row.find('.titleColumn').text().replace(/\n/g, ''),
      Rating: row.find('.imdbRating').text().replace(/\n/g, '')
    };
  }).get(); // get() unwraps the jQuery object returned by map() into a plain array
  return JSON.stringify(metadataList);
}, function (result) {
  /* ... use the JSON string in `result` here ... */
});
Notice that get() must be called on the map() result to retrieve the actual array and not the jQuery object.
请注意,必须在map()结果上调用get()来检索实际数组而不是jQuery对象。
You don't need jQuery to do this:
你不需要jQuery来做到这一点:
// jQuery-free variant: collect one { Title, Rating } object per table row
// using only DOM APIs, and return the list as a JSON string.
page.evaluate(function () {
  // Text of the first descendant of `tr` matching `selector`, with newlines
  // removed. Yields '' when the cell is missing, so a row without the cell
  // does not throw on `null.textContent`.
  function cellText(tr, selector) {
    var cell = tr.querySelector(selector);
    return cell ? cell.textContent.replace(/\n/g, '') : '';
  }

  var metadataList = [];
  // querySelectorAll returns a NodeList, so Array's forEach is borrowed.
  [].forEach.call(document.querySelectorAll('.lister-list tr'), function (tr) {
    metadataList.push({
      Title: cellText(tr, '.titleColumn'),
      Rating: cellText(tr, '.imdbRating')
    });
  });
  return JSON.stringify(metadataList);
}, function (result) {
  /* ... use the JSON string in `result` here ... */
});
#1
2
What you need is an array. You could initialize an empty array and push new objects onto it like this:
你需要的是一个数组。您可以初始化一个空数组并将新对象推送到它上面,如下所示:
// Build an array with one { Title, Rating } object per table row and hand it
// back to Node as a JSON string.
page.evaluate(function () {
  var metadataList = [];
  $('.lister-list tr').each(function () {
    var row = $(this);
    // Newlines are stripped from the cell text; other whitespace is kept.
    metadataList.push({
      Title: row.find('.titleColumn').text().replace(/\n/g, ''),
      Rating: row.find('.imdbRating').text().replace(/\n/g, '')
    });
  });
  return JSON.stringify(metadataList);
}, function (result) {
  /* ... use the JSON string in `result` here ... */
});
You could also use the jQuery map() function to map each row to an object and create an array that way:
您还可以使用jQuery map()函数将每一行映射到一个对象,并创建一个数组:
// Same result as the each()/push() approach, but built with jQuery's map():
// every row is mapped to a { Title, Rating } object.
page.evaluate(function () {
  var metadataList = $('.lister-list tr').map(function () {
    var row = $(this);
    return {
      Title: row.find('.titleColumn').text().replace(/\n/g, ''),
      Rating: row.find('.imdbRating').text().replace(/\n/g, '')
    };
  }).get(); // get() unwraps the jQuery object returned by map() into a plain array
  return JSON.stringify(metadataList);
}, function (result) {
  /* ... use the JSON string in `result` here ... */
});
Notice that get() must be called on the map() result to retrieve the actual array and not the jQuery object.
请注意,必须在map()结果上调用get()来检索实际数组而不是jQuery对象。
You don't need jQuery to do this:
你不需要jQuery来做到这一点:
// jQuery-free variant: collect one { Title, Rating } object per table row
// using only DOM APIs, and return the list as a JSON string.
page.evaluate(function () {
  // Text of the first descendant of `tr` matching `selector`, with newlines
  // removed. Yields '' when the cell is missing, so a row without the cell
  // does not throw on `null.textContent`.
  function cellText(tr, selector) {
    var cell = tr.querySelector(selector);
    return cell ? cell.textContent.replace(/\n/g, '') : '';
  }

  var metadataList = [];
  // querySelectorAll returns a NodeList, so Array's forEach is borrowed.
  [].forEach.call(document.querySelectorAll('.lister-list tr'), function (tr) {
    metadataList.push({
      Title: cellText(tr, '.titleColumn'),
      Rating: cellText(tr, '.imdbRating')
    });
  });
  return JSON.stringify(metadataList);
}, function (result) {
  /* ... use the JSON string in `result` here ... */
});