仅仅是初步的解析,对于动态网页还没有好的办法。
希望高人可以给出建议。
Jsoup比较方便,只要记住了select的规则就ok了
不明白的多试试就ok了!
代码贴出来分享一下:
//package org.jsoup.examples;
import java.net.UnknownHostException;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.Mongo;
import com.mongodb.MongoException;
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Example program that scrapes one Foursquare venue page with jsoup and stores
 * the extracted fields as a single document in a MongoDB collection.
 *
 * <p>The stored document has the keys {@code Title}, {@code Score},
 * {@code Similar} and {@code Events}; the last two are sub-documents whose keys
 * are 1-based positions ("1", "2", ...) and whose values are the link texts.
 */
public class jsouptest {

    /**
     * Fetches the venue page, prints the extracted values and inserts them into
     * the {@code PaCHongColl} collection of the {@code PaChong} database on
     * {@code localhost:27017}.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Mongo mongo = new Mongo("localhost", 27017);
        try {
            DB db = mongo.getDB("PaChong");
            DBCollection collection = db.getCollection("PaCHongColl");

            String url = "https://foursquare.com/v/singapore-zoo/4b05880ef964a520b8ae22e3";
            print("Fetching %s...", url);
            Document doc = Jsoup.connect(url).get();

            BasicDBObject document = new BasicDBObject();

            // Page title.
            String title = doc.title();
            document.put("Title", title);
            print("%s", title);

            // Combined text of every element matched by the rating selector.
            String score = doc.select(".rating>span").text();
            document.put("Score", score);
            print("%s ", score);

            // Links directly under #similarVenues, stored by position.
            Elements similar = doc.select("#similarVenues>a");
            document.put("Similar", numberedTexts(similar));
            print("%s", similar.text());

            // Links directly under #exploreNearby, stored by position.
            Elements nearby = doc.select("#exploreNearby>a");
            document.put("Events", numberedTexts(nearby));

            collection.insert(document);
        } finally {
            // Release the client's connections even when fetching or inserting fails.
            mongo.close();
        }
    }

    /**
     * Builds a sub-document mapping the 1-based position of each element (as a
     * decimal string) to that element's text.
     *
     * @param elements the selected elements, possibly empty
     * @return a new object with one entry per element; empty if nothing matched
     */
    private static BasicDBObject numberedTexts(Elements elements) {
        BasicDBObject numbered = new BasicDBObject();
        int position = 0;
        for (Element element : elements) {
            position++;
            numbered.put(String.valueOf(position), element.text());
        }
        return numbered;
    }

    /**
     * Formats {@code msg} with {@code args} via {@link String#format} and prints
     * the result followed by a line break to standard output.
     *
     * @param msg  the format string
     * @param args the format arguments
     */
    private static void print(String msg, Object... args) {
        System.out.println(String.format(msg, args));
    }
}
//org/jsoup/examples/ListLinks.java
爬下来的东西存在了mongodb中,以方便今后的查看。
代码主要是抓取了title和div中的信息。