Jsoup解析HTML文件

时间:2022-02-09 08:14:18

仅仅是初步的解析,对于动态网页还没有好的办法。

希望高人可以给出建议。

Jsoup比较方便,只要记住了select的规则就ok了

不明白的多试试就ok了!

代码贴出来分享一下:

//package org.jsoup.examples;
import java.net.UnknownHostException;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.Mongo;
import com.mongodb.MongoException;

import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Example program that fetches one Foursquare venue page with jsoup and stores
 * the extracted fields in MongoDB.
 *
 * <p>The stored document has four fields: {@code Title} (page title),
 * {@code Score} (text of {@code .rating>span}), {@code Similar} (texts of
 * {@code #similarVenues>a}) and {@code Events} (texts of
 * {@code #exploreNearby>a}). The last two are sub-documents keyed "1", "2", ...
 * in document order.
 */
public class jsouptest {

    /** Venue page that is fetched and parsed. */
    private static final String VENUE_URL =
            "https://foursquare.com/v/singapore-zoo/4b05880ef964a520b8ae22e3";

    /**
     * Fetches the venue page, prints the extracted values to standard output
     * and inserts one document into collection {@code PaCHongColl} of database
     * {@code PaChong} on {@code localhost:27017}.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched (or the Mongo host
     *                     cannot be resolved by the driver)
     */
    public static void main(String[] args) throws IOException {
        Mongo mongo = new Mongo("localhost", 27017);
        try {
            DB db = mongo.getDB("PaChong");
            DBCollection collection = db.getCollection("PaCHongColl");

            print("Fetching %s...", VENUE_URL);
            Document doc = Jsoup.connect(VENUE_URL).get();

            BasicDBObject document = new BasicDBObject();

            String title = doc.title();
            document.put("Title", title);
            print("%s", title);

            // Combined text of every element matching the rating selector.
            String score = doc.select(".rating>span").text();
            document.put("Score", score);
            print("%s ", score);

            Elements similarLinks = doc.select("#similarVenues>a");
            document.put("Similar", numberedTexts(similarLinks));
            print("%s", similarLinks.text());

            Elements nearbyLinks = doc.select("#exploreNearby>a");
            document.put("Events", numberedTexts(nearbyLinks));

            collection.insert(document);
        } finally {
            // Release the driver's connections even when fetching or inserting fails.
            mongo.close();
        }
    }

    /**
     * Builds a sub-document mapping "1", "2", ... to the text of each element,
     * in iteration order.
     *
     * @param elements elements whose text should be recorded
     * @return a new object with one entry per element; empty if there are none
     */
    private static BasicDBObject numberedTexts(Elements elements) {
        BasicDBObject numbered = new BasicDBObject();
        int index = 0;
        for (Element element : elements) {
            index++;
            numbered.put(String.valueOf(index), element.text());
        }
        return numbered;
    }

    /**
     * Formats {@code msg} with {@code args} via {@link String#format} and
     * prints the result followed by a line break.
     *
     * @param msg  format string
     * @param args format arguments
     */
    private static void print(String msg, Object... args) {
        System.out.println(String.format(msg, args));
    }
}
//org/jsoup/examples/ListLinks.java

爬下来的东西存在了mongodb中,以方便今后的查看。

代码主要是查了 title、div 等元素的信息。