如果不太懂Jsoup,看看这里就会了:http://blog.csdn.net/disiwei1012/article/details/51614177
要爬取的网站:http://www.ltaaa.com
======================下面是正文=========================
先看下需要爬取的网页
先看下龙腾网(http://www.ltaaa.com)在 Firebug 中查看到的页面结构:
再看下我抓取后生成的页面:
实体:
/**
 * Mutable value holder for one news entry: a numeric id plus title, link,
 * content and image-link strings.
 */
public class News {

    private String title;
    private String href;
    private String content;
    private String imghref;
    private int id;

    /** Creates an entry whose string fields are {@code null} and whose id is 0. */
    public News() {
        this(null, null, null, null, 0);
    }

    /**
     * Creates an entry with every field supplied.
     *
     * @param title   title text
     * @param href    link of the entry
     * @param content content text
     * @param imghref link of the entry's image
     * @param id      numeric id
     */
    public News(String title, String href, String content, String imghref, int id) {
        this.title = title;
        this.href = href;
        this.content = content;
        this.imghref = imghref;
        this.id = id;
    }

    /** @return the title text */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title text */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link of the entry */
    public String getHref() {
        return this.href;
    }

    /** @param href the new link of the entry */
    public void setHref(String href) {
        this.href = href;
    }

    /** @return the content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the new content text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the link of the entry's image */
    public String getImghref() {
        return this.imghref;
    }

    /** @param imghref the new image link */
    public void setImghref(String imghref) {
        this.imghref = imghref;
    }

    /** @return the numeric id */
    public int getId() {
        return this.id;
    }

    /** @param id the new numeric id */
    public void setId(int id) {
        this.id = id;
    }
}
爬取代码:
/**
 * Fetches the front page of ltaaa.com with Jsoup, collects the entries of the
 * {@code #ngro1} list and writes them to a local HTML file as a table.
 */
public class JsoupLong {

    /** Page that is fetched; relative links on it are resolved against it. */
    private static final String SITE_URL = "http://www.ltaaa.com";

    /** Id of the element whose {@code <li>} descendants are read as entries. */
    private static final String LIST_ELEMENT_ID = "ngro1";

    /** Path of the HTML file that is produced. */
    private static final String OUTPUT_PATH = "d://longteng.html";

    /**
     * Runs the scraper once.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        getNews();
    }

    /**
     * Downloads the page, extracts the entries and writes the HTML report.
     *
     * @return the extracted entries in page order, ids starting at 1; never
     *         {@code null} (the list is empty when no usable entry was found)
     * @throws Exception if the page cannot be fetched, the expected list
     *                   element is missing, or the report cannot be written
     */
    static ArrayList<News> getNews() throws Exception {
        Document doc = Jsoup.connect(SITE_URL).get();
        ArrayList<News> newsList = extractNews(doc);
        writeReport(newsList, new File(OUTPUT_PATH));
        return newsList;
    }

    /** Builds one News per {@code <li>} of the list element that has both an image and a link. */
    private static ArrayList<News> extractNews(Document doc) throws java.io.IOException {
        Element container = doc.getElementById(LIST_ELEMENT_ID);
        if (container == null) {
            // Fail with a descriptive error instead of a bare NullPointerException.
            throw new java.io.IOException("Element #" + LIST_ELEMENT_ID
                    + " not found on " + SITE_URL + "; the page layout may have changed");
        }

        ArrayList<News> newsList = new ArrayList<News>();
        int nextId = 1;
        for (Element item : container.getElementsByTag("li")) {
            Element img = item.select("img").first();
            Element link = item.select("a").first();
            if (img == null || link == null) {
                // Not a complete entry (e.g. a separator item); skip it.
                continue;
            }
            News news = new News();
            // absUrl resolves against the document's base URI, so absolute and
            // relative attribute values both yield a usable absolute URL.
            news.setImghref(img.absUrl("src"));
            news.setTitle(link.text());
            news.setHref(link.absUrl("href"));
            news.setId(nextId++);
            newsList.add(news);
        }
        return newsList;
    }

    /** Writes the entries to {@code file} as a UTF-8 encoded HTML table. */
    private static void writeReport(ArrayList<News> newsList, File file) throws java.io.IOException {
        // The page declares charset UTF-8, so the bytes must be UTF-8 regardless of
        // the platform default. Types are fully qualified so that no additional
        // imports are required. try-with-resources closes the file on any failure.
        try (java.io.Writer out = new java.io.OutputStreamWriter(
                new java.io.FileOutputStream(file), java.nio.charset.StandardCharsets.UTF_8)) {
            out.write("<html><head><meta charset='UTF-8'><title>by dqf</title></head><body><center>");
            out.write("<br><br><br><br><table>");
            out.write("<tr style='background-color: gray'>");
            out.write("<td align='center'>id</td>");
            out.write("<td align='center'>国籍</td>");
            out.write("<td align='center'>标题</td>");
            out.write("</tr>");
            for (News news : newsList) {
                out.write("<tr>");
                out.write("<td>" + news.getId() + "</td>");
                out.write("<td><img src='" + escapeHtml(news.getImghref()) + "'></td>");
                out.write("<td><a href='" + escapeHtml(news.getHref()) + "'>"
                        + escapeHtml(news.getTitle()) + "</a></td>");
                out.write("</tr>");
            }
            // Close tags in reverse order of opening: <center> was opened inside <body>.
            out.write("</table></center></body></html>");
        }
    }

    /** Escapes the characters that are significant in HTML text and quoted attribute values. */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}