Crawling COVID-19 Data with Jsoup and Storing It in a Database

Posted: 2023-12-09 17:00:31

  Required jar packages: jsoup (for the crawling), htmlunit-2.37.0-bin, and mysql.jar (for the database connection).

  Link: https://pan.baidu.com/s/1VlylWmlhjd8Ka8VTruQEnA    Extraction code: dxeq

  The site being crawled is: https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1

Paqu.java

package control;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import Dao.AddService;

public class Paqu {

    public static void main(String[] args) {
        String sheng = "";   // province name
        String xinzeng = ""; // newly confirmed cases
        String leiji = "";   // cumulative confirmed cases
        String zhiyu = "";   // cumulative cured
        String siwang = "";  // cumulative deaths
        String url = "https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1";
        try {
            // Build a WebClient that emulates the Chrome browser
            WebClient webClient = new WebClient(BrowserVersion.CHROME);
            // Enable JavaScript so the dynamically loaded data gets rendered
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setActiveXNative(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setTimeout(8000);
            HtmlPage rootPage = webClient.getPage(url);
            // Give the background JavaScript some time to finish
            webClient.waitForBackgroundJavaScript(6000);
            String html = rootPage.asXml();
            Document doc = Jsoup.parse(html);
            //System.out.println(doc);
            Element listdiv1 = doc.select(".wrap").first();
            Elements listdiv2 = listdiv1.select(".province");
            for (Element s : listdiv2) {
                Elements span = s.getElementsByTag("span");
                Elements real_name = span.select(".item_name");
                Elements real_newconfirm = span.select(".item_newconfirm");
                Elements real_confirm = span.select(".item_confirm");
                Elements real_dead = span.select(".item_dead");
                Elements real_heal = span.select(".item_heal");
                sheng = real_name.text();
                xinzeng = real_newconfirm.text();
                leiji = real_confirm.text();
                zhiyu = real_heal.text();
                siwang = real_dead.text();
                //System.out.println(sheng + " new confirmed: " + xinzeng + " total confirmed: " + leiji + " cured: " + zhiyu + " deaths: " + siwang);
                Date currentTime = new Date();
                SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm");
                String time = formatter.format(currentTime); // current timestamp
                AddService dao = new AddService();
                dao.add("myinfo", sheng, xinzeng, leiji, zhiyu, siwang, time); // write the crawled row to the database
            }
            webClient.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("Crawling failed");
        }
    }
}
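The selectors above (.wrap, .province, .item_name, .item_newconfirm, .item_confirm, .item_heal, .item_dead) assume a particular DOM structure on the rendered 163 page. To sanity-check the Jsoup part without going through HtmlUnit, the same selectors can be run against a small hand-written fragment; the fragment and the SelectorTest class below are only assumptions about that structure, inferred from the selectors used in Paqu.java, with dummy numbers.

package control;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorTest {

    public static void main(String[] args) {
        // Hypothetical fragment mimicking the structure the selectors in Paqu.java expect
        String html =
                  "<div class=\"wrap\">"
                + "  <div class=\"province\">"
                + "    <span class=\"item_name\">ProvinceA</span>"
                + "    <span class=\"item_newconfirm\">1</span>"
                + "    <span class=\"item_confirm\">100</span>"
                + "    <span class=\"item_heal\">90</span>"
                + "    <span class=\"item_dead\">2</span>"
                + "  </div>"
                + "</div>";
        Document doc = Jsoup.parse(html);
        for (Element province : doc.select(".wrap .province")) {
            System.out.println(province.select(".item_name").text()
                    + " new: " + province.select(".item_newconfirm").text()
                    + " total: " + province.select(".item_confirm").text()
                    + " cured: " + province.select(".item_heal").text()
                    + " dead: " + province.select(".item_dead").text());
        }
    }
}

If the real page structure changes, a test like this makes it easy to see which selector has stopped matching.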

  

  AddService.java:

package Dao;

import java.sql.Connection;
import java.sql.Statement;

import utils.DBUtils;

public class AddService {

    public void add(String table, String sheng, String xinzeng, String leiji, String zhiyu, String dead, String time) {
        String sql = "insert into " + table
                + " (Province,Newconfirmed_num,Confirmed_num,Cured_num,Dead_num,Time) values('"
                + sheng + "','" + xinzeng + "','" + leiji + "','" + zhiyu + "','" + dead + "','" + time + "')";
        System.out.println(sql);
        Connection conn = DBUtils.getConn();
        Statement state = null;
        try {
            state = conn.createStatement();
            state.executeUpdate(sql);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            DBUtils.close(state, conn);
        }
    }
}
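AddService relies on a utils.DBUtils helper that is not shown in the post. A minimal sketch of what such a class could look like is below; the JDBC URL, database name, username, password, and driver class are placeholders for whatever your local MySQL setup uses.

package utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class DBUtils {

    // Placeholder connection settings; adjust them to your own MySQL instance
    private static final String URL = "jdbc:mysql://localhost:3306/yiqing?useUnicode=true&characterEncoding=utf8&serverTimezone=UTC";
    private static final String USER = "root";
    private static final String PASSWORD = "123456";

    public static Connection getConn() {
        try {
            Class.forName("com.mysql.cj.jdbc.Driver"); // older mysql.jar versions use com.mysql.jdbc.Driver instead
            return DriverManager.getConnection(URL, USER, PASSWORD);
        } catch (Exception e) {
            throw new RuntimeException("Failed to open the database connection", e);
        }
    }

    public static void close(Statement state, Connection conn) {
        try {
            if (state != null) state.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (conn != null) conn.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Note that AddService builds its SQL by string concatenation; since the inserted values come from a scraped page, a PreparedStatement with ? placeholders would be the safer and more idiomatic choice.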

The database table is created as follows:

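The original post shows the table structure only as a screenshot; as a substitute, the sketch below creates an equivalent myinfo table through JDBC, reusing the hypothetical DBUtils from above. The column names are taken from the INSERT statement in AddService, while the column types and lengths are assumptions, using VARCHAR since every value is inserted as a string.

package utils;

import java.sql.Connection;
import java.sql.Statement;

public class CreateTable {

    public static void main(String[] args) throws Exception {
        // Column names match the INSERT in AddService; the types are assumptions
        String ddl = "CREATE TABLE IF NOT EXISTS myinfo ("
                + "Province VARCHAR(50),"
                + "Newconfirmed_num VARCHAR(20),"
                + "Confirmed_num VARCHAR(20),"
                + "Cured_num VARCHAR(20),"
                + "Dead_num VARCHAR(20),"
                + "Time VARCHAR(30)"
                + ")";
        Connection conn = DBUtils.getConn();
        try (Statement state = conn.createStatement()) {
            state.execute(ddl);
        } finally {
            conn.close();
        }
    }
}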

 Problems encountered

  At first the data could not be captured because it is loaded dynamically and is not present in the static HTML. This was solved by having the code execute the page's JavaScript (the HtmlUnit WebClient shown above) so that the dynamically loaded data becomes available for parsing.

  I also tried crawling data from some other websites, but the parsed doc did not come out well: only the page's outer skeleton was printed, and the actual content could not be retrieved.
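A quick way to tell whether a site can be scraped with plain Jsoup (the data is already in the server's HTML) or needs a JavaScript-capable client such as HtmlUnit is to fetch the page both ways and compare what the same selector finds. The class below is a rough sketch of that check, reusing the 163 URL from above; the class name is made up for illustration.

package control;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class StaticOrDynamicCheck {

    public static void main(String[] args) throws Exception {
        String url = "https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1";

        // 1. Plain Jsoup: only sees the HTML the server returns, no JavaScript is executed
        Document staticDoc = Jsoup.connect(url).get();
        System.out.println("Static .province elements: " + staticDoc.select(".province").size());

        // 2. HtmlUnit: runs the page's JavaScript, so dynamically inserted nodes show up
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            HtmlPage page = webClient.getPage(url);
            webClient.waitForBackgroundJavaScript(6000);
            Document renderedDoc = Jsoup.parse(page.asXml());
            System.out.println("Rendered .province elements: " + renderedDoc.select(".province").size());
        }
    }
}

If the first count is 0 and the second is not, the data is loaded dynamically and the HtmlUnit approach used in Paqu.java is needed; if even the rendered count is 0, the selectors themselves no longer match the page.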