记得还是在学校的时候听说过网络爬虫的,最近闲的蛋疼,想到爬虫这个稀奇的玩意儿感觉挺好玩的,所以就动手做了个
在起初的爬取中用的是HttpClient进行爬取的,发现越用越麻烦,代码过于繁琐而且解析html太麻烦,抱着应该有更好的框架可以用的心态,上网搜到了Jsoup这个神器。
项目中用到的包结构
项目使用Jsoup进行网络的链接与网页的解析,使用dbutils进行dao操作,使用c3p0进行链接的管理
源代码下载地址:http://download.csdn.net/detail/chen1chen2chen3/9598202 (点击打开链接)
爬虫程序的入口:
package com.crawlercity.main; import org.jsoup.nodes.Document; import com.crawlercity.util.HttpUtils; import com.crawlercity.util.JsoupUtils; public class Main { public static void main(String[] args) { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";// 初始解析网页地址 // 设置代理ip HttpUtils.setProxyIp(); Document document = JsoupUtils.getDocument(url);// 得到的document一定是正常 的document JsoupUtils.analysisDocument(document); } }
用于动态ip代理的工具类HttpUtils
package com.crawlercity.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Points the JVM-wide HTTP proxy settings at a random proxy taken from the
 * classpath resource {@code /proxyip.txt} (one "host:port" entry per line).
 *
 * Fixes over the original version:
 * - the BufferedReader is now closed (try-with-resources; it used to leak),
 * - failure no longer triggers unbounded recursion (StackOverflowError risk),
 * - an empty proxy list is treated as a failure instead of crashing in
 *   {@code Random.nextInt(0)}.
 */
public class HttpUtils {

    /** Bounded retry count replacing the original's infinite self-recursion. */
    private static final int MAX_ATTEMPTS = 10;

    /**
     * 设置代理ip — selects a random "host:port" line from /proxyip.txt and
     * installs it via the http.proxyHost / http.proxyPort system properties.
     * Gives up silently after MAX_ATTEMPTS failed attempts.
     */
    public static void setProxyIp() {
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                List<String> ipList = readProxyList();
                if (ipList.isEmpty()) {
                    throw new IOException("proxyip.txt is empty");
                }
                String ipport = ipList.get(new Random().nextInt(ipList.size()));
                // Split on the LAST colon so an entry is parsed as host + port.
                int sep = ipport.lastIndexOf(":");
                String proxyIp = ipport.substring(0, sep);
                String proxyPort = ipport.substring(sep + 1);
                System.setProperty("http.maxRedirects", "50");
                System.getProperties().setProperty("proxySet", "true");
                System.getProperties().setProperty("http.proxyHost", proxyIp);
                System.getProperties().setProperty("http.proxyPort", proxyPort);
                System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
                return;
            } catch (Exception e) {
                // Malformed line, missing resource, etc. — log and try again.
                System.out.println("重新设置代理ip");
            }
        }
    }

    /**
     * Reads every line of /proxyip.txt; the reader is always closed.
     *
     * @return all lines of the resource, possibly empty
     * @throws IOException if the resource cannot be read
     */
    private static List<String> readProxyList() throws IOException {
        List<String> ipList = new ArrayList<>();
        try (BufferedReader proxyIpReader = new BufferedReader(
                new InputStreamReader(HttpUtils.class.getResourceAsStream("/proxyip.txt")))) {
            String ip;
            while ((ip = proxyIpReader.readLine()) != null) {
                ipList.add(ip);
            }
        }
        return ipList;
    }
}
用于获取document对象的工具类JsoupUtils
public static Document getDocument(String url) { try { Document document = Jsoup.connect(url).timeout(70).get(); if(document == null || document.toString().trim().equals("")) {// 表示ip被拦截或者其他情况 System.out.println("出现ip被拦截或者其他情况"); HttpUtils.setProxyIp(); getDocument(url); } return document; } catch (Exception e) { // 链接超时等其他情况 System.out.println("出现链接超时等其他情况"); HttpUtils.setProxyIp();// 换代理ip getDocument(url);// 继续爬取网页 } return getDocument(url); }
用于解析html文档的工具类 JsoupUtils
public static void analysisDocument(Document document) { try { String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/"; CityInfo cityInfo1 = new CityInfo(); CityInfo cityInfo2 = new CityInfo(); CityInfo cityInfo3 = new CityInfo(); CityInfo cityInfo4 = new CityInfo(); CityInfo cityInfo5 = new CityInfo(); // 将类型为1(省)的数据分析并且插入 Elements elements1 = document.getElementsByAttributeValue("class", "provincetr"); for(Element element1 : elements1) { Elements elements2 = element1.getElementsByTag("a"); for(Element element2 : elements2) { cityInfo1.setName(element2.text()); cityInfo1.setParentId(0); cityInfo1.setType(1); cityInfo1.setUrl(baseUrl + element2.attr("href")); // System.out.println("cityInfo1" + cityInfo1.toString()); int key1 = DBUtils.insertCityInfo(cityInfo1); Document document2 = getDocument(cityInfo1.getUrl()); Elements elements3 = document2.getElementsByAttributeValue("class", "citytr"); for(Element element3 : elements3) { Elements elements4 = element3.getElementsByTag("a"); if(elements4.toString().trim().equals("")) { Elements diffElements = element3.getElementsByTag("td"); cityInfo2.setCode(diffElements.get(0).text()); cityInfo2.setName(diffElements.get(1).text()); cityInfo2.setParentId(key1); cityInfo2.setType(2); continue; } cityInfo2.setCode(elements4.get(0).text()); cityInfo2.setName(elements4.get(1).text()); cityInfo2.setUrl(baseUrl + elements4.get(1).attr("href")); cityInfo2.setParentId(key1); cityInfo2.setType(2); /*System.out.println("cityInfo2" + cityInfo2.toString());*/ int key2 = DBUtils.insertCityInfo(cityInfo2); Document document3 = getDocument(cityInfo2.getUrl()); Elements elements5 = document3.getElementsByAttributeValue("class", "countytr"); for(Element element5 : elements5) { Elements elements6 = element5.getElementsByTag("a"); if(elements6.toString().trim().equals("")) { Elements diffElements = element5.getElementsByTag("td"); cityInfo3.setCode(diffElements.get(0).text()); cityInfo3.setName(diffElements.get(1).text()); 
cityInfo3.setParentId(key2); cityInfo3.setType(3); continue; } cityInfo3.setCode(elements6.get(0).text()); cityInfo3.setName(elements6.get(1).text()); String cityInfo2Url = cityInfo2.getUrl(); cityInfo3.setUrl(cityInfo2Url.substring(0, cityInfo2Url.lastIndexOf("/") + 1) + elements6.get(1).attr("href")); cityInfo3.setParentId(key2); cityInfo3.setType(3); /* System.out.println("cityInfo3" + cityInfo3.toString());*/ int key3 = DBUtils.insertCityInfo(cityInfo3); Document document4 = getDocument(cityInfo3.getUrl()); Elements elements7 = document4.getElementsByAttributeValue("class", "towntr"); for(Element element7 : elements7) { Elements elements8 = element7.getElementsByTag("a"); System.out.println(elements8.toString()); if(elements8.toString().trim().equals("")) {// 表示没有a标签 Elements diffElements = element7.getElementsByTag("td"); cityInfo4.setCode(diffElements.get(0).text()); cityInfo4.setName(diffElements.get(1).text()); cityInfo4.setParentId(key3); cityInfo4.setType(4); continue; } cityInfo4.setCode(elements8.get(0).text()); cityInfo4.setName(elements8.get(1).text()); String cityInfo3Url = cityInfo3.getUrl(); cityInfo4.setUrl(cityInfo3Url.substring(0, cityInfo3Url.lastIndexOf("/") + 1) + elements8.get(1).attr("href")); cityInfo4.setParentId(key3); cityInfo4.setType(4); // System.out.println("cityInfo4" + cityInfo4.toString()); int key4 = DBUtils.insertCityInfo(cityInfo4); Document document5 = getDocument(cityInfo4.getUrl()); Elements elements9 = document5.getElementsByAttributeValue("class", "villagetr"); for(Element element8 : elements9) { Elements elements10 = element8.getElementsByTag("td"); cityInfo5.setCode(elements10.get(0).text()); cityInfo5.setName(elements10.get(2).text()); cityInfo5.setParentId(key4); cityInfo5.setType(5); /*System.out.println("cityInfo5" + cityInfo5.toString());*/ DBUtils.insertCityInfo(cityInfo5); } } } } } } } catch (Exception e) { e.printStackTrace(); }
用于dao操作的工具类DbUtils
package com.crawlercity.util; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import javax.sql.DataSource; import org.apache.commons.dbutils.QueryRunner; import org.apache.commons.dbutils.handlers.ScalarHandler; import com.crawlercity.model.CityInfo; import com.mchange.v2.c3p0.ComboPooledDataSource; public class DBUtils { private static DataSource ds = null; public static Connection getConnection() { if(ds == null) { ds = new ComboPooledDataSource(); } try { return ds.getConnection(); } catch (SQLException e) { e.printStackTrace(); } return null; } public static DataSource getDataSource() { return ds == null ? new ComboPooledDataSource() : ds; } public static void releaseSource(Connection conn, Statement st, ResultSet rs) { try { if(rs != null && !rs.isClosed()) { rs.close(); } if(st != null && !st.isClosed()) { st.close(); } if(conn != null && !conn.isClosed()) { conn.close(); } } catch (Exception e) { e.printStackTrace(); } } public static int insertCityInfo(CityInfo cityInfo) { Connection connection = DBUtils.getConnection(); QueryRunner qr = new QueryRunner(); String sql1 = "insert into cityinfo values (?,?,?,?,?,?)"; // 返回主键 String sql2 = "SELECT LAST_INSERT_ID()"; try { int result = qr.update(connection, sql1, null, cityInfo.getParentId(), cityInfo.getType(), cityInfo.getName(), cityInfo.getCode(), cityInfo.getUrl()); int key = Integer.parseInt(qr.query(connection, sql2, new ScalarHandler<>()).toString()); releaseSource(connection, null, null); return key; } catch (SQLException e) { e.printStackTrace(); } return 0; } }
写代码的过程中出现了一些问题如: Jsoup如何在设置编码的同时设置连接超时,如何在超时或者动态ip代理无效的时候重新获取动态ip代理,如何在解析html失败后继续解析等。
通过这次编程发现自己在Java网络方面的知识还是有待提高,以后继续努力!