记得还是在学校的时候听说过网络爬虫的,最近闲的蛋疼,想到爬虫这个稀奇的玩意儿感觉挺好玩的,所以就动手做了个
在起初的爬取中用的是HttpClient进行爬取的,发现越用越麻烦,代码过于繁琐而且解析html太麻烦,抱着应该有更好的框架可以用的心态,上网搜到了Jsoup这个神奇的框架。
项目中用到的包结构
项目使用Jsoup进行网络的链接与网页的解析,使用dbutils进行dao操作,使用c3p0进行链接的管理
源代码下载地址:http://download.csdn.net/detail/chen1chen2chen3/9598202点击打开链接
爬虫程序的入口:
package com.crawlercity.main;
import org.jsoup.nodes.Document;
import com.crawlercity.util.HttpUtils;
import com.crawlercity.util.JsoupUtils;

/**
 * Crawler entry point: installs a random HTTP proxy, fetches the seed page of
 * the 2015 administrative-division codes, and hands the parsed document to the
 * analyzer, which walks the whole division hierarchy.
 */
public class Main {

    /** Seed page: index of the 2015 national administrative-division codes. */
    private static final String SEED_URL =
            "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";

    public static void main(String[] args) {
        // Pick a random proxy before the first request goes out.
        HttpUtils.setProxyIp();
        // getDocument retries with fresh proxies until it has a usable page.
        Document seed = JsoupUtils.getDocument(SEED_URL);
        JsoupUtils.analysisDocument(seed);
    }
}
用于动态ip代理的工具类HttpUtils
package com.crawlercity.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Installs a random proxy from the bundled {@code /proxyip.txt} classpath
 * resource (one {@code ip:port} per line) as the JVM-wide HTTP proxy.
 *
 * Fixes over the original: the reader is closed (try-with-resources), the
 * charset is pinned to UTF-8 instead of the platform default, and the retry
 * on failure is bounded — the original recursed forever (until
 * StackOverflowError) when the resource was missing or unreadable.
 */
public class HttpUtils {

    /** Upper bound on retries when reading/parsing the proxy list fails. */
    private static final int MAX_RETRIES = 5;

    private HttpUtils() {
        // Utility class; no instances.
    }

    /**
     * Picks a random {@code ip:port} line from {@code /proxyip.txt} and sets
     * {@code http.proxyHost}/{@code http.proxyPort} system properties.
     * On repeated failure it logs the error and leaves the proxy unchanged.
     */
    public static void setProxyIp() {
        setProxyIp(MAX_RETRIES);
    }

    private static void setProxyIp(int retriesLeft) {
        try {
            List<String> ipList = readProxyList();
            if (ipList.isEmpty()) {
                // Nothing to choose from; keep the current proxy settings.
                return;
            }
            Random random = new Random();
            String ipport = ipList.get(random.nextInt(ipList.size()));
            // Split on the LAST colon so an entry is "host:port".
            int sep = ipport.lastIndexOf(":");
            String proxyIp = ipport.substring(0, sep);
            String proxyPort = ipport.substring(sep + 1);
            System.setProperty("http.maxRedirects", "50");
            System.getProperties().setProperty("proxySet", "true");
            System.getProperties().setProperty("http.proxyHost", proxyIp);
            System.getProperties().setProperty("http.proxyPort", proxyPort);
            System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
        } catch (Exception e) {
            if (retriesLeft > 0) {
                System.out.println("重新设置代理ip");
                setProxyIp(retriesLeft - 1);
            } else {
                // Bounded retries exhausted: report and give up instead of
                // recursing until the stack blows (original behavior).
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads all non-blank lines of {@code /proxyip.txt} from the classpath.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    private static List<String> readProxyList() throws IOException {
        InputStream in = HttpUtils.class.getResourceAsStream("/proxyip.txt");
        if (in == null) {
            throw new IOException("classpath resource /proxyip.txt not found");
        }
        List<String> ipList = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (!trimmed.isEmpty()) {
                    ipList.add(trimmed);
                }
            }
        }
        return ipList;
    }
}
用于获取document对象的工具类JsoupUtils
/**
 * Fetches and parses the page at {@code url}, rotating to a fresh proxy and
 * retrying whenever the fetch fails or returns a blank page (the site's
 * typical response when a proxy IP has been blocked).
 *
 * Bug fixes over the original: the recursive retry results were computed and
 * then DISCARDED (the blank original document, or a second redundant fetch,
 * was returned instead), and {@code timeout(70)} is 70 milliseconds — which
 * times out virtually every request; 7000 ms is assumed to be the intended
 * value (NOTE(review): confirm the desired timeout).
 *
 * @param url absolute page URL to fetch
 * @return a non-blank parsed document (retries indefinitely until one arrives)
 */
public static Document getDocument(String url) {
    try {
        Document document = Jsoup.connect(url).timeout(7000).get();
        if (document == null || document.toString().trim().equals("")) {
            // Blank page: proxy likely blocked — rotate and actually
            // return the retried result (the original dropped it).
            System.out.println("出现ip被拦截或者其他情况");
            HttpUtils.setProxyIp();
            return getDocument(url);
        }
        return document;
    } catch (Exception e) {
        // Connect/read timeout or similar: rotate the proxy and retry once
        // here instead of fetching twice as the original did.
        System.out.println("出现链接超时等其他情况");
        HttpUtils.setProxyIp();
        return getDocument(url);
    }
}
用于解析html文档的工具类 JsoupUtils
/**
 * Depth-first walk of the 2015 national administrative-division pages:
 * province (type 1) -> city (2) -> county (3) -> town (4) -> village (5).
 * Each node is inserted via DBUtils.insertCityInfo and the generated key is
 * used as the parentId of its children.
 *
 * NOTE(review): the method's closing brace is not visible in this snippet;
 * the catch block below is the last thing shown.
 *
 * @param document the parsed seed (province index) page
 */
public static void analysisDocument(Document document) {
    try {
        String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/";
        // One reusable bean per hierarchy level. Fields are overwritten on
        // every iteration, so each bean must be fully populated before the
        // corresponding insert.
        CityInfo cityInfo1 = new CityInfo();
        CityInfo cityInfo2 = new CityInfo();
        CityInfo cityInfo3 = new CityInfo();
        CityInfo cityInfo4 = new CityInfo();
        CityInfo cityInfo5 = new CityInfo();
        // Level 1: provinces — rows whose class attribute is "provincetr".
        Elements elements1 = document.getElementsByAttributeValue("class", "provincetr");
        for(Element element1 : elements1) {
            Elements elements2 = element1.getElementsByTag("a");
            for(Element element2 : elements2) {
                cityInfo1.setName(element2.text());
                cityInfo1.setParentId(0); // provinces are hierarchy roots
                cityInfo1.setType(1);
                // Province links are relative to the 2015 index directory.
                cityInfo1.setUrl(baseUrl + element2.attr("href"));
                int key1 = DBUtils.insertCityInfo(cityInfo1);
                Document document2 = getDocument(cityInfo1.getUrl());
                // Level 2: cities — rows with class "citytr".
                Elements elements3 = document2.getElementsByAttributeValue("class", "citytr");
                for(Element element3 : elements3) {
                    Elements elements4 = element3.getElementsByTag("a");
                    if(elements4.toString().trim().equals("")) {
                        // Row without <a>: a leaf entry with no child page;
                        // code/name come from plain <td> cells and no URL
                        // exists. NOTE(review): this branch populates the
                        // bean but continues WITHOUT inserting it — looks
                        // like a bug; confirm intent.
                        Elements diffElements = element3.getElementsByTag("td");
                        cityInfo2.setCode(diffElements.get(0).text());
                        cityInfo2.setName(diffElements.get(1).text());
                        cityInfo2.setParentId(key1);
                        cityInfo2.setType(2);
                        continue;
                    }
                    // td[0] holds the division code, td[1] the name + link.
                    cityInfo2.setCode(elements4.get(0).text());
                    cityInfo2.setName(elements4.get(1).text());
                    cityInfo2.setUrl(baseUrl + elements4.get(1).attr("href"));
                    cityInfo2.setParentId(key1);
                    cityInfo2.setType(2);
                    int key2 = DBUtils.insertCityInfo(cityInfo2);
                    Document document3 = getDocument(cityInfo2.getUrl());
                    // Level 3: counties — rows with class "countytr".
                    Elements elements5 = document3.getElementsByAttributeValue("class", "countytr");
                    for(Element element5 : elements5) {
                        Elements elements6 = element5.getElementsByTag("a");
                        if(elements6.toString().trim().equals("")) {
                            // Leaf county row (no child page); see the level-2
                            // note — also populated but never inserted.
                            Elements diffElements = element5.getElementsByTag("td");
                            cityInfo3.setCode(diffElements.get(0).text());
                            cityInfo3.setName(diffElements.get(1).text());
                            cityInfo3.setParentId(key2);
                            cityInfo3.setType(3);
                            continue;
                        }
                        cityInfo3.setCode(elements6.get(0).text());
                        cityInfo3.setName(elements6.get(1).text());
                        // From level 3 down, hrefs are relative to the PARENT
                        // page's directory, not to baseUrl.
                        String cityInfo2Url = cityInfo2.getUrl();
                        cityInfo3.setUrl(cityInfo2Url.substring(0, cityInfo2Url.lastIndexOf("/") + 1) + elements6.get(1).attr("href"));
                        cityInfo3.setParentId(key2);
                        cityInfo3.setType(3);
                        int key3 = DBUtils.insertCityInfo(cityInfo3);
                        Document document4 = getDocument(cityInfo3.getUrl());
                        // Level 4: towns — rows with class "towntr".
                        Elements elements7 = document4.getElementsByAttributeValue("class", "towntr");
                        for(Element element7 : elements7) {
                            Elements elements8 = element7.getElementsByTag("a");
                            System.out.println(elements8.toString());
                            if(elements8.toString().trim().equals("")) {
                                // Row has no <a> tag: leaf town entry; again
                                // populated without an insert.
                                Elements diffElements = element7.getElementsByTag("td");
                                cityInfo4.setCode(diffElements.get(0).text());
                                cityInfo4.setName(diffElements.get(1).text());
                                cityInfo4.setParentId(key3);
                                cityInfo4.setType(4);
                                continue;
                            }
                            cityInfo4.setCode(elements8.get(0).text());
                            cityInfo4.setName(elements8.get(1).text());
                            String cityInfo3Url = cityInfo3.getUrl();
                            cityInfo4.setUrl(cityInfo3Url.substring(0, cityInfo3Url.lastIndexOf("/") + 1) + elements8.get(1).attr("href"));
                            cityInfo4.setParentId(key3);
                            cityInfo4.setType(4);
                            int key4 = DBUtils.insertCityInfo(cityInfo4);
                            Document document5 = getDocument(cityInfo4.getUrl());
                            // Level 5: villages — rows with class "villagetr".
                            // Village rows have no links; td[0] is the code,
                            // td[2] the name (td[1] is the urban/rural flag).
                            Elements elements9 = document5.getElementsByAttributeValue("class", "villagetr");
                            for(Element element8 : elements9) {
                                Elements elements10 = element8.getElementsByTag("td");
                                cityInfo5.setCode(elements10.get(0).text());
                                cityInfo5.setName(elements10.get(2).text());
                                cityInfo5.setParentId(key4);
                                cityInfo5.setType(5);
                                DBUtils.insertCityInfo(cityInfo5);
                            }
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // Any parse/DB failure aborts the whole crawl for this document.
        e.printStackTrace();
    }
用于DAO操作的工具类DBUtils
package com.crawlercity.util;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import javax.sql.DataSource;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ScalarHandler;
import com.crawlercity.model.CityInfo;
import com.mchange.v2.c3p0.ComboPooledDataSource;
/**
 * DAO helper built on a c3p0 connection pool and Commons DbUtils.
 *
 * Fixes over the original: {@link #getDataSource()} now caches the pool it
 * creates (the original built a brand-new pool on every call while {@code ds}
 * was null and threw it away), and {@link #insertCityInfo(CityInfo)} releases
 * its connection in a {@code finally} block (the original leaked the
 * connection whenever the insert threw).
 */
public class DBUtils {

    /** Lazily created shared pool; c3p0 reads its config from the classpath. */
    private static DataSource ds = null;

    private DBUtils() {
        // Utility class; no instances.
    }

    /**
     * Returns a pooled connection, creating the pool on first use.
     *
     * @return a connection, or {@code null} if the pool cannot supply one
     */
    public static Connection getConnection() {
        if(ds == null) {
            ds = new ComboPooledDataSource();
        }
        try {
            return ds.getConnection();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Returns the shared pool, creating and caching it on first use.
     */
    public static DataSource getDataSource() {
        if(ds == null) {
            ds = new ComboPooledDataSource(); // cache it — original leaked a new pool per call
        }
        return ds;
    }

    /**
     * Closes the given JDBC resources in dependency order (result set,
     * statement, connection); close failures are logged, not rethrown.
     */
    public static void releaseSource(Connection conn, Statement st, ResultSet rs) {
        try {
            if(rs != null && !rs.isClosed()) {
                rs.close();
            }
            if(st != null && !st.isClosed()) {
                st.close();
            }
            if(conn != null && !conn.isClosed()) {
                conn.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one cityinfo row and returns the auto-generated primary key.
     * The first column is the AUTO_INCREMENT id, hence the leading null bind.
     *
     * @param cityInfo row to insert (parentId, type, name, code, url)
     * @return the generated key, or 0 if the insert failed
     */
    public static int insertCityInfo(CityInfo cityInfo) {
        Connection connection = DBUtils.getConnection();
        QueryRunner qr = new QueryRunner();
        String sql1 = "insert into cityinfo values (?,?,?,?,?,?)";
        // LAST_INSERT_ID() is per-connection, so it must run on the same
        // connection as the insert above.
        String sql2 = "SELECT LAST_INSERT_ID()";
        try {
            qr.update(connection, sql1, null, cityInfo.getParentId(), cityInfo.getType(),
                    cityInfo.getName(), cityInfo.getCode(), cityInfo.getUrl());
            return Integer.parseInt(qr.query(connection, sql2, new ScalarHandler<>()).toString());
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Always return the connection to the pool, even on failure.
            releaseSource(connection, null, null);
        }
        return 0;
    }
}
写代码的过程中出现了一些问题如: Jsoup如何在设置编码的同时设置连接超时,如何在超时或者动态ip代理无效的时候重新获取动态ip代理,如何在解析html失败后继续解析等。
通过这次编程发现自己在Java网络编程方面的知识还是有待提高,以后继续努力!