记得还是在学校的时候听说过网络爬虫的,最近闲的蛋疼,想到爬虫这个稀奇的玩意儿感觉挺好玩的,所以就动手做了个
在起初的爬取中用的是HttpClient进行爬取的,发现越用越麻烦,代码过于繁琐而且解析html太麻烦,抱着应该有更好的框架可以用的心态,上网搜到了Jsoup这个神器。
项目中用到的包结构
项目使用Jsoup进行网络的链接与网页的解析,使用dbutils进行dao操作,使用c3p0进行链接的管理
源代码下载地址:http://download.csdn.net/detail/chen1chen2chen3/9598202 (点击打开链接)
爬虫程序的入口:
package com.crawlercity.main; import org.jsoup.nodes.Document; import com.crawlercity.util.HttpUtils; import com.crawlercity.util.JsoupUtils; public class Main { public static void main(String[] args) { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";// 初始解析网页地址 // 设置代理ip HttpUtils.setProxyIp(); Document document = JsoupUtils.getDocument(url);// 得到的document一定是正常 的document JsoupUtils.analysisDocument(document); } }
用于动态ip代理的工具类HttpUtils
package com.crawlercity.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Points the JVM-wide HTTP proxy settings at a random proxy taken from the
 * classpath resource {@code /proxyip.txt} (one "host:port" entry per line).
 *
 * Fixes over the original version:
 * - the BufferedReader is now closed (try-with-resources; it used to leak),
 * - failure no longer triggers unbounded recursion (StackOverflowError risk),
 * - an empty proxy list is treated as a failure instead of crashing in
 *   {@code Random.nextInt(0)}.
 */
public class HttpUtils {

    /** Bounded retry count replacing the original's infinite self-recursion. */
    private static final int MAX_ATTEMPTS = 10;

    /**
     * 设置代理ip — selects a random "host:port" line from /proxyip.txt and
     * installs it via the http.proxyHost / http.proxyPort system properties.
     * Gives up silently after MAX_ATTEMPTS failed attempts.
     */
    public static void setProxyIp() {
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                List<String> ipList = readProxyList();
                if (ipList.isEmpty()) {
                    throw new IOException("proxyip.txt is empty");
                }
                String ipport = ipList.get(new Random().nextInt(ipList.size()));
                // Split on the LAST colon so an entry is parsed as host + port.
                int sep = ipport.lastIndexOf(":");
                String proxyIp = ipport.substring(0, sep);
                String proxyPort = ipport.substring(sep + 1);
                System.setProperty("http.maxRedirects", "50");
                System.getProperties().setProperty("proxySet", "true");
                System.getProperties().setProperty("http.proxyHost", proxyIp);
                System.getProperties().setProperty("http.proxyPort", proxyPort);
                System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
                return;
            } catch (Exception e) {
                // Malformed line, missing resource, etc. — log and try again.
                System.out.println("重新设置代理ip");
            }
        }
    }

    /**
     * Reads every line of /proxyip.txt; the reader is always closed.
     *
     * @return all lines of the resource, possibly empty
     * @throws IOException if the resource cannot be read
     */
    private static List<String> readProxyList() throws IOException {
        List<String> ipList = new ArrayList<>();
        try (BufferedReader proxyIpReader = new BufferedReader(
                new InputStreamReader(HttpUtils.class.getResourceAsStream("/proxyip.txt")))) {
            String ip;
            while ((ip = proxyIpReader.readLine()) != null) {
                ipList.add(ip);
            }
        }
        return ipList;
    }
}
用于获取document对象的工具类JsoupUtils
public static Document getDocument(String url) { try { Document document = Jsoup.connect(url).timeout(70).get(); if(document == null || document.toString().trim().equals("")) {// 表示ip被拦截或者其他情况 System.out.println("出现ip被拦截或者其他情况"); HttpUtils.setProxyIp(); getDocument(url); } return document; } catch (Exception e) { // 链接超时等其他情况 System.out.println("出现链接超时等其他情况"); HttpUtils.setProxyIp();// 换代理ip getDocument(url);// 继续爬取网页 } return getDocument(url); }
用于解析html文档的工具类 JsoupUtils
public static void analysisDocument(Document document) { try { String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/"; CityInfo cityInfo1 = new CityInfo(); CityInfo cityInfo2 = new CityInfo(); CityInfo cityInfo3 = new CityInfo(); CityInfo cityInfo4 = new CityInfo(); CityInfo cityInfo5 = new CityInfo(); // 将类型为1(省)的数据分析并且插入 Elements elements1 = document.getElementsByAttributeValue("class", "provincetr"); for(Element element1 : elements1) { Elements elements2 = element1.getElementsByTag("a"); for(Element element2 : elements2) { cityInfo1.setName(element2.text()); cityInfo1.setParentId(0); cityInfo1.setType(1); cityInfo1.setUrl(baseUrl + element2.attr("href")); // System.out.println("cityInfo1" + cityInfo1.toString()); int key1 = DBUtils.insertCityInfo(cityInfo1); Document document2 = getDocument(cityInfo1.getUrl()); Elements elements3 = document2.getElementsByAttributeValue("class", "citytr"); for(Element element3 : elements3) { Elements elements4 = element3.getElementsByTag("a"); if(elements4.toString().trim().equals("")) { Elements diffElements = element3.getElementsByTag("td"); cityInfo2.setCode(diffElements.get(0).text()); cityInfo2.setName(diffElements.get(1).text()); cityInfo2.setParentId(key1); cityInfo2.setType(2); continue; } cityInfo2.setCode(elements4.get(0).text()); cityInfo2.setName(elements4.get(1).text()); cityInfo2.setUrl(baseUrl + elements4.get(1).attr("href")); cityInfo2.setParentId(key1); cityInfo2.setType(2); /*System.out.println("cityInfo2" + cityInfo2.toString());*/ int key2 = DBUtils.insertCityInfo(cityInfo2); Document document3 = getDocument(cityInfo2.getUrl()); Elements elements5 = document3.getElementsByAttributeValue("class", "countytr"); for(Element element5 : elements5) { Elements elements6 = element5.getElementsByTag("a"); if(elements6.toString().trim().equals("")) { Elements diffElements = element5.getElementsByTag("td"); cityInfo3.setCode(diffElements.get(0).text()); cityInfo3.setName(diffElements.get(1).text()); 
cityInfo3.setParentId(key2); cityInfo3.setType(3); continue; } cityInfo3.setCode(elements6.get(0).text()); cityInfo3.setName(elements6.get(1).text()); String cityInfo2Url = cityInfo2.getUrl(); cityInfo3.setUrl(cityInfo2Url.substring(0, cityInfo2Url.lastIndexOf("/") + 1) + elements6.get(1).attr("href")); cityInfo3.setParentId(key2); cityInfo3.setType(3); /* System.out.println("cityInfo3" + cityInfo3.toString());*/ int key3 = DBUtils.insertCityInfo(cityInfo3); Document document4 = getDocument(cityInfo3.getUrl()); Elements elements7 = document4.getElementsByAttributeValue("class", "towntr"); for(Element element7 : elements7) { Elements elements8 = element7.getElementsByTag("a"); System.out.println(elements8.toString()); if(elements8.toString().trim().equals("")) {// 表示没有a标签 Elements diffElements = element7.getElementsByTag("td"); cityInfo4.setCode(diffElements.get(0).text()); cityInfo4.setName(diffElements.get(1).text()); cityInfo4.setParentId(key3); cityInfo4.setType(4); continue; } cityInfo4.setCode(elements8.get(0).text()); cityInfo4.setName(elements8.get(1).text()); String cityInfo3Url = cityInfo3.getUrl(); cityInfo4.setUrl(cityInfo3Url.substring(0, cityInfo3Url.lastIndexOf("/") + 1) + elements8.get(1).attr("href")); cityInfo4.setParentId(key3); cityInfo4.setType(4); // System.out.println("cityInfo4" + cityInfo4.toString()); int key4 = DBUtils.insertCityInfo(cityInfo4); Document document5 = getDocument(cityInfo4.getUrl()); Elements elements9 = document5.getElementsByAttributeValue("class", "villagetr"); for(Element element8 : elements9) { Elements elements10 = element8.getElementsByTag("td"); cityInfo5.setCode(elements10.get(0).text()); cityInfo5.setName(elements10.get(2).text()); cityInfo5.setParentId(key4); cityInfo5.setType(5); /*System.out.println("cityInfo5" + cityInfo5.toString());*/ DBUtils.insertCityInfo(cityInfo5); } } } } } } } catch (Exception e) { e.printStackTrace(); }
用于dao操作的工具类DbUtils
package com.crawlercity.util; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import javax.sql.DataSource; import org.apache.commons.dbutils.QueryRunner; import org.apache.commons.dbutils.handlers.ScalarHandler; import com.crawlercity.model.CityInfo; import com.mchange.v2.c3p0.ComboPooledDataSource; public class DBUtils { private static DataSource ds = null; public static Connection getConnection() { if(ds == null) { ds = new ComboPooledDataSource(); } try { return ds.getConnection(); } catch (SQLException e) { e.printStackTrace(); } return null; } public static DataSource getDataSource() { return ds == null ? new ComboPooledDataSource() : ds; } public static void releaseSource(Connection conn, Statement st, ResultSet rs) { try { if(rs != null && !rs.isClosed()) { rs.close(); } if(st != null && !st.isClosed()) { st.close(); } if(conn != null && !conn.isClosed()) { conn.close(); } } catch (Exception e) { e.printStackTrace(); } } public static int insertCityInfo(CityInfo cityInfo) { Connection connection = DBUtils.getConnection(); QueryRunner qr = new QueryRunner(); String sql1 = "insert into cityinfo values (?,?,?,?,?,?)"; // 返回主键 String sql2 = "SELECT LAST_INSERT_ID()"; try { int result = qr.update(connection, sql1, null, cityInfo.getParentId(), cityInfo.getType(), cityInfo.getName(), cityInfo.getCode(), cityInfo.getUrl()); int key = Integer.parseInt(qr.query(connection, sql2, new ScalarHandler<>()).toString()); releaseSource(connection, null, null); return key; } catch (SQLException e) { e.printStackTrace(); } return 0; } }
写代码的过程中出现了一些问题如: Jsoup如何在设置编码的同时设置连接超时,如何在超时或者动态ip代理无效的时候重新获取动态ip代理,如何在解析html失败后继续解析等。
通过这次编程发现自己在Java网络方面的知识还是有待提高,以后继续努力!