引言
因为项目涉及到一个省市县三级联动的模块,而国家统计局在2016年7月重新发布了统计用区划代码,对一些县区进行了调整,并且这次还新增了镇和村两级的数据,所以想了一下,就打算把这些数据全部爬下来。但是水平有限,爬到村一级时总是有数据丢失,原因是我使用的jsoup连接超时,为此也加入了调整获取速率的处理。废话不多说,直接上代码。
还有一点需要注意的就是:中国的这5个地级市,既不设市辖区,又不管辖县、自治县、旗、自治旗,亦不代管县级市,而是直接辖乡级行政区,俗称“直筒子市”。 分别是:1、东莞市(广东省)2、中山市(广东省)3、三沙市(海南省)4、儋州市(海南省)5、嘉峪关市(甘肃省)特别注意!!!
正文
代码写得有点臃肿,但好在重复率不高,而且是一次性使用的东西,就没有进行优化。
package com.jsoup; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.ibatis.session.SqlSession; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import com.region.dao.Region; import com.region.dao.RegionMapper; import com.region.factory.MybatisFactory; public class Html { /** * 根据url 返回Document结点 * * @param url * @return * @throws InterruptedException */ private Document htmlTextByUrl(String url) { Document doc = null; try { int i = (int) (Math.random() * 100); // 随机延迟,防止网站屏蔽 try { Thread.sleep(i); } catch (InterruptedException e) { e.printStackTrace(); } doc = Jsoup.connect(url).data("query", "Java").userAgent("Mozilla").cookie("auth", "token").timeout(3000000) .post(); // System.out.println("获取成功已返回!"); } catch (IOException e) { e.printStackTrace(); } try { doc = Jsoup.connect(url).timeout(50000000).get(); } catch (IOException e1) { e1.printStackTrace(); } return doc; } /** * 从本地获取Document * * @param path * 文档路径 * @return */ @SuppressWarnings("unused") private Document htmlTextByPath(String path) { Document doc = null; File input = new File(path); try { doc = Jsoup.parse(input, "GBK"); System.out.println("本地网页已获取成功,正在返回"); } catch (IOException e) { e.printStackTrace(); } return doc; } /** * 从目标url获取网页并下载到本地 * * @param url * 目标网站url * @param path * 保持路径 */ @SuppressWarnings("unused") private void Save_Html(String url, String path) { try { File dest = new File(path); InputStream is; FileOutputStream fos = new FileOutputStream(dest); URL temp = new URL(url); is = temp.openStream(); BufferedInputStream bis = new BufferedInputStream(is); BufferedOutputStream bos = new BufferedOutputStream(fos); int length; byte[] bytes = new byte[1024 * 20]; while ((length = 
bis.read(bytes, 0, bytes.length)) != -1) { fos.write(bytes, 0, length); } bos.close(); fos.close(); bis.close(); is.close(); } catch (IOException e) { e.printStackTrace(); } } /** * 根据文档标签的类型解析文档(支持市、县、区镇) * * @param doc * 文档 * @param typeName * 标签类型 * @return */ private List<Map<String, String>> analysis(Document doc, String typeName) { List<Map<String, String>> resultList = new ArrayList<>(); Map<String, String> resultMap = null; Elements selects = doc.select(typeName + ">*>*"); for (int i = 0, length = selects.size(); i < length; i = i + 2) { resultMap = new HashMap<>(); resultMap.put("code", selects.get(i).text()); resultMap.put("name", selects.get(i + 1).text()); resultMap.put("url", selects.get(i).absUrl("href")); resultList.add(resultMap); } return resultList; } /** * 根据文档标签的类型解析文档(村委会 特殊处理) * * @param doc * 文档 * @param typeName * 标签类型 * @return */ @SuppressWarnings("unused") private List<Map<String, String>> analysisVillagetr(Document doc, String typeName) { List<Map<String, String>> resultList = new ArrayList<>(); Map<String, String> resultMap = null; Elements selects = doc.select(typeName + ">*"); System.out.println(selects.size()); for (int i = 0, length = selects.size(); i < length; i = i + 3) { resultMap = new HashMap<>(); resultMap.put("code", selects.get(i).text()); resultMap.put("name", selects.get(i + 2).text()); resultList.add(resultMap); System.out.println("名称:" + selects.get(i + 2).text() + " 编号:" + selects.get(i).text()); } return resultList; } /** * 获取某县下属的所有村委会 */ public void getTowntrData(String url, SqlSession session) { Html html = new Html(); Document document = html.htmlTextByUrl(url); List<Map<String, String>> analysisList = html.analysis(document, "tr.towntr"); List<Region> resultlist = new ArrayList<>(); RegionMapper mapper = session.getMapper(RegionMapper.class); Region region = null; for (Map<String, String> map : analysisList) { String towntrUrl = map.get("url"); region = new Region(); region.setName(map.get("name")); 
region.setUrl(towntrUrl); region.setCode(Long.parseLong(map.get("code"))); resultlist.add(region); } if (resultlist.size() > 0) { mapper.insertBatch(resultlist); session.commit(); } } /* * public void getTowntrData(String url, RegionMapper mapper) { StringBuffer * stringBuffer = new StringBuffer(); Html html = new Html(); Document * document = html.htmlTextByUrl(url); List<Map<String, String>> * analysisList = html.analysis(document, "tr.towntr"); * * for (Map<String, String> map : analysisList) { * stringBuffer.append("this.regionMap.put(" + map.get("code") + ",\"" + * map.get("name") + "\");\r\n"); String mapUrl = map.get("url"); Document * htmlTextByUrl = html.htmlTextByUrl(mapUrl); List<Map<String, String>> * analysisVillagetr = html.analysisVillagetr(htmlTextByUrl, * "tr.villagetr"); * * for (Map<String, String> villMap : analysisVillagetr) { * stringBuffer.append("this.regionMap.put(" + villMap.get("code") + ",\"" + * villMap.get("name") + "\");\r\n"); } } * SaveUtil.save(stringBuffer.toString()); } */ /** * 获取某市下属的所有的村委会 * * @param url * @return */ public void getCountytrData(String url, SqlSession session) { Html html = new Html(); Document document = html.htmlTextByUrl(url); List<Map<String, String>> countyList = html.analysis(document, "tr.countytr"); List<Region> resultlist = new ArrayList<>(); RegionMapper mapper = session.getMapper(RegionMapper.class); Region region = null; System.out.println(" 共有 县区:" + countyList.size()); for (Map<String, String> map : countyList) { String countyUrl = map.get("url"); region = new Region(); region.setName(map.get("name")); region.setUrl(countyUrl); region.setCode(Long.parseLong(map.get("code"))); resultlist.add(region); // 获取某县下属所有村委会 getTowntrData(countyUrl, session); } if (resultlist.size() > 0) { mapper.insertBatch(resultlist); session.commit(); } } /** * 获得某省下属所有的村委会 * * @param url * @return */ public void getCityData(String url, SqlSession session) { Html html = new Html(); Document document = 
html.htmlTextByUrl(url); List<Map<String, String>> cityList = html.analysis(document, "tr.citytr"); List<Region> resultlist = new ArrayList<>(); RegionMapper mapper = session.getMapper(RegionMapper.class); Region region = null; for (Map<String, String> map : cityList) { String cityUrl = map.get("url"); region = new Region(); region.setName(map.get("name")); region.setUrl(cityUrl); region.setCode(Long.parseLong(map.get("code"))); resultlist.add(region); System.out.println("目前所扫描的市:"+ map.get("name")); // 获取某市下属所有村委会 getCountytrData(cityUrl, session); } if (resultlist.size() > 0) { session.commit(); mapper.insertBatch(resultlist); } } /** * 获得国内下属所有的村委会 * * @param url * @return */ public void getProvincetr(String url) { Html html = new Html(); Document document = html.htmlTextByUrl(url); Elements selects = document.select("tr.provincetr>*>*"); SqlSession session = MybatisFactory.getSession(); RegionMapper mapper = session.getMapper(RegionMapper.class); Region region = null; List<Region> resultlist = new ArrayList<>(); for (int i = 0, length = selects.size(); i < length; i++) { String cityUrl = selects.get(i).absUrl("href"); region = new Region(); region.setName(selects.get(i).text()); region.setUrl(cityUrl); resultlist.add(region); // 获取某省下属所有村委会 getCityData(cityUrl, session); } if (resultlist.size() > 0) { mapper.insertBatch(resultlist); session.commit(); session.close(); } } }具体调用的函数是这样
package com; import java.io.IOException; import org.apache.ibatis.session.SqlSession; import com.jsoup.Html; import com.region.factory.MybatisFactory; public class Main { public static void main(String[] args) throws IOException, InterruptedException { /** * 省直辖市 provincetr * 市 citytr * 县区 countytr * 镇街道 towntr * 村社区 villagetr */ Html html = new Html(); long currentTimeMillis = System.currentTimeMillis(); // 截止2016年 String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html"; SqlSession session = MybatisFactory.getSession(); // html.getCityData("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46.html", session); // html.getTowntrData("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46/4604.html", session); session.close(); // String url1 = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46.html"; // html.getCityData(url1); System.out.println("共耗时:" + (System.currentTimeMillis() - currentTimeMillis) + "ms"); } }
这里就不对代码进行过多的解释了。整理出来的数据我自己写了一个小demo,打成了一个jar包,获取速度在5ms左右,大小约0.98M。
统计用区划代码和城乡划分代码所涉及的数据的sql文件:https://download.csdn.net/download/weixin_39923425/10297338
统计用区划代码和城乡划分代码整理 region-1.0.0链接:https://github.com/shouyeHua/region