1 数据来源
*国家统计局
点击打开链接:http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html
2 Jsoup解析
//Jsoup.parse()似乎获取不了数据 Document document = Jsoup.connect("http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html").post(); //在浏览器在分析其DOM结构得到如下解析步骤 Elements select = document.select(".MsoNormal"); List<Area> areas = new ArrayList<>(); int size =0; Area area =null; Integer parentCode =null; for (Element e:select) { Elements span = e.select("span"); size = span.size(); area = new Area(); if(size==3){//是省级或直辖市 ////比较坑,String s="110000 ";拿到的文本内容如s,直接用String.trim()出来的字符串还是无法解析成数字所以自己写了一个 parentCode=Integer.parseInt(StringUtil.trim(span.get(0).text().trim())); System.out.println(span.get(0).text().trim()+"|"); area.setCode(parentCode); area.setName(span.get(2).text().trim()); System.out.println(span.get(2).text().trim()+"|"); } else if(size==4){//是省级下面 area.setParentCode(parentCode); area.setCode(Integer.parseInt(StringUtil.trim(span.get(1).text().trim()))); System.out.println(span.get(1).text().trim()+"|"); area.setName(span.get(3).text().trim()); System.out.println(span.get(3).text().trim()+"|"); } areas.add(area); //System.out.println(span.toString()); } System.out.println("总共解析到"+areas.size()+"个地区数据"); /** * 入库,建表和数据库相关操作不再缀述 */ java.sql.Connection connection = MyTest.getConnection(); connection.setAutoCommit(false); PreparedStatement preparedStatement = connection.prepareStatement("insert into area values(?,?,?)"); for (Area a:areas ) { preparedStatement.setInt(1,a.getCode()); preparedStatement.setString(2,a.getName()); preparedStatement.setObject(3,a.getParentCode()); preparedStatement.execute(); } connection.commit(); connection.close(); public static String trim(String s){ byte[] bytes = s.getBytes(); StringBuffer stringBuffer =new StringBuffer(); int cnt =0; for (byte b:bytes){ if(b>=0){ cnt++; } } byte [] newb = new byte[cnt]; cnt=0; for (int i = 0; i <bytes.length ; i++) { if(bytes[i]>=0){ newb[cnt++]=bytes[i]; } } return new String(newb); }