Practical Exercise 2: Web Crawler

Time: 2024-10-24 12:47:38
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class test1 {
    public static void main(String[] args) throws IOException {
        // 1. Define variables holding the target URLs
        String familyName = "https://hanyu.baidu.com/shici/detail?pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&from=kg0";
        String boyName = "http://www.haoming8.cn/baobao/10881.html";
        String girlName = "http://www.haoming8.cn/baobao/7641.html";

        // 2. Crawl each page, concatenating everything on it into one string
        String familyNameHtml = webCrawler(familyName);
        String boyNameHtml = webCrawler(boyName);
        String girlNameHtml = webCrawler(girlName);
        //System.out.println(familyNameHtml);

        // 3. Extract the data with regular expressions: surnames are runs of four
        //    Chinese characters ending in , or 。; boys' names are two characters
        //    ending in 、 or 。; girls' names are two characters followed by a space
        ArrayList<String> familyNameList = getData(familyNameHtml, "([\\u4e00-\\u9fa5]{4})(,|。)", 1);
        ArrayList<String> boyNameList = getData(boyNameHtml, "([\\u4e00-\\u9fa5]{2})(、|。)", 1);
        ArrayList<String> girlNameList = getData(girlNameHtml, "([\\u4e00-\\u9fa5]{2})( )", 1);

        System.out.println(familyNameList);
        System.out.println(boyNameList);
        System.out.println(girlNameList);
    }

    private static ArrayList<String> getData(String str, String regex, int index) {
        // 1. Create a list to hold the extracted data
        ArrayList<String> list = new ArrayList<>();
        // 2. Compile the regular expression, then scan str with it,
        //    collecting the requested capture group of every match
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(str);
        while (matcher.find()) {
            list.add(matcher.group(index));
        }
        return list;
    }

    public static String webCrawler(String net) throws IOException {
        StringBuilder sb = new StringBuilder();
        // Create a URL object, open a connection, and read the page character by
        // character; an explicit charset (UTF-8 assumed here) makes the decoding
        // deterministic instead of depending on the platform default
        URL url = new URL(net);
        URLConnection conn = url.openConnection();
        InputStreamReader isr = new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8);
        int ch;
        while ((ch = isr.read()) != -1) {
            sb.append((char) ch);
        }
        isr.close();
        return sb.toString();
    }
}
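The regex rules are the trickiest part of this exercise, so it can help to verify getData against a small hand-written string before pointing it at a live page. Below is a minimal sketch of such a check; the sample string and the class name GetDataCheck are made up for illustration and are not part of the original program.

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetDataCheck {
    public static void main(String[] args) {
        // A hand-written sample in the same shape as the surname page:
        // groups of four Chinese characters separated by full-width , and 。
        String sample = "赵钱孙李,周吴郑王。冯陈褚卫,蒋沈韩杨。";
        ArrayList<String> groups = getData(sample, "([\\u4e00-\\u9fa5]{4})(,|。)", 1);
        System.out.println(groups); // [赵钱孙李, 周吴郑王, 冯陈褚卫, 蒋沈韩杨]
    }

    // Same extraction logic as in the crawler: collect capture group `index`
    // of every match of `regex` found in `str`
    private static ArrayList<String> getData(String str, String regex, int index) {
        ArrayList<String> list = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(str);
        while (matcher.find()) {
            list.add(matcher.group(index));
        }
        return list;
    }
}

Note that group 1 is the run of Chinese characters, while group 2 is only the trailing punctuation, which is why the calls pass index 1.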
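webCrawler reads the page one character at a time through URLConnection, which works but is verbose. On Java 11 and later the same step can be done with java.net.http.HttpClient. The sketch below is an alternative, not the original's approach; it assumes a Java 11+ runtime, and BodyHandlers.ofString() decodes using the charset in the response headers, falling back to UTF-8.

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class HttpClientCrawler {
    // Fetch the whole page body as one string
    public static String webCrawler(String net) throws IOException, InterruptedException {
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder(URI.create(net)).GET().build();
        HttpResponse<String> response =
                client.send(request, HttpResponse.BodyHandlers.ofString());
        return response.body();
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        System.out.println(webCrawler("http://www.haoming8.cn/baobao/10881.html"));
    }
}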