【网络爬虫】HttpClient抓取+解析+存储数据

时间:2025-04-09 13:55:37
package ; import ; import ; import ; import ; import ; import ; import ; import ; import ; import ; import ; import ; /** * 网易贷抓取管理器 * @author tsj-pc * */ public class WangYiDaiCrawlManager { public static HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl(); public static String[] column_key = { "platName", "locationAreaName", "locationCityName", "platUrl" }; private static CrawlResultPojo crawlOnePage(UrlPojo urlPojo) { CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo); return resultPojo; } public static int item_count = 0; public static String parserOnePage(String jsonStr) { // 解析该json JSONObject jsonObj = (jsonStr); JSONArray jsonArray = (("list") .toString()); StringBuilder stringBuilder = new StringBuilder(); for (Object json : jsonArray) { JSONObject itemJson = (JSONObject) json; for (String column : column_key) { ((column) + "\t"); } ("\n"); item_count++; } return (); } public static void processWangYiDai(String url, int max_page_number, String filePath) { // 存储所有的抓取条目 StringBuilder all_items = new StringBuilder(); UrlPojo urlPojo = new UrlPojo(url); Map<String, Object> parasMap = new HashMap<String, Object>(); int have_download_page_count = 0; Set<String> uniqSet = new HashSet<String>(); for (int pageNumber = 1; pageNumber <= max_page_number; pageNumber++) { ("currPage", pageNumber); ("params", ""); ("sort", 0); (parasMap); CrawlResultPojo resultPojo = crawlOnePage(urlPojo); if ((())) { ("碰到重复,代表已抓取完成!"); break; } else { (()); } if (resultPojo != null) { String content = (); String page_items = parserOnePage(content); all_items.append(page_items); have_download_page_count++; } } ("all items size---" + item_count); ("已经下载了---" + have_download_page_count); (filePath, all_items.toString(), "utf-8"); ("save successfully~"); } public static void main(String[] args) { String url = "/front_select-plat"; int max_page_number = 100; String fileName = "网易贷_数据集.txt"; processWangYiDai(url, max_page_number, fileName); ("done!"); } }