java+selenium

时间:2025-03-31 09:43:06
package com.mengkeng.selenium_demo.test;

import com.alibaba.fastjson.JSON;
import com.mengkeng.selenium_demo.entity.BuildAreaUrlLj;
import com.mengkeng.selenium_demo.entity.IdAndNamePO;
import com.mengkeng.selenium_demo.entity.TkBuildingsAreaInfolj;
import com.mengkeng.selenium_demo.entity.TkBuildingsMonthPriceLj;
import com.mengkeng.selenium_demo.mapper.BuildAreaUrlLjMapper;
import com.mengkeng.selenium_demo.service.ProxyService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateFormatUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.PageLoadStrategy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.HashOperations;
import org.springframework.data.redis.core.SetOperations;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler for Lianjia residential-community ("xiaoqu") data, exposed under the
 * {@code areaInfo} request mapping.
 *
 * <p>The crawl has two phases: {@link #getUrls} walks the city index and collects
 * district list URLs, then {@link #parsePagePre} fans the stored district URLs out
 * to a thread pool where {@link #parsePage} opens every community detail page.
 * Redis is used to remember which cities, districts and community codes were
 * already processed so that a restarted crawl can skip them.
 *
 * <p>Date: 2022-09-05 13:58
 */
@RestController
@RequestMapping("areaInfo")
@Slf4j
public class LianjiaAreaInfoDemo {

    @Autowired
    private StringRedisTemplate redisTemplate;
    @Autowired
    private BuildAreaUrlLjMapper buildAreaUrlLjMapper;
    @Autowired
    private ProxyService proxyService;

    /** Redis set of district URLs whose list pages were fully processed. */
    public static final String SKIP_URLS = "SKIP_URLS_AREAINFO_LIANJIA";
    /** Redis set of city names whose district URLs were already collected. */
    public static final String URLS = "URLS_AREAINFO_LIANJIA";
    /** Redis hash checked to decide whether a community code still needs its detail page parsed. */
    public static final String AREA_INFO_COMMUNITY_CODE_LJ = "AREA_INFO_COMMUNITY_CODE_LJ";

    /** Extracts the community code from a detail link of the form {@code .../xiaoqu/<code>/}. */
    private static final Pattern COMMUNITY_CODE_PATTERN = Pattern.compile("xiaoqu/(\\w+)/");

    /** User-agent strings; one is picked at random for every new browser. */
    private static final String[] USER_AGENTS = {
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"
    };

    /** Parent ids for which the looked-up name is replaced by "市辖区" (see {@link #queryProvinceCityArea}). */
    private static final Set<String> MUNICIPALITY_IDS =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList("50", "11", "31", "12")));

    /** Exclusive upper index of the stored district URLs processed per run. */
    private static final int MAX_AREA_URL_INDEX = 3500;

    // NOTE(review): this URL has no scheme/host, so driver.get() cannot resolve it as written —
    // the site prefix appears to have been stripped and must be restored before running.
    private static final String CITY_INDEX_URL = "/city/";

    private static final String TOTAL_XPATH = "//h2[@class='total fl']/span";
    private static final String COMMUNITY_LINK_XPATH = "//ul[@class='listContent']/li/div[1]/div[1]/a";
    private static final String CITY_LINK_XPATH = "//ul[@class='city_list_ul']/li/div[2]/div/ul/li/a";
    private static final String COUNTY_LINK_XPATH = "//div[@data-role='ershoufang']/div[1]/a";
    private static final String SUB_AREA_LINK_XPATH = "//div[@data-role='ershoufang']/div[2]/a";

    /** Never populated in this class, so {@link #sleepAndCutoverNewPage} treats every handle as new. */
    private static LinkedList<String> pages = new LinkedList<>();

    // With an unbounded LinkedBlockingQueue a ThreadPoolExecutor never grows past its core size,
    // so this pool effectively runs 2 browsers at a time; the maximum of 10 is never reached.
    ThreadPoolExecutor pagepoolExecutor = new ThreadPoolExecutor(2, 10, 30L, TimeUnit.SECONDS, new LinkedBlockingQueue<>());

    /**
     * Runs the full crawl: collects district URLs, then schedules list-page parsing.
     * On any failure the attempt is logged and retried after ten seconds, indefinitely.
     * Note that {@link #parsePagePre} only submits tasks, so "完成" is printed once
     * scheduling is done, not once every page has been parsed.
     *
     * @throws InterruptedException if the thread is interrupted while waiting to retry
     */
    @RequestMapping("sync")
    public void sync() throws InterruptedException {
        // NOTE(review): System.setProperty rejects an empty key with IllegalArgumentException, so this
        // call fails before any crawling starts. The key/path look stripped (presumably the chromedriver
        // location property) — restore the real values; left untouched here because they are unknown.
        System.setProperty("", "D://");
        while (true) {
            try {
                ChromeDriver driver = getChromeDriver();
                SetOperations<String, String> ops = redisTemplate.opsForSet();
                try {
                    getUrls(driver, ops);
                    parsePagePre(ops);
                } finally {
                    sleep(1000);
                    driver.quit();
                }
                break;
            } catch (Exception e) {
                // Previously retried silently; keep the retry but make the failure visible.
                log.warn("Crawl attempt failed, retrying in 10s", e);
                Thread.sleep(10000);
            }
        }
        System.out.println("完成");
    }

    /**
     * Creates a headless, incognito Chrome browser with images disabled, a random
     * user agent and, unless the proxy service answers blank or "local", a proxy.
     *
     * @return a new driver; the caller is responsible for calling {@code quit()}
     */
    private ChromeDriver getChromeDriver() {
        String nextProxy = proxyService.getNextProxy();
        System.out.println("当前ip是" + nextProxy);

        ChromeOptions chromeOptions = new ChromeOptions();
        chromeOptions.setPageLoadStrategy(PageLoadStrategy.EAGER);
        chromeOptions.addArguments("--incognito");
        chromeOptions.addArguments("--blink-settings=imagesEnabled=false");
        chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--no-sandbox");
        chromeOptions.addArguments("--disable-gpu");
        if (StringUtils.isNotBlank(nextProxy) && !nextProxy.equals("local")) {
            chromeOptions.addArguments("--proxy-server=" + nextProxy);
        }

        // WebRTC preferences that keep UDP traffic from bypassing the proxy.
        HashMap<String, Object> prefs = new HashMap<>();
        prefs.put("webrtc.ip_handling_policy", "disable_non_proxied_udp");
        prefs.put("webrtc.multiple_routes_enabled", false);
        prefs.put("webrtc.nonproxied_udp_enabled", false);
        chromeOptions.setExperimentalOption("prefs", prefs);

        // Chrome's switch is "--user-agent="; the former "User-Agent=" argument was not a switch
        // and therefore never changed the user agent.
        String userAgent = USER_AGENTS[new Random().nextInt(USER_AGENTS.length)];
        chromeOptions.addArguments("--user-agent=" + userAgent);

        ChromeDriver driver = new ChromeDriver(chromeOptions);
        driver.manage().window().maximize();
        return driver;
    }

    /**
     * Loads the stored district URLs and submits one {@link #parsePage} task per URL
     * that is not yet in the {@link #SKIP_URLS} set.
     *
     * @param ops Redis set operations used for the skip check
     */
    private void parsePagePre(SetOperations<String, String> ops) {
        HashOperations<String, Object, Object> opsForHash = redisTemplate.opsForHash();
        List<BuildAreaUrlLj> allUrls = buildAreaUrlLjMapper.selectList(null);
        if (allUrls == null || allUrls.size() <= 1) {
            return;
        }
        // Clamp the upper bound: subList(1, 3500) threw IndexOutOfBoundsException for shorter lists.
        // NOTE(review): the first row (index 0) is skipped exactly as before — confirm that is intended.
        int toIndex = Math.min(MAX_AREA_URL_INDEX, allUrls.size());
        for (BuildAreaUrlLj buildAreaUrlLj : allUrls.subList(1, toIndex)) {
            if (Boolean.TRUE.equals(ops.isMember(SKIP_URLS, buildAreaUrlLj.getAreaUrl()))) {
                System.out.println("跳过当前区域" + buildAreaUrlLj.getCityName() + "-" + buildAreaUrlLj.getCountyName());
                continue;
            }
            pagepoolExecutor.execute(() -> parsePage(ops, opsForHash, buildAreaUrlLj));
        }
    }

    /**
     * Parses every list page of one district in its own browser and marks the
     * district URL as done in {@link #SKIP_URLS} afterwards.
     *
     * @param ops            Redis set operations used to record the finished district
     * @param opsForHash     Redis hash operations used for the community-code check
     * @param buildAreaUrlLj the district whose list pages are crawled
     * @throws RuntimeException wrapping a {@link NumberFormatException} when a count,
     *                          page number or price on the page is not numeric
     */
    private void parsePage(SetOperations<String, String> ops, HashOperations<String, Object, Object> opsForHash, BuildAreaUrlLj buildAreaUrlLj) {
        ChromeDriver driver = getChromeDriver();
        try {
            driver.get(buildAreaUrlLj.getAreaUrl());
            String windowHandlePage = driver.getWindowHandle();
            WebElement totalElement = validElement(TOTAL_XPATH, driver);
            if (null != totalElement) {
                int total = Integer.parseInt(totalElement.getText());
                // NOTE(review): "> 1" skips districts that list exactly one community — confirm
                // whether "> 0" was intended.
                if (total > 1) {
                    crawlListPages(driver, windowHandlePage, opsForHash, buildAreaUrlLj);
                }
            }
            ops.add(SKIP_URLS, buildAreaUrlLj.getAreaUrl());
        } catch (NumberFormatException e) {
            // Keep the original message but preserve the cause for diagnosis.
            throw new RuntimeException("多线程发生异常" + e.getMessage(), e);
        } finally {
            driver.quit();
        }
    }

    /**
     * Walks all list pages of a district, opening each community link and parsing
     * its detail page unless the community code is already present in Redis.
     */
    private void crawlListPages(ChromeDriver driver, String listWindowHandle, HashOperations<String, Object, Object> opsForHash, BuildAreaUrlLj buildAreaUrlLj) {
        String pageData = driver.findElement(By.xpath("//div[@class='page-box house-lst-page-box']")).getAttribute("page-data");
        int totalPages = Integer.parseInt(JSON.parseObject(pageData).getString("totalPage"));
        System.out.println("当前区域页数" + totalPages + "---" + buildAreaUrlLj.getAreaUrl());

        for (int page = 1; page <= totalPages; page++) {
            List<WebElement> links = driver.findElements(By.xpath(COMMUNITY_LINK_XPATH));
            for (int i = 0; i < links.size(); i++) {
                WebElement link = links.get(i);
                String code = extractCommunityCode(link.getAttribute("href"));
                driver.executeScript("arguments[0].click();", link);
                sleepAndCutoverNewPage(300, driver);
                // Only parse the detail page when the code is not yet known.
                if (!Boolean.TRUE.equals(opsForHash.hasKey(AREA_INFO_COMMUNITY_CODE_LJ, code))) {
                    parseDetail(driver, code, buildAreaUrlLj, opsForHash);
                } else {
                    System.out.println("当前code redis 存在" + code);
                }
                driver.close();
                driver.switchTo().window(listWindowHandle);
                sleep(200);
                // Look the links up again after returning to the list window.
                links = driver.findElements(By.xpath(COMMUNITY_LINK_XPATH));
            }
            if (page != totalPages) {
                String nextPage = buildAreaUrlLj.getAreaUrl() + "pg" + (page + 1) + "/";
                driver.get(nextPage);
                System.out.println("下一页是" + nextPage);
                sleep(200);
            }
        }
    }

    /**
     * Returns the community code of the last {@code xiaoqu/<code>/} segment in the link,
     * or an empty string when the link is {@code null} or has no such segment.
     */
    private static String extractCommunityCode(String href) {
        String code = "";
        if (href == null) {
            return code;
        }
        Matcher matcher = COMMUNITY_CODE_PATTERN.matcher(href);
        while (matcher.find()) {
            code = matcher.group(1);
        }
        return code;
    }

    /**
     * Reads the price and the basic facts of the community detail page currently
     * shown by the driver, and prints the resulting records.
     *
     * @param driver         browser positioned on the detail page
     * @param communityCode  community code taken from the detail link
     * @param buildAreaUrlLj district the community belongs to
     * @param opsForHash     Redis hash operations (not used by the current implementation)
     */
    private void parseDetail(ChromeDriver driver, String communityCode, BuildAreaUrlLj buildAreaUrlLj, HashOperations<String, Object, Object> opsForHash) {
        if (null != validElement("//span[@class='xiaoquUnitPrice']", driver)) {
            TkBuildingsMonthPriceLj lj = new TkBuildingsMonthPriceLj();
            lj.setCommunityCode(communityCode);
            String priceDesc = driver.findElement(By.className("xiaoquUnitPriceDesc")).getText();
            if ("挂牌均价".equals(priceDesc)) {
                lj.setYearmonth(DateFormatUtils.format(new Date(), "yyyyMM"));
            } else {
                int month = Integer.parseInt(priceDesc.replace("月参考均价", ""));
                LocalDate today = LocalDate.now();
                // A month later than the current one cannot belong to this year (e.g. a December
                // price read in January), so it is attributed to the previous year.
                int year = month > today.getMonthValue() ? today.getYear() - 1 : today.getYear();
                lj.setYearmonth(String.format("%d%02d", year, month));
            }
            lj.setAvgPrice(Integer.valueOf(driver.findElement(By.className("xiaoquUnitPrice")).getText()));
            lj.setGenerateType("0");
            lj.setCreateBy("1");
            lj.setCreateDate(new Date());
            lj.setUpdateBy("1");
            lj.setUpdateDate(new Date());
            lj.setDelFlag("0");
            System.out.println("持久化价格" + lj);
        }

        TkBuildingsAreaInfolj infolj = new TkBuildingsAreaInfolj();
        infolj.setDataOrigin("lianjia");
        infolj.setGenerateType("0");
        infolj.setProvince(buildAreaUrlLj.getProvinceId());
        infolj.setCity(buildAreaUrlLj.getCityId());
        infolj.setArea(buildAreaUrlLj.getCountyId());
        WebElement title = validElement("//h1[@class='detailTitle']", driver);
        infolj.setCommunity(title == null ? "" : title.getText());
        infolj.setCommunityCode(communityCode);
        infolj.setBuildingYear(labelledValue(driver, "建筑年代"));
        infolj.setBuildingType(labelledValue(driver, "建筑类型"));
        infolj.setManageCost(labelledValue(driver, "物业费用"));
        infolj.setManageCompany(labelledValue(driver, "物业公司"));
        infolj.setManageDevlop(labelledValue(driver, "开发商"));
        infolj.setBuildingCount(labelledValue(driver, "楼栋总数"));
        infolj.setRoomCount(labelledValue(driver, "房屋总数"));
        infolj.setCreateBy("1");
        infolj.setCreateDate(new Date());
        infolj.setUpdateBy("1");
        infolj.setUpdateDate(new Date());
        infolj.setDelFlag("0");
        System.out.println("持久化小区" + infolj);
    }

    /**
     * Returns the text of the second span next to the span with the given label,
     * or an empty string when no span with that label exists on the page.
     */
    private static String labelledValue(WebDriver driver, String label) {
        String labelXpath = "//span[text()='" + label + "']";
        if (validElement(labelXpath, driver) == null) {
            return "";
        }
        return driver.findElement(By.xpath(labelXpath + "/parent::div/span[2]")).getText();
    }

    /**
     * Crawls the city index and records the district list URLs of every city that
     * is not yet in the {@link #URLS} set.
     *
     * @param driver browser used for the crawl
     * @param ops    Redis set operations used to remember finished cities
     */
    private void getUrls(ChromeDriver driver, SetOperations<String, String> ops) {
        driver.get(CITY_INDEX_URL);
        int count = 0;
        List<WebElement> elements = driver.findElements(By.xpath(CITY_LINK_XPATH));
        for (int i = 0; i < elements.size(); i++) {
            WebElement element = elements.get(i);
            String provinceName = element.findElement(By.xpath("./parent::li/parent::ul/parent::div/div")).getText();
            String areaName = element.getText();
            if (Boolean.TRUE.equals(ops.isMember(URLS, areaName))) {
                System.out.println("已跑过当前区域 跳过" + areaName);
                continue;
            }
            driver.executeScript("arguments[0].click();", element);
            String frontPage = driver.getWindowHandle();
            WebElement ershoufang;
            try {
                ershoufang = driver.findElement(By.linkText("小区"));
            } catch (Exception e) {
                // The city has no community section: mark it done and go back to the index.
                ops.add(URLS, areaName);
                sleep(200);
                System.out.println(areaName + " 没有小区====");
                driver.get(CITY_INDEX_URL);
                elements = driver.findElements(By.xpath(CITY_LINK_XPATH));
                continue;
            }
            driver.executeScript("arguments[0].click();", ershoufang);
            sleepAndCutoverNewPage(500, driver);

            List<WebElement> citys = driver.findElements(By.xpath(COUNTY_LINK_XPATH));
            citys.forEach(e -> System.out.println("市级============" + e.getText() + "==" + e.getAttribute("href")));
            for (int j = 0; j < citys.size(); j++) {
                String countyName = citys.get(j).getText();
                driver.executeScript("arguments[0].click();", citys.get(j));
                sleep(200);
                if (validElement(TOTAL_XPATH, driver) != null) {
                    String text = driver.findElement(By.xpath(TOTAL_XPATH)).getText();
                    count += Integer.parseInt(text);
                    System.out.println(countyName + text + "个");
                    System.out.println("当前总数是" + count);
                }
                List<WebElement> areas;
                try {
                    areas = driver.findElements(By.xpath(SUB_AREA_LINK_XPATH));
                } catch (Exception e) {
                    citys = driver.findElements(By.xpath(COUNTY_LINK_XPATH));
                    saveDataCity(countyName, areaName, provinceName, citys);
                    break;
                }
                if (areas.isEmpty()) {
                    // No second-level links: record the first-level links instead and stop.
                    citys = driver.findElements(By.xpath(COUNTY_LINK_XPATH));
                    saveDataCity(countyName, areaName, provinceName, citys);
                    break;
                }
                saveDataCounty(countyName, areaName, provinceName, areas);
                sleep(100);
                // Look the links up again after the click changed the page.
                citys = driver.findElements(By.xpath(COUNTY_LINK_XPATH));
            }
            ops.add(URLS, areaName);
            driver.close();
            driver.switchTo().window(frontPage);
            driver.get(CITY_INDEX_URL);
            sleep(200);
            elements = driver.findElements(By.xpath(CITY_LINK_XPATH));
        }
        System.out.println("总数是" + count);
    }

    /**
     * Builds and prints one URL record per link, using the looked-up names for
     * province, city and county.
     */
    private void saveDataCounty(String countyName, String areaName, String provinceName, List<WebElement> list) {
        // The lookups do not depend on the link, so resolve them once instead of per element.
        IdAndNamePO provincepo = queryProvinceCityArea(1, provinceName, null);
        IdAndNamePO areapo = queryProvinceCityArea(2, areaName, provincepo.getBusinessId());
        IdAndNamePO countypo = queryProvinceCityArea(3, countyName, areapo.getBusinessId());
        for (WebElement element : list) {
            BuildAreaUrlLj buildAreaUrlLj = new BuildAreaUrlLj();
            buildAreaUrlLj.setProvinceName(provincepo.getBusinessName());
            buildAreaUrlLj.setProvinceId(provincepo.getBusinessId());
            buildAreaUrlLj.setCityName(areapo.getBusinessName());
            buildAreaUrlLj.setCityId(areapo.getBusinessId());
            buildAreaUrlLj.setCountyName(countypo.getBusinessName());
            buildAreaUrlLj.setCountyId(countypo.getBusinessId());
            buildAreaUrlLj.setAreaUrl(element.getAttribute("href"));
            buildAreaUrlLj.setCreateTime(new Date());
            buildAreaUrlLj.setUpdateTime(new Date());
            System.out.println("持久化链接" + buildAreaUrlLj);
        }
    }

    /**
     * Builds and prints one URL record per link, keeping the province and city
     * names exactly as passed in and using the looked-up name only for the county.
     */
    private void saveDataCity(String countyName, String areaName, String provinceName, List<WebElement> list) {
        // The lookups do not depend on the link, so resolve them once instead of per element.
        IdAndNamePO provincepo = queryProvinceCityArea(1, provinceName, null);
        IdAndNamePO areapo = queryProvinceCityArea(2, areaName, provincepo.getBusinessId());
        IdAndNamePO countypo = queryProvinceCityArea(3, countyName, areapo.getBusinessId());
        for (WebElement element : list) {
            BuildAreaUrlLj buildAreaUrlLj = new BuildAreaUrlLj();
            buildAreaUrlLj.setProvinceName(provinceName);
            buildAreaUrlLj.setProvinceId(provincepo.getBusinessId());
            buildAreaUrlLj.setCityName(areaName);
            buildAreaUrlLj.setCityId(areapo.getBusinessId());
            buildAreaUrlLj.setCountyName(countypo.getBusinessName());
            buildAreaUrlLj.setCountyId(countypo.getBusinessId());
            buildAreaUrlLj.setAreaUrl(element.getAttribute("href"));
            buildAreaUrlLj.setCreateTime(new Date());
            buildAreaUrlLj.setUpdateTime(new Date());
            System.out.println("持久化链接" + buildAreaUrlLj);
        }
    }

    /**
     * Looks up province / city / district information by name.
     *
     * <p>The actual lookups are not wired in, so this currently always returns a
     * placeholder with id {@code "-1"} and the (possibly replaced) name.
     *
     * @param type         1 = province, 2 = city, 3 = district
     * @param businessName name to look up; replaced by "市辖区" when {@code parentId}
     *                     is one of {@link #MUNICIPALITY_IDS}
     * @param parentId     id of the parent region, may be {@code null}
     * @return the lookup result, never {@code null}
     */
    private IdAndNamePO queryProvinceCityArea(Integer type, String businessName, String parentId) {
        if (StringUtils.isNotBlank(parentId) && MUNICIPALITY_IDS.contains(parentId)) {
            businessName = "市辖区";
        }
        IdAndNamePO po = null;
        try {
            if (type == 1) {
                // province lookup by name goes here
            } else if (type == 2) {
                // city lookup by name and parent id goes here
            } else if (type == 3) {
                // district lookup by name and parent id goes here
            }
        } catch (Exception e) {
            log.error("Region lookup failed for type={} name={} parentId={}", type, businessName, parentId, e);
        }
        if (null == po) {
            po = new IdAndNamePO();
            po.setBusinessId("-1");
            po.setBusinessName(businessName);
        }
        return po;
    }

    /**
     * Waits, then switches the driver to each open window handle in turn, so it
     * ends up on the last handle reported by the driver.
     *
     * @param millis time to wait before switching, in milliseconds
     * @param driver driver to switch
     * @return always {@code null}
     */
    private static String sleepAndCutoverNewPage(int millis, WebDriver driver) {
        try {
            Thread.sleep(millis);
            for (String handle : driver.getWindowHandles()) {
                if (!pages.contains(handle)) {
                    driver.switchTo().window(handle);
                }
            }
        } catch (InterruptedException e) {
            // Do not lose the interruption; let the caller's thread observe it.
            Thread.currentThread().interrupt();
        }
        return null;
    }

    /**
     * Sleeps for the given time; an interruption ends the sleep early and is
     * recorded on the thread instead of being thrown.
     *
     * @param millis time to sleep, in milliseconds
     */
    private static void sleep(int millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Finds an element by XPath without throwing when it is missing.
     *
     * @param str    XPath expression
     * @param driver driver to search with
     * @return the first matching element, or {@code null} if the lookup failed
     */
    public static WebElement validElement(String str, WebDriver driver) {
        try {
            return driver.findElement(By.xpath(str));
        } catch (Exception e) {
            System.out.println("这个元素不存在" + str);
        }
        return null;
    }
}