// java + selenium
package com.mengkeng.selenium_demo.test;
import com.alibaba.fastjson.JSON;
import com.mengkeng.selenium_demo.entity.BuildAreaUrlLj;
import com.mengkeng.selenium_demo.entity.IdAndNamePO;
import com.mengkeng.selenium_demo.entity.TkBuildingsAreaInfolj;
import com.mengkeng.selenium_demo.entity.TkBuildingsMonthPriceLj;
import com.mengkeng.selenium_demo.mapper.BuildAreaUrlLjMapper;
import com.mengkeng.selenium_demo.service.ProxyService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateFormatUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.PageLoadStrategy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.HashOperations;
import org.springframework.data.redis.core.SetOperations;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 *
 * Date: 2022-09-05 13:58
 * Description: Selenium-based crawler demo for Lianjia residential community (小区) data.
 */
@RestController
@RequestMapping("areaInfo")
@Slf4j
public class LianjiaAreaInfoDemo {
/** Redis access; its set and hash operations are used by the crawl methods below. */
@Autowired
private StringRedisTemplate redisTemplate;
/** Mapper used to load the stored area URLs. */
@Autowired
private BuildAreaUrlLjMapper buildAreaUrlLjMapper;
/** Supplies the proxy address for each new browser session. */
@Autowired
private ProxyService proxyService;
/** Redis set of area URLs whose list pages have been fully parsed. */
public static final String SKIP_URLS = "SKIP_URLS_AREAINFO_LIANJIA";
/** Redis set of city names whose area URLs have already been collected. */
public static final String URLS = "URLS_AREAINFO_LIANJIA";
/** Redis hash checked by community code; nothing in this class writes to it. */
public static final String AREA_INFO_COMMUNITY_CODE_LJ = "AREA_INFO_COMMUNITY_CODE_LJ";
/**
 * Window handles that must not be switched to. Nothing in this class adds entries,
 * so the list is always empty here; it is never reassigned and is therefore final.
 */
private static final LinkedList<String> pages = new LinkedList<>();
/**
 * Worker pool for list-page parsing. Because the work queue is unbounded, the pool
 * never grows past its core size: at most 2 tasks run at once and the maximum of 10
 * has no effect.
 */
ThreadPoolExecutor pagepoolExecutor = new ThreadPoolExecutor(2,
        10, 30L,
        TimeUnit.SECONDS, new LinkedBlockingQueue<>());
/**
 * Runs one full crawl: collects the area URLs, then schedules list-page parsing.
 *
 * <p>If an attempt throws, the failure is logged and the whole attempt is repeated after
 * a 10 second pause, until one attempt finishes without an exception. {@code parsePagePre}
 * only submits tasks to the worker pool, so "完成" is printed once scheduling is done,
 * not once every page has been parsed.
 *
 * @throws InterruptedException if the thread is interrupted while waiting between attempts
 */
@RequestMapping("sync")
public void sync() throws InterruptedException {
    // The original called System.setProperty("", "D://"). An empty key makes
    // System.setProperty throw IllegalArgumentException, so the endpoint failed before any
    // crawling started. The chromedriver location has to be provided through the
    // "webdriver.chrome.driver" system property (e.g. -Dwebdriver.chrome.driver=<path>).
    if (StringUtils.isBlank(System.getProperty("webdriver.chrome.driver"))) {
        log.warn("webdriver.chrome.driver is not set; relying on Selenium's own driver lookup");
    }
    while (true) {
        try {
            ChromeDriver driver = getChromeDriver();
            SetOperations<String, String> ops = redisTemplate.opsForSet();
            try {
                getUrls(driver, ops);
                parsePagePre(ops);
            } finally {
                // Short pause before tearing the browser down, as in the original flow.
                sleep(1000);
                driver.quit();
            }
            break;
        } catch (Exception e) {
            // Previously swallowed silently; log it so an endless retry loop is diagnosable.
            log.error("sync attempt failed, retrying in 10 seconds", e);
            Thread.sleep(10000);
        }
    }
    System.out.println("完成");
}
/**
 * Creates a new headless, incognito Chrome session for crawling.
 *
 * <p>The session loads pages with the EAGER strategy, disables images, routes traffic
 * through the next proxy from {@code proxyService} (unless that value is blank or the
 * literal {@code "local"}) and sends one randomly chosen User-Agent string.
 *
 * @return a newly started driver; the caller must call {@code quit()} on it
 */
private ChromeDriver getChromeDriver() {
    String nextProxy = proxyService.getNextProxy();
    System.out.println("当前ip是" + nextProxy);
    String[] userAgents = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"
    };

    ChromeOptions chromeOptions = new ChromeOptions();
    chromeOptions.setPageLoadStrategy(PageLoadStrategy.EAGER);
    chromeOptions.addArguments("--incognito");
    chromeOptions.addArguments("--blink-settings=imagesEnabled=false");
    chromeOptions.addArguments("--headless");
    chromeOptions.addArguments("--no-sandbox");
    chromeOptions.addArguments("--disable-gpu");
    if (StringUtils.isNotBlank(nextProxy) && !nextProxy.equals("local")) {
        chromeOptions.addArguments("--proxy-server=" + nextProxy);
    }

    // WebRTC preferences; presumably set so WebRTC cannot bypass the proxy - TODO confirm.
    Map<String, Object> prefs = new HashMap<>();
    prefs.put("webrtc.ip_handling_policy", "disable_non_proxied_udp");
    prefs.put("webrtc.multiple_routes_enabled", false);
    prefs.put("webrtc.nonproxied_udp_enabled", false);
    chromeOptions.setExperimentalOption("prefs", prefs);

    // Chrome's documented switch is "--user-agent=<value>"; the original passed
    // "User-Agent=<value>". The index is bounded by the array length instead of a literal 7,
    // so editing the list cannot cause an out-of-range pick or leave entries unused.
    String userAgent = userAgents[new Random().nextInt(userAgents.length)];
    chromeOptions.addArguments("--user-agent=" + userAgent);

    ChromeDriver driver = new ChromeDriver(chromeOptions);
    driver.manage().window().maximize();
    return driver;
}
/**
 * Schedules list-page parsing for the stored area URLs.
 *
 * <p>Loads every {@code BuildAreaUrlLj} row, takes the rows at indexes 1 (inclusive) to
 * 3500 (exclusive) and submits one {@code parsePage} task to the worker pool for each row
 * whose URL is not yet in the {@code SKIP_URLS} Redis set. The method returns as soon as
 * the tasks are submitted; it does not wait for them.
 *
 * @param ops Redis set operations used to check and later record finished area URLs
 */
private void parsePagePre(SetOperations ops) {
    HashOperations<String, Object, Object> opsForHash = redisTemplate.opsForHash();
    List<BuildAreaUrlLj> allAreas = buildAreaUrlLjMapper.selectList(null);

    // Clamp the window to the rows that exist: the fixed subList(1, 3500) threw
    // IndexOutOfBoundsException whenever fewer than 3500 rows were stored.
    // NOTE(review): index 0 is skipped exactly as before - confirm that this is intended.
    int rowCount = allAreas.size();
    List<BuildAreaUrlLj> window = allAreas.subList(Math.min(1, rowCount), Math.min(3500, rowCount));

    for (BuildAreaUrlLj area : window) {
        // Boolean.TRUE.equals avoids a NullPointerException if Redis returns null.
        if (Boolean.TRUE.equals(ops.isMember(SKIP_URLS, area.getAreaUrl()))) {
            System.out.println("跳过当前区域" + area.getCityName() + "-" + area.getCountyName());
            continue;
        }
        pagepoolExecutor.execute(() -> parsePage(ops, opsForHash, area));
    }
}
/**
 * Crawls all list pages of one area in its own browser session.
 *
 * <p>For every community link on every list page the detail page is opened in a new
 * window; {@code parseDetail} runs unless the community code already exists in the
 * {@code AREA_INFO_COMMUNITY_CODE_LJ} Redis hash. When the area is finished its URL is
 * added to {@code SKIP_URLS}. The browser is always quit at the end.
 *
 * @param ops            Redis set operations used to record the finished area URL
 * @param opsForHash     Redis hash operations used to look up known community codes
 * @param buildAreaUrlLj the area whose list pages are crawled
 * @throws RuntimeException if a count read from the page is not a valid number
 */
private void parsePage(SetOperations ops, HashOperations<String, Object, Object> opsForHash, BuildAreaUrlLj buildAreaUrlLj) {
    final String linkXpath = "//ul[@class='listContent']/li/div[1]/div[1]/a";
    // Compiled once per call instead of once per list entry.
    final Pattern codePattern = Pattern.compile("xiaoqu/(\\w+)/");

    ChromeDriver driver = getChromeDriver();
    try {
        driver.get(buildAreaUrlLj.getAreaUrl());
        String windowHandlePage = driver.getWindowHandle();
        WebElement totalNumStr = validElement("//h2[@class='total fl']/span", driver);
        if (null != totalNumStr) {
            int total = Integer.parseInt(totalNumStr.getText());
            // Has data.
            // NOTE(review): an area with exactly one community (total == 1) is skipped by
            // this condition - confirm whether "> 0" was intended.
            if (total > 1) {
                String pageData = driver.findElement(By.xpath("//div[@class='page-box house-lst-page-box']")).getAttribute("page-data");
                int pageCount = Integer.parseInt(JSON.parseObject(pageData).getString("totalPage"));
                System.out.println("当前区域页数" + pageCount + "---" + buildAreaUrlLj.getAreaUrl());
                for (int x = 1; x <= pageCount; x++) {
                    List<WebElement> elements = driver.findElements(By.xpath(linkXpath));
                    for (int i = 0; i < elements.size(); i++) {
                        WebElement item = elements.get(i);
                        String code = "";
                        Matcher matcher = codePattern.matcher(item.getAttribute("href"));
                        while (matcher.find()) {
                            code = matcher.group(1);
                        }
                        driver.executeScript("arguments[0].click();", item);
                        sleepAndCutoverNewPage(300, driver);
                        if (windowHandlePage.equals(driver.getWindowHandle())) {
                            // No separate detail window is active. Parsing would read the
                            // list page and close() would close the list window itself,
                            // breaking the rest of the crawl, so skip this entry.
                            log.warn("no detail window opened for community code {}", code);
                            continue;
                        }
                        // Skip detail parsing when the code is already known.
                        if (!Boolean.TRUE.equals(opsForHash.hasKey(AREA_INFO_COMMUNITY_CODE_LJ, code))) {
                            parseDetail(driver, code, buildAreaUrlLj, opsForHash);
                        } else {
                            System.out.println("当前code redis 存在" + code);
                            // TODO: update path for known communities is not implemented; the
                            // original only had a commented-out "new TkBuildingsMonthPriceLj()".
                        }
                        driver.close();
                        driver.switchTo().window(windowHandlePage);
                        sleep(200);
                        // Re-locate the links after returning to the list window.
                        elements = driver.findElements(By.xpath(linkXpath));
                    }
                    if (x != pageCount) {
                        String nextPage = buildAreaUrlLj.getAreaUrl() + "pg" + (x + 1) + "/";
                        driver.get(nextPage);
                        System.out.println("下一页是" + nextPage);
                        sleep(200);
                    }
                }
            }
        }
        ops.add(SKIP_URLS, buildAreaUrlLj.getAreaUrl());
    } catch (NumberFormatException e) {
        // Keep the original message but attach the cause so the stack trace is not lost.
        throw new RuntimeException("多线程发生异常" + e.getMessage(), e);
    } finally {
        driver.quit();
    }
}
/**
 * Reads the community detail page currently shown by {@code driver}.
 *
 * <p>If a unit price is present, a {@code TkBuildingsMonthPriceLj} is filled and printed.
 * A {@code TkBuildingsAreaInfolj} with the community attributes is always filled and
 * printed. Nothing is persisted by this method; both objects are only written to stdout.
 *
 * @param driver         browser positioned on the detail page
 * @param communityCode  community code taken from the detail URL
 * @param buildAreaUrlLj area the community belongs to (source of province/city/county ids)
 * @param opsForHash     Redis hash operations; not used by the current implementation
 */
private void parseDetail(ChromeDriver driver, String communityCode, BuildAreaUrlLj buildAreaUrlLj, HashOperations<String, Object, Object> opsForHash) {
    if (null != validElement("//span[@class='xiaoquUnitPrice']", driver)) {
        TkBuildingsMonthPriceLj lj = new TkBuildingsMonthPriceLj();
        lj.setCommunityCode(communityCode);
        String priceDesc = driver.findElement(By.className("xiaoquUnitPriceDesc")).getText();
        if (priceDesc.equals("挂牌均价")) {
            lj.setYearmonth(DateFormatUtils.format(new Date(), "yyyyMM"));
        } else {
            lj.setYearmonth(referenceYearMonth(priceDesc, LocalDate.now()));
        }
        lj.setAvgPrice(Integer.valueOf(driver.findElement(By.className("xiaoquUnitPrice")).getText()));
        lj.setGenerateType("0");
        lj.setCreateBy("1");
        lj.setCreateDate(new Date());
        lj.setUpdateBy("1");
        lj.setUpdateDate(new Date());
        lj.setDelFlag("0");
        System.out.println("持久化价格" + lj);
    }

    TkBuildingsAreaInfolj infolj = new TkBuildingsAreaInfolj();
    infolj.setDataOrigin("lianjia");
    infolj.setGenerateType("0");
    infolj.setProvince(buildAreaUrlLj.getProvinceId());
    infolj.setCity(buildAreaUrlLj.getCityId());
    infolj.setArea(buildAreaUrlLj.getCountyId());
    WebElement title = validElement("//h1[@class='detailTitle']", driver);
    infolj.setCommunity(title == null ? "" : title.getText());
    infolj.setCommunityCode(communityCode);
    infolj.setBuildingYear(labeledValue(driver, "建筑年代"));
    infolj.setBuildingType(labeledValue(driver, "建筑类型"));
    infolj.setManageCost(labeledValue(driver, "物业费用"));
    infolj.setManageCompany(labeledValue(driver, "物业公司"));
    infolj.setManageDevlop(labeledValue(driver, "开发商"));
    infolj.setBuildingCount(labeledValue(driver, "楼栋总数"));
    infolj.setRoomCount(labeledValue(driver, "房屋总数"));
    infolj.setCreateBy("1");
    infolj.setCreateDate(new Date());
    infolj.setUpdateBy("1");
    infolj.setUpdateDate(new Date());
    infolj.setDelFlag("0");
    System.out.println("持久化小区" + infolj);
}

/**
 * Converts a description such as "8月参考均价" into a yyyyMM string.
 *
 * <p>The page only states the month. A month later than the current one cannot belong to
 * the current year (e.g. "12月" read in January), so it is assigned to the previous year;
 * the original always used the current year.
 */
private static String referenceYearMonth(String priceDesc, LocalDate today) {
    int month = Integer.parseInt(priceDesc.replace("月参考均价", ""));
    int year = month > today.getMonthValue() ? today.getYear() - 1 : today.getYear();
    return year + String.format("%02d", month);
}

/** Returns the text of the span following the given label span, or "" when the label is absent. */
private static String labeledValue(ChromeDriver driver, String label) {
    String labelXpath = "//span[text()='" + label + "']";
    if (validElement(labelXpath, driver) == null) {
        return "";
    }
    return driver.findElement(By.xpath(labelXpath + "/parent::div/span[2]")).getText();
}
/**
 * Walks the city list and collects the community-list URLs of every city.
 *
 * <p>For each city not yet in the {@code URLS} Redis set, the "小区" page is opened and its
 * first-level links are clicked one by one. Second-level links, when present, are handed to
 * {@code saveDataCounty}; when a first-level entry has none, the first-level links
 * themselves are handed to {@code saveDataCity} and the city is finished. The city name is
 * then added to {@code URLS}.
 *
 * @param driver browser session used for navigation
 * @param ops    Redis set operations used to remember processed cities
 */
private void getUrls(ChromeDriver driver, SetOperations ops) {
    // NOTE(review): "/city/" is a relative path with no scheme or host; WebDriver normally
    // needs a fully qualified URL. The host appears to have been removed - restore it.
    final String cityListUrl = "/city/";
    final String cityLinkXpath = "//ul[@class='city_list_ul']/li/div[2]/div/ul/li/a";
    final String countyLinkXpath = "//div[@data-role='ershoufang']/div[1]/a";
    final String areaLinkXpath = "//div[@data-role='ershoufang']/div[2]/a";
    final String totalXpath = "//h2[@class='total fl']/span";

    driver.get(cityListUrl);
    int count = 0;
    List<WebElement> elements = driver.findElements(By.xpath(cityLinkXpath));
    for (int i = 0; i < elements.size(); i++) {
        WebElement element = elements.get(i);
        String provinceName = element.findElement(By.xpath("./parent::li/parent::ul/parent::div/div")).getText();
        String areaName = element.getText();
        // Boolean.TRUE.equals avoids a NullPointerException if Redis returns null.
        if (Boolean.TRUE.equals(ops.isMember(URLS, areaName))) {
            System.out.println("已跑过当前区域 跳过" + areaName);
            continue;
        }
        driver.executeScript("arguments[0].click();", element);
        String frontPage = driver.getWindowHandle();
        WebElement communityLink;
        try {
            communityLink = driver.findElement(By.linkText("小区"));
        } catch (Exception e) {
            // This city has no "小区" entry: mark it done and return to the city list.
            ops.add(URLS, areaName);
            sleep(200);
            System.out.println(areaName + " 没有小区====");
            driver.get(cityListUrl);
            elements = driver.findElements(By.xpath(cityLinkXpath));
            continue;
        }
        driver.executeScript("arguments[0].click();", communityLink);
        sleepAndCutoverNewPage(500, driver);
        List<WebElement> citys = driver.findElements(By.xpath(countyLinkXpath));
        citys.forEach(link -> System.out.println("市级============" + link.getText() + "==" + link.getAttribute("href")));
        for (int j = 0; j < citys.size(); j++) {
            String countyName = citys.get(j).getText();
            driver.executeScript("arguments[0].click();", citys.get(j));
            sleep(200);
            WebElement totalElement = validElement(totalXpath, driver);
            if (totalElement != null) {
                String text = totalElement.getText();
                count += Integer.parseInt(text);
                System.out.println(countyName + text + "个");
                System.out.println("当前总数是" + count);
            }
            // A failed lookup and an empty result were handled by two identical branches;
            // both now go through the single "no second-level links" path below.
            List<WebElement> areas;
            try {
                areas = driver.findElements(By.xpath(areaLinkXpath));
            } catch (Exception e) {
                areas = Collections.emptyList();
            }
            if (areas.isEmpty()) {
                citys = driver.findElements(By.xpath(countyLinkXpath));
                saveDataCity(countyName, areaName, provinceName, citys);
                break;
            }
            saveDataCounty(countyName, areaName, provinceName, areas);
            sleep(100);
            // Re-locate the first-level links after the click above.
            citys = driver.findElements(By.xpath(countyLinkXpath));
        }
        ops.add(URLS, areaName);
        // Only close a window that is not the city-list window; closing frontPage itself
        // would make the following switchTo fail.
        if (!frontPage.equals(driver.getWindowHandle())) {
            driver.close();
        }
        driver.switchTo().window(frontPage);
        driver.get(cityListUrl);
        sleep(200);
        elements = driver.findElements(By.xpath(cityLinkXpath));
    }
    System.out.println("总数是" + count);
}
/**
 * Builds one {@code BuildAreaUrlLj} per link in {@code list}, all under the same
 * province/city/county, and prints it. Nothing is persisted here; the record is only
 * written to stdout.
 *
 * @param countyName   name of the county the links belong to
 * @param areaName     name of the city
 * @param provinceName name of the province
 * @param list         links whose {@code href} becomes the record's area URL
 */
private void saveDataCounty(String countyName, String areaName, String provinceName, List<WebElement> list) {
    if (list == null || list.isEmpty()) {
        return;
    }
    // The three lookups depend only on the method arguments, so they run once
    // instead of once per link.
    IdAndNamePO provincepo = queryProvinceCityArea(1, provinceName, null);
    IdAndNamePO areapo = queryProvinceCityArea(2, areaName, provincepo.getBusinessId());
    IdAndNamePO countypo = queryProvinceCityArea(3, countyName, areapo.getBusinessId());
    for (WebElement element : list) {
        BuildAreaUrlLj buildAreaUrlLj = new BuildAreaUrlLj();
        buildAreaUrlLj.setProvinceName(provincepo.getBusinessName());
        buildAreaUrlLj.setProvinceId(provincepo.getBusinessId());
        buildAreaUrlLj.setCityName(areapo.getBusinessName());
        buildAreaUrlLj.setCityId(areapo.getBusinessId());
        buildAreaUrlLj.setCountyName(countypo.getBusinessName());
        buildAreaUrlLj.setCountyId(countypo.getBusinessId());
        buildAreaUrlLj.setAreaUrl(element.getAttribute("href"));
        buildAreaUrlLj.setCreateTime(new Date());
        buildAreaUrlLj.setUpdateTime(new Date());
        System.out.println("持久化链接" + buildAreaUrlLj);
    }
}
/**
 * Builds one {@code BuildAreaUrlLj} per first-level link of a city that has no
 * second-level links, and prints it. Nothing is persisted here.
 *
 * <p>The caller passes the complete list of first-level links, and each link's text is the
 * name used as the county elsewhere in the crawl. The county is therefore resolved from
 * each link's own text; previously every link was stored under the single
 * {@code countyName} argument, i.e. under the name of whichever entry was being visited.
 *
 * @param countyName   fallback county name, used only when a link has blank text
 * @param areaName     name of the city
 * @param provinceName name of the province
 * @param list         first-level links whose {@code href} becomes the record's area URL
 */
private void saveDataCity(String countyName, String areaName, String provinceName, List<WebElement> list) {
    if (list == null || list.isEmpty()) {
        return;
    }
    // Province and city are the same for every link, so look them up once.
    IdAndNamePO provincepo = queryProvinceCityArea(1, provinceName, null);
    IdAndNamePO areapo = queryProvinceCityArea(2, areaName, provincepo.getBusinessId());
    for (WebElement element : list) {
        String linkText = element.getText();
        String linkCounty = StringUtils.isNotBlank(linkText) ? linkText : countyName;
        IdAndNamePO countypo = queryProvinceCityArea(3, linkCounty, areapo.getBusinessId());

        BuildAreaUrlLj buildAreaUrlLj = new BuildAreaUrlLj();
        buildAreaUrlLj.setProvinceName(provinceName);
        buildAreaUrlLj.setProvinceId(provincepo.getBusinessId());
        buildAreaUrlLj.setCityName(areaName);
        buildAreaUrlLj.setCityId(areapo.getBusinessId());
        buildAreaUrlLj.setCountyName(countypo.getBusinessName());
        buildAreaUrlLj.setCountyId(countypo.getBusinessId());
        buildAreaUrlLj.setAreaUrl(element.getAttribute("href"));
        buildAreaUrlLj.setCreateTime(new Date());
        buildAreaUrlLj.setUpdateTime(new Date());
        System.out.println("持久化链接" + buildAreaUrlLj);
    }
}
/**
 * Resolves a province, city or county by name.
 *
 * <p>The actual lookups are commented out in this file, so the method currently always
 * returns a placeholder with id {@code "-1"} and the (possibly overridden) name.
 *
 * @param type         1 = province, 2 = city, 3 = district/county
 * @param businessName name to look up
 * @param parentId     id of the parent region; may be null or blank
 * @return the resolved entry, or a placeholder with id "-1" when nothing was found
 */
private IdAndNamePO queryProvinceCityArea(Integer type, String businessName, String parentId) {
    // Parent ids 50/11/31/12 force the name "市辖区"; presumably these are the codes of the
    // province-level municipalities - TODO confirm.
    boolean overrideName = StringUtils.isNotBlank(parentId)
            && Arrays.asList("50", "11", "31", "12").contains(parentId);
    String lookupName = overrideName ? "市辖区" : businessName;

    IdAndNamePO found = null;
    try {
        switch (type) {
            case 1:
                // found = (lookupName);
                break;
            case 2:
                // found = (lookupName, parentId);
                break;
            case 3:
                // found = (lookupName, parentId);
                break;
            default:
                break;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    if (found != null) {
        return found;
    }

    IdAndNamePO placeholder = new IdAndNamePO();
    placeholder.setBusinessId("-1");
    placeholder.setBusinessName(lookupName);
    return placeholder;
}
/**
 * Waits {@code millis} milliseconds, then switches the driver to every window handle that
 * is not listed in {@code pages}, in the order the driver reports them, so the last such
 * handle ends up active.
 *
 * <p>If the wait is interrupted, no window switch happens and the thread's interrupt flag
 * is restored (it used to be silently cleared).
 *
 * @param millis time to wait before switching, in milliseconds
 * @param driver the browser session whose active window is changed
 * @return always {@code null}
 */
private static String sleepAndCutoverNewPage(int millis, WebDriver driver) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        return null;
    }
    for (String handle : driver.getWindowHandles()) {
        if (!pages.contains(handle)) {
            driver.switchTo().window(handle);
        }
    }
    return null;
}
/**
 * Pauses the current thread for {@code millis} milliseconds.
 *
 * <p>If the pause is interrupted it ends early and the interrupt flag is restored, so
 * callers further up can still observe the interruption (it used to be swallowed).
 *
 * @param millis pause length in milliseconds
 */
private static void sleep(int millis) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
/**
 * Looks up a single element by XPath without throwing.
 *
 * @param str    XPath expression to evaluate
 * @param driver browser session to search in
 * @return the first matching element, or {@code null} if the lookup threw; a message
 *         naming the XPath is printed in that case
 */
public static WebElement validElement(String str, WebDriver driver) {
    WebElement found = null;
    try {
        found = driver.findElement(By.xpath(str));
    } catch (Exception e) {
        System.out.println("这个元素不存在" + str);
    }
    return found;
}
}