java爬虫系列第三讲-获取页面中绝对路径的各种方法

时间:2022-12-27 21:48:43

在使用webmagic的过程中,很多时候我们需要抓取链接的绝对路径,这里总结了几种方法,完整的示例代码放在最后。

以和讯网的一个页面为例:

java爬虫系列第三讲-获取页面中绝对路径的各种方法

xpath方式获取

// Select the div whose id is 'cyldata' with XPath, then log every link collected from it via links().
log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").links().all());
// Alternative: address the href attribute of the anchors directly in the XPath expression;
// the "abs:" prefix asks for the absolute form of the URL.
log.info("{}", page.getHtml().xpath("//div[@id='cyldata']//a//@abs:href").all());

xpath+css选择器方式获取

// Narrow down to the div with XPath, then use a CSS selector for its <a> elements
// and read their "abs:href" attribute (absolute form of href).
log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").css("a", "abs:href").all());

css选择器方式获取

// Variant 1: select the div, then its <a> elements, reading the "abs:href" attribute.
log.info("{}", page.getHtml().css("div[id='cyldata']").css("a", "abs:href").all());
// Variant 2: select the div and collect the links under it with links().
log.info("{}", page.getHtml().css("div[id='cyldata']").links().all());
// Variant 3: select the anchors inside the div with one CSS selector, then call links().
log.info("{}", page.getHtml().css("div[id='cyldata'] a").links().all());
// Variant 4: one CSS selector for the anchors plus the "abs:href" attribute name.
log.info("{}", page.getHtml().css("div[id='cyldata'] a", "abs:href").all());

jsoup方式获取

// Parse the raw page text with jsoup, passing the request URL as the base URI,
// and iterate over every <a> inside the element with id "cyldata".
for (Element element : Jsoup.parse(page.getRawText(), page.getRequest().getUrl()).select("#cyldata a")) {
// Two ways of reading the absolute URL: the "abs:" attribute prefix and absUrl().
log.info("{}", element.attr("abs:href"));
log.info("{}", element.absUrl("href"));
}

jsoup中StringUtil工具类方式获取

// Parse the raw page text with jsoup and iterate over every <a> inside the element with id "cyldata".
for (Element element : Jsoup.parse(page.getRawText(), page.getRequest().getUrl()).select("#cyldata a")) {
// Read the href exactly as written in the page and resolve it against the request URL
// with jsoup's StringUtil.resolve helper.
log.info("{}", StringUtil.resolve(page.getRequest().getUrl(), element.attr("href")));
}

示例代码

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.4.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <groupId>com.ady01</groupId>
    <artifactId>java-pachong</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>java-pachong</name>
    <description>java爬虫项目</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- webmagic start -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <!--
                Each artifact is excluded exactly once. fastjson and commons-io are
                declared with explicit versions further down in this file.
                NOTE(review): log4j and slf4j-log4j12 are presumably excluded to avoid
                clashing with the logging setup of the Spring Boot starter - confirm.
            -->
            <exclusions>
                <exclusion>
                    <artifactId>fastjson</artifactId>
                    <groupId>com.alibaba</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>log4j</artifactId>
                    <groupId>log4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>slf4j-log4j12</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-selenium</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.2.1</version>
        </dependency>
        <!-- webmagic end -->

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.49</version>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.11</version>
        </dependency>
        <dependency>
            <groupId>commons-collections</groupId>
            <artifactId>commons-collections</artifactId>
            <version>3.2.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
package com.ady01.demo3;

import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Demonstrates several ways of obtaining the absolute URLs of the hyperlinks on a
 * page when crawling with webmagic: XPath, XPath combined with CSS selectors, CSS
 * selectors only, plain jsoup, and jsoup's {@code StringUtil} helper.
 *
 * <p>Every variant targets the anchors inside the element whose id is
 * {@code cyldata} and writes what it finds to the log.
 *
 * <b>time</b>: 2019/4/22 10:42 <br>
 * <b>author</b>: WeChat official account "路人甲Java" (Java technology sharing)
 */
@Slf4j
public class AbsHrefPageProcessor implements PageProcessor {

    /** Page requested by {@link #main(String[])}. */
    private static final String START_URL = "http://industry.hexun.com/c193_59.shtml";

    /** Crawl settings handed to the spider through {@link #getSite()}; sleep time is set to 1000. */
    private final Site site = Site.me().setSleepTime(1000);

    /**
     * Logs the links found inside {@code #cyldata}, once per extraction technique.
     *
     * @param page the downloaded page supplied by the spider
     */
    @Override
    public void process(Page page) {
        // Ways of obtaining the absolute URL of hyperlinks.
        log.info("----------------------xpath方式获取------------------------");
        // XPath only.
        log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").links().all());
        log.info("{}", page.getHtml().xpath("//div[@id='cyldata']//a//@abs:href").all());

        // XPath to locate the container, CSS selector for the anchors.
        log.info("----------------------xpath+css选择器方式获取------------------------");
        log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").css("a", "abs:href").all());

        // CSS selectors only.
        log.info("----------------------css选择器方式获取------------------------");
        log.info("{}", page.getHtml().css("div[id='cyldata']").css("a", "abs:href").all());
        log.info("{}", page.getHtml().css("div[id='cyldata']").links().all());
        log.info("{}", page.getHtml().css("div[id='cyldata'] a").links().all());
        log.info("{}", page.getHtml().css("div[id='cyldata'] a", "abs:href").all());

        // Both jsoup variants below work on the same anchors, so the raw text is
        // parsed and queried once instead of once per loop.
        String requestUrl = page.getRequest().getUrl();
        Iterable<Element> anchors = Jsoup.parse(page.getRawText(), requestUrl).select("#cyldata a");

        // Plain jsoup: the "abs:" attribute prefix and absUrl().
        log.info("----------------------jsoup方式获取------------------------");
        for (Element anchor : anchors) {
            log.info("{}", anchor.attr("abs:href"));
            log.info("{}", anchor.absUrl("href"));
        }

        // jsoup's StringUtil: resolve the raw href against the request URL.
        log.info("----------------------jsoup中stringutil工具类方式获取------------------------");
        for (Element anchor : anchors) {
            log.info("{}", StringUtil.resolve(requestUrl, anchor.attr("href")));
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration created when this processor was instantiated
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs a spider with this processor against {@link #START_URL}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Request request = new Request(START_URL);
        Spider.create(new AbsHrefPageProcessor()).addRequest(request).run();
    }
}

执行结果:

java爬虫系列第三讲-获取页面中绝对路径的各种方法

java爬虫系列第三讲-获取页面中绝对路径的各种方法