1. maven pom.xml 配置:
<!-- WebMagic core: the crawler engine (Spider, Page, Site, PageProcessor). -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- WebMagic extension: ready-made downloaders, pipelines and helpers. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
2. 核心webmagic Processor爬虫业务逻辑处理类:
public class XXXXProcessor implements PageProcessor{
//设置爬虫参数
private String baseUrl = "";
private String spiderUrl = "";
private String videoUrlTag = "";
private String videoTitleTag = "";
private String downloadVideoPath = "";
public VideoProcessor(String baseUrl,
String spiderUrl,
String videoUrlTag,
String videoTitleTag,
String downloadVideoPath){
this.baseUrl = baseUrl;
this.spiderUrl = spiderUrl;
this.videoUrlTag = videoUrlTag;
this.videoTitleTag = videoTitleTag;
this.downloadVideoPath = downloadVideoPath;
}
// 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
private int spiderIntervalSecond = 2;
private int retryTimes= 3;
private Site site = Site.me().setRetryTimes(retryTimes).setSleepTime(spiderIntervalSecond)
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31")
.setCharset("UTF-8");
// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
public void process(Page page) {
page.setCharset("utf-8");
//从页面发现后续的url地址来抓取
page.addTargetRequests(page.getHtml().links().regex(spiderUrl).all());
if(page.getUrl().regex(spiderUrl).match()) {
String videoUrl = page.getHtml().xpath(videoUrlTag).toString();
if(StringUtils.isNotBlank(videoUrl)){
if(!videoUrl.contains("http://")){
videoUrl = baseUrl + videoUrl;
}
String videoFormat =videoUrl.substring(videoUrl.lastIndexOf("."),videoUrl.length());
String title = page.getHtml().xpath(videoTitleTag).toString();
File destFile = new File(downloadVideoPath + title + videoFormat);
UrlFileDownloadUtil.downloadVideo(videoUrl, destFile);
}
}
}
public Site getSite() {
return site;
}
}
3. 下载视频工具类.
/**
 * Utility for downloading a remote file (e.g. a video) to the local disk.
 *
 * <p>Fixes over the original: the streams were closed outside any
 * {@code finally} block, so an exception during the copy leaked both the
 * {@link InputStream} and the {@link FileOutputStream}; the {@code null}
 * checks came after the variables had already been dereferenced and were
 * dead code. try-with-resources closes both streams on every path.
 */
public class UrlFileDownloadUtil {

    private UrlFileDownloadUtil() {
        // Utility class: no instances.
    }

    /**
     * Downloads the resource at {@code url} and writes it to {@code destFile},
     * overwriting any existing file. Errors are printed rather than propagated,
     * matching the original best-effort contract.
     *
     * @param url      absolute URL of the file to fetch
     * @param destFile local file the content is written to
     */
    public static void downloadVideo(String url, File destFile) {
        try (InputStream is = new URL(url).openStream();
             OutputStream fos = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = is.read(buffer)) != -1) {
                fos.write(buffer, 0, len);
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so one catch covers both.
            e.printStackTrace();
        }
    }
}
4. 开始测试。
/**
 * Demo entry point: crawls the ctdsb.net video listing, following detail
 * pages that match the regex, and saves each video under E:/ctdsb/.
 * Uses one crawler thread per CPU core plus one.
 */
public static void main(String[] args) throws JMException {
    String startUrl = "http://www.ctdsb.net/#sp";
    String baseUrl = "";
    String spiderUrl = "http://www.ctdsb.net/html/2020/0412/videos\\w+\\.html";
    String videoUrlTag = "//video/@src";
    String videoTitleTag = "//div[@class='page-header']/h3/text()";
    String downloadPath = "E:/ctdsb/";

    int threadCount = CpuNumUtils.getCpuNum() + 1;
    VideoProcessor processor =
            new VideoProcessor(baseUrl, spiderUrl, videoUrlTag, videoTitleTag, downloadPath);

    Spider.create(processor)
            .addUrl(startUrl)
            .setDownloader(new HttpClientDownloader())
            .thread(threadCount)
            .run();
}
测试成功如下图: