webmagic爬取腾讯nba数据

时间:2022-11-29 16:57:27
package cn.taneroom.webmagic.demo.processor;

import java.util.List;

import org.apache.commons.collections.CollectionUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.utils.HttpConstant;

/**
* 腾讯NBA数据爬取
* @author TANZHEN553
*/

public class TencentNbaPageProcessor implements PageProcessor {

// 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

private final String LIST_URL = "http://tags\\.open\\.qq\\.com/interface/tag/articles\\.php\\?callback=tagListCb&p=\\d+&l=\\d+&tag=NBA&oe=gbk&ie=utf-8&source=web&site=sports&_=\\d+";
private final String DETAIL_URL = "http://sports\\.qq\\.com/a/\\d+/\\d+\\.htm";

public Site getSite() {
return site;
}

public void process(Page page) {
if (page.getUrl().regex(LIST_URL).match()) { // 匹配列表页
// 获取列表的jsonp数据,并解析得到对应的详情url
String rawText = page.getRawText();
rawText = rawText.substring("tagListCb(".length(), rawText.length() - 1);
List<String> detailUrls = new JsonPathSelector("$.data.articles[*].url").selectList(rawText);
if (CollectionUtils.isNotEmpty(detailUrls)) {
for (String detailUrl : detailUrls) {
//构造get请求
Request request = createGetRequest(detailUrl);
//添加Request对象到URL请求队列
page.addTargetRequest(request);
}
}
} else if (page.getUrl().regex(DETAIL_URL).match()) { // 匹配详情页
String title = page.getHtml().xpath("//div[@class='qq_article']/div[@class='hd']/h1/text()").get();
String content = page.getHtml().xpath("//div[@id='Cnt-Main-Article-QQ']").get();
if (content.length() > 1000) {
content = content.substring(0, 1000)+"......";
}
System.out.println("标题:\n"+title);
System.out.println("内容:\n"+content);
}
}

/**
* 创建GET请求的Request对象
* @param url
* @return
*/

private Request createGetRequest(String url){
//构造Request请求对象
Request request = new Request(url);
request.setMethod(HttpConstant.Method.GET);
return request;
}

public static void main(String[] args) {
int start = 1;
int end = 20;
Spider.create(new TencentNbaPageProcessor()).addUrl("http://tags.open.qq.com/interface/tag/articles.php?callback=tagListCb&p="+start+"&l="+end+"&tag=NBA&oe=gbk&ie=utf-8&source=web&site=sports&_="+System.currentTimeMillis()).thread(3).run();
}

}