package cn.taneroom.webmagic.demo.processor;
import java.util.List;
import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* 腾讯NBA数据爬取
* @author TANZHEN553
*/
public class TencentNbaPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
private final String LIST_URL = "http://tags\\.open\\.qq\\.com/interface/tag/articles\\.php\\?callback=tagListCb&p=\\d+&l=\\d+&tag=NBA&oe=gbk&ie=utf-8&source=web&site=sports&_=\\d+";
private final String DETAIL_URL = "http://sports\\.qq\\.com/a/\\d+/\\d+\\.htm";
public Site getSite() {
return site;
}
public void process(Page page) {
if (page.getUrl().regex(LIST_URL).match()) {
String rawText = page.getRawText();
rawText = rawText.substring("tagListCb(".length(), rawText.length() - 1);
List<String> detailUrls = new JsonPathSelector("$.data.articles[*].url").selectList(rawText);
if (CollectionUtils.isNotEmpty(detailUrls)) {
for (String detailUrl : detailUrls) {
Request request = createGetRequest(detailUrl);
page.addTargetRequest(request);
}
}
} else if (page.getUrl().regex(DETAIL_URL).match()) {
String title = page.getHtml().xpath("//div[@class='qq_article']/div[@class='hd']/h1/text()").get();
String content = page.getHtml().xpath("//div[@id='Cnt-Main-Article-QQ']").get();
if (content.length() > 1000) {
content = content.substring(0, 1000)+"......";
}
System.out.println("标题:\n"+title);
System.out.println("内容:\n"+content);
}
}
/**
* 创建GET请求的Request对象
* @param url
* @return
*/
private Request createGetRequest(String url){
Request request = new Request(url);
request.setMethod(HttpConstant.Method.GET);
return request;
}
public static void main(String[] args) {
int start = 1;
int end = 20;
Spider.create(new TencentNbaPageProcessor()).addUrl("http://tags.open.qq.com/interface/tag/articles.php?callback=tagListCb&p="+start+"&l="+end+"&tag=NBA&oe=gbk&ie=utf-8&source=web&site=sports&_="+System.currentTimeMillis()).thread(3).run();
}
}