Java文章抓取

时间:2023-02-18 20:59:16
/**
 * Downloads the page at {@code grabUrl} with jsoup and extracts the article's
 * title, author, source name/link, body HTML and a one-paragraph summary.
 *
 * <p>Every selector lookup is null-checked: when the page does not contain the
 * expected element, the corresponding field is left {@code null} instead of
 * aborting the request with a {@link NullPointerException}.
 *
 * <p>If the download fails with an {@link IOException}, the stack trace is printed
 * and an unpopulated {@code SpiderResultVO} is returned.
 *
 * @param grabUrl address of the article page to fetch
 * @return {@code R.data(...)} wrapping the (possibly partially filled) result object
 */
@PostMapping("/grab")
@ApiOperationSupport(order = 9)
@ApiOperation(value = "抓取", notes = "传入grabUrl")
public R grabe(@ApiParam(value = "抓取", required = true) @RequestParam String grabUrl) {
    SpiderResultVO spiderResultVO = new SpiderResultVO();
    System.out.println("==================" + grabUrl);
    try {
        // NOTE(review): grabUrl comes straight from the request and is fetched by the
        // server (SSRF exposure). Consider restricting it to an allow-list of hosts.
        Document document = Jsoup.connect(grabUrl).get();

        String title = grabTextOrNull(document.selectFirst("h1"));
        System.out.println("抓取之后的标题:" + title);
        String author = grabTextOrNull(document.selectFirst(".info>span>span:eq(3)"));
        System.out.println("抓取之后的作者:" + author);

        // Publication date: last child of the first ".info>span" element.
        // The parsed value is only printed; it is not stored on the result object.
        Element infoSpan = document.select(".info>span").first();
        Element createTimeElement = infoSpan == null ? null : infoSpan.children().last();
        String createTimeText = grabTextOrNull(createTimeElement);
        Date createTime = (createTimeText == null || createTimeText.isEmpty())
                ? null
                : DateUtil.parse(createTimeText, "yyyy-MM-dd");
        System.out.println("抓取之后的时间:" + createTime);

        // Source: link text and href of the anchor in the second info span.
        Element sourceElement = document.selectFirst(".info>span>span:eq(1) a");
        String sourceName = sourceElement == null ? null : sourceElement.text();
        String sourceUrl = sourceElement == null ? null : sourceElement.attr("href");
        System.out.println("抓取之后的來源:" + sourceName);
        System.out.println("抓取之后的路径:" + sourceUrl);

        // Body: strip unwanted nodes, then serialise what is left.
        String content = null;
        String summary = null;
        Element articleBody = document.getElementById("articleBody");
        if (articleBody != null) {
            Elements children = articleBody.children();
            children.select("p[style*=float]").remove();
            children.select("div.o.cl.ptm.pbm").remove();
            children.select("div.arc_copyright").remove();
            children.select(" dl.navi").remove();
            content = articleBody.html();

            // Summary: text of the first remaining child. The children are re-read
            // here because the list captured above was taken before the removals and
            // may still start with a node that was just stripped from the body.
            summary = grabTextOrNull(articleBody.children().first());
        }
        System.out.println("抓取之后的正文:" + content);

        spiderResultVO.setTitle(title);
        spiderResultVO.setAuthor(author);
        spiderResultVO.setSourceName(sourceName);
        spiderResultVO.setSourceUrl(sourceUrl);
        spiderResultVO.setContent(content);
        spiderResultVO.setSummary(summary);
    } catch (IOException e) {
        // NOTE(review): the caller still receives R.data with an empty result here;
        // consider returning an explicit failure and using the project's logger.
        e.printStackTrace();
    }
    return R.data(spiderResultVO);
}

/** Returns the text of {@code element}, or {@code null} when no element was matched. */
private String grabTextOrNull(Element element) {
    return element == null ? null : element.text();
}