I. Requirements Analysis
Analysis:
Data needed: title, summary, original article URL, publish time.
The crawled records are stored in a database.
II. Database Design
Fields to persist: title, summary, original article URL, publish time.
Article table t_bkyarticle: id (primary key), title, summary, detailurl (original article URL), pubtime (publish time), ctime (record creation time).
SQL script:

create database db_data1906;
use db_data1906;
create table t_bkyarticle(
    id int primary key auto_increment,
    title varchar(100),
    summary text,
    detailurl varchar(200),
    pubtime date,
    ctime date
);
III. Implementation
Tech stack: Spring Boot, with MyBatis-Plus for persistence, WebMagic for the crawler, and Swagger for the API documentation.
1. Create the project
Create a new Spring Boot project.
2. Add the dependencies
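The original only notes that dependency jars are needed. Below is a minimal sketch of the Maven coordinates implied by the code that follows; the version numbers are assumptions and should be checked against Maven Central:

<!-- pom.xml (sketch; versions are assumptions) -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-boot-starter</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
</dependency>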
3. Write the code layer by layer
Entity layer
@TableName("t_bkyarticle") @Data public class BkyArticle { @TableId(type = IdType.AUTO) private Integer id; private String title; private String summary; private String detailurl; private Date pubtime; private Date ctime; }
Persistence layer
public interface BkyArticleDao extends BaseMapper<BkyArticle> {
    // ctime is filled in by the database via now()
    @Insert("insert into t_bkyarticle(title,summary,detailurl,pubtime,ctime) " +
            "values(#{title},#{summary},#{detailurl},#{pubtime},now())")
    int save(BkyArticle article);
}
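For the mapper to be discovered, the application class needs mapper scanning, and the datasource must point at the database created above. A minimal sketch, assuming the com.feri.point package layout used later in the Swagger configuration; the class name and credentials are placeholders:

import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
@MapperScan("com.feri.point.dao") // assumption: the mapper interfaces live in this package
public class SpiderApplication {
    public static void main(String[] args) {
        SpringApplication.run(SpiderApplication.class, args);
    }
}

// src/main/resources/application.properties (sketch; adjust credentials to your setup):
// spring.datasource.url=jdbc:mysql://localhost:3306/db_data1906?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai
// spring.datasource.username=root
// spring.datasource.password=root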
Service layer
public interface BkyArticleService extends IService<BkyArticle> {
    boolean saveEntity(BkyArticle article);
}

@Service
public class BkyArticleServiceImpl extends ServiceImpl<BkyArticleDao, BkyArticle> implements BkyArticleService {
    @Override
    public boolean saveEntity(BkyArticle article) {
        // delegate to the custom insert, which sets ctime via now()
        return getBaseMapper().save(article) > 0;
    }
}
4. Write the crawler core
Custom page processor
@Service
public class BkyArticlePage implements PageProcessor {

    private String baseUrl = "https://www.cnblogs.com/";

    @Override
    public void process(Page page) {
        // 1. Parse the current page
        List<String> titles = page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/text()").all();
        List<String> urls = page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/h3/a/@href").all();
        List<String> infos = page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/p[@class='post_item_summary']/text()").all();
        List<String> times = page.getHtml().xpath("div[@id='post_list']/div[@class='post_item']/div[@class='post_item_body']/div[@class='post_item_foot']/a/text()").all();

        // 2. Assemble the parsed fields into entities
        List<BkyArticle> articles = new ArrayList<>();
        for (int i = 0; i < titles.size(); i++) {
            BkyArticle article = new BkyArticle();
            article.setTitle(titles.get(i));
            article.setSummary(infos.get(i));
            article.setDetailurl(urls.get(i));
            article.setPubtime(parseTime(getTimeStr(times.get(i))));
            articles.add(article);
        }

        // 3. Hand the results to the pipeline
        page.putField("list", articles);

        // 4. Pagination: on the first page, compute all page URLs and queue them for crawling
        if (page.getUrl().get().equals(baseUrl)) {
            List<String> pageurls = new ArrayList<>();
            List<String> allpages = page.getHtml().xpath("div[@id='paging_block']/div[@class='pager']/a/text()").all();
            int maxPage = Integer.parseInt(allpages.get(allpages.size() - 2));
            for (int i = 2; i <= maxPage; i++) {
                pageurls.add(baseUrl + "sitehome/p/" + i); // baseUrl already ends with '/'
            }
            page.addTargetRequests(pageurls);
        }
    }

    // The publish time follows the first space in the foot text; keep that part
    private String getTimeStr(String s) {
        String s1 = s.trim();
        if (s1.indexOf(' ') > 0) {
            return s1.substring(s1.indexOf(' ') + 1);
        } else {
            return null;
        }
    }

    private Date parseTime(String time) {
        if (time != null) {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
            try {
                return sdf.parse(time);
            } catch (ParseException e) {
                e.printStackTrace();
                return new Date();
            }
        } else {
            return new Date();
        }
    }

    private Site site = Site.me().setTimeOut(6000).setSleepTime(2000);

    @Override
    public Site getSite() {
        return site;
    }
}
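Before wiring in the database, the processor can be sanity-checked on its own. A minimal sketch using WebMagic's built-in ConsolePipeline, which prints the extracted fields instead of persisting them (the main class is an assumption, not part of the original project):

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

public class PageProcessorDryRun {
    public static void main(String[] args) {
        // Dry run: print parsed results to stdout instead of saving them
        Spider.create(new BkyArticlePage())
              .addPipeline(new ConsolePipeline())
              .addUrl("https://www.cnblogs.com/")
              .thread(1)
              .run();
    }
}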
Result pipeline
@Repository
public class BkyArticPipeline implements Pipeline {

    @Autowired
    private BkyArticleDao bkyArticleDao;

    @Override
    public void process(ResultItems resultItems, Task task) {
        List<BkyArticle> articleList = resultItems.get("list");
        System.out.println("Crawled records: " + articleList.size());
        for (BkyArticle a : articleList) {
            bkyArticleDao.save(a);
        }
    }
}
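One caveat: the pipeline inserts every record it receives, so re-running the spider stores duplicates. A possible guard (my addition, not part of the original design) is a unique index on detailurl, after which repeated inserts of the same article are rejected:

-- optional: prevent duplicate articles on repeated crawls
-- the 191-char prefix keeps the key under the 767-byte limit of older InnoDB/utf8mb4 setups
alter table t_bkyarticle add unique key uk_detailurl (detailurl(191));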
5. Write the trigger endpoint
Controller that starts the crawl
@Api
@RestController
public class BkyArticController {

    @Autowired
    private BkyArticleService bkyArticleService;
    @Autowired
    private BkyArticlePage page;
    @Autowired
    private BkyArticPipeline pipeline;

    // Start the crawler
    @GetMapping("/api/spider/start.do")
    public R start() {
        Spider.create(page).addPipeline(pipeline).addUrl("https://www.cnblogs.com/").thread(5).runAsync();
        return R.ok("Crawl started");
    }

    // Query the crawled data
    @GetMapping("/api/bkyartic/all.do")
    public R all() {
        return R.ok(bkyArticleService.list());
    }
}
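A note on the design: Spider.create(...) builds a fresh crawler for each request, and runAsync() (swapped in here for the blocking run() of the original listing, which would hold the HTTP request open until the entire crawl finished) starts it on a background thread, so "Crawl started" is returned immediately. Repeated calls to start.do therefore launch independent, concurrent crawls.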
6. Configure Swagger
@Configuration
@EnableSwagger2 // required to activate springfox; missing in the original listing
public class SwaggerConfig {

    // API document metadata
    public ApiInfo createAI() {
        return new ApiInfoBuilder()
                .title("Article API")
                .description("A data API built on a web crawler")
                .contact(new Contact("Feri", "http://www.17feri.top", "[email protected]"))
                .build();
    }

    // Swagger scanning configuration
    @Bean
    public Docket createD() {
        return new Docket(DocumentationType.SWAGGER_2)
                .apiInfo(createAI())
                .select()
                .apis(RequestHandlerSelectors.basePackage("com.feri.point.controller"))
                .build();
    }
}
7. Run and test
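Start the application and open the Swagger UI (for springfox 2.x this is typically http://localhost:8080/swagger-ui.html, assuming the default port). Invoke GET /api/spider/start.do to kick off the crawl; once it has run for a while, GET /api/bkyartic/all.do should return the articles stored in t_bkyarticle.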