A while ago I needed to scrape some information from web pages. Having no prior experience with crawlers, I looked into webmagic and wrote a simple crawler with it.
1. First, a quick introduction to webmagic:
webmagic uses a fully modular design that covers the whole crawler life cycle (link extraction, page downloading, content extraction, persistence). It supports multi-threaded and distributed crawling, automatic retries, custom UA/cookie settings, and more.
Design concept:
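For reference, webmagic's design (inspired by Scrapy) centers on four components: the Downloader fetches pages, the PageProcessor parses and extracts content, the Scheduler manages the URL queue and de-duplication, and the Pipeline persists results. The examples below implement a PageProcessor and handle persistence by hand rather than through a Pipeline.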
Maven dependencies:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
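One thing worth noting: the JDBC example below also needs a MySQL driver on the classpath, which the original dependency list does not show. A minimal sketch, assuming a MySQL 5.x server and the legacy com.mysql.jdbc.Driver class (adjust the version to your setup):

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>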
JDBC approach:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {

    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            // Load the MySQL driver (this is what can throw ClassNotFoundException)
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?);";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
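The insert above assumes a matching table. A possible schema, reconstructed only from the column list in the SQL (the actual column types are not shown in the original, so treat this as an assumption):

CREATE TABLE `test`.`csdnblog` (
  `keyes`     INT,          -- post ID extracted from the URL
  `titles`    VARCHAR(255), -- post title
  `content`   TEXT,         -- body text
  `dates`     VARCHAR(50),  -- publication date as displayed on the page
  `tags`      VARCHAR(255), -- comma-separated tags
  `category`  VARCHAR(255), -- comma-separated categories
  `views`     INT,          -- view count
  `comments`  INT,          -- comment count
  `copyright` INT           -- 1 = original post, 0 = repost
) DEFAULT CHARSET = utf8;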
Entity class:
public class CsdnBlog {

    private int key; // post ID
    private String title; // title
    private String dates; // date
    private String tags; // tags
    private String category; // categories
    private int view; // view count
    private int comments; // comment count
    private int copyright; // 1 if the post is original, 0 otherwise
    private String content; // body text

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getKey() {
        return key;
    }

    public void setKey(int key) {
        this.key = key;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDates() {
        return dates;
    }

    public void setDates(String dates) {
        this.dates = dates;
    }

    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public int getView() {
        return view;
    }

    public void setView(int view) {
        this.view = view;
    }

    public int getComments() {
        return comments;
    }

    public void setComments(int comments) {
        this.comments = comments;
    }

    public int getCopyright() {
        return copyright;
    }

    public void setCopyright(int copyright) {
        this.copyright = copyright;
    }

    @Override
    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content + ", dates=" + dates
                + ", tags=" + tags + ", category=" + category + ", view=" + view + ", comments=" + comments
                + ", copyright=" + copyright + "]";
    }
}
Main class:
public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // CSDN username to crawl
    private static int size = 0; // total number of articles crawled

    // Site-level crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core extension point for custom crawler logic; extraction goes here
    public void process(Page page) {
        // List page
        if (!page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/\\d+").match()) {
            // Queue all article pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links() // restrict to the article list area
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // use replace to turn relative URLs into absolute ones
                    .all());
            // Queue the other list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links() // restrict to the pagination area
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // same relative-to-absolute trick
                    .all());
        // Article page
        } else {
            size++; // one more article
            // Store the extracted data in a CsdnBlog object so it can go straight into the database
            CsdnBlog csdnBlog = new CsdnBlog();
            // ID
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/(\\d+)").get()));
            // Title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
            // Content
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());
            // Date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
            // Tags (there may be several, joined with commas)
            csdnBlog.setTags(listToString(page.getHtml().xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
            // Categories (there may be several, joined with commas)
            csdnBlog.setCategory(listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
            // View count (the page shows it as "N人阅读")
            csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
                    .regex("(\\d+)人阅读").get()));
            // Comment count (shown in parentheses)
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
            // Original post or not (CSDN marks originals with the 'bog_copyright' class)
            csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
            // Save the object to the database
            new CsdnBlogDao().add(csdnBlog);
            // Print the object to the console
            System.out.println(csdnBlog);
        }
    }

    // Join a List into a comma-separated String (String.join(",", stringList) would do the same on Java 8+)
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[Crawler started]...");
        startTime = System.currentTimeMillis();
        // Start from the user's blog home page, with 5 threads
        Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[Crawler finished] Crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; results saved to the database.");
    }
}
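A side note: calling the DAO directly inside process() works, but webmagic's idiomatic home for persistence is a Pipeline. A minimal sketch, assuming process() stores the entity via page.putField("blog", csdnBlog) (that key and the class name are my own, not from the original post):

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class CsdnBlogPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Read whatever the PageProcessor stored with page.putField("blog", csdnBlog)
        CsdnBlog blog = resultItems.get("blog");
        if (blog != null) {
            new CsdnBlogDao().add(blog);
        }
    }
}

It would be wired in with Spider.create(new CsdnBlogPageProcessor()).addPipeline(new CsdnBlogPipeline())..., which keeps extraction and persistence cleanly separated.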
Using the MySQL approach (with Spring Boot):
public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // Site-level crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Boot the Spring context first so the services can be fetched from it
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
    }

    // process() is the core extension point for custom crawler logic; extraction goes here
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("网址")) {
            // titles
            List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all(); // href of each <a> tag
            // dates
            List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // photos
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();
            for (int i = 0; i < 5; i++) {
                // A fresh entity per iteration, so each insert gets its own object
                DianjingVideo dv = new DianjingVideo();
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
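Since the Spider.create(...) line in main() is commented out in the original, one way to actually launch the crawl once the context is up is a CommandLineRunner bean. A hedged sketch (the runner class is hypothetical, not part of the original post; "网址" is the original's URL placeholder):

import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

@Component
public class GameCrawlerRunner implements CommandLineRunner {

    @Override
    public void run(String... args) {
        // Kick off the crawl after Spring Boot has finished starting
        Spider.create(new GamePageProcessor())
                .addUrl("网址") // placeholder URL from the original
                .thread(5)
                .run();
    }
}

Note that the static DianjingService field would still need to be populated, for example by injecting the service into this runner and assigning it before the crawl starts.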
Controller:
@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /*
     * Mobile games
     */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<Dianjing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
The entity class is omitted here.
DAO layer:
@Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
int addDj(Dianjing dj);
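For context, this method would live in a MyBatis mapper interface, roughly as follows (the interface name and @Mapper wiring are assumptions, since the original only shows the method):

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

@Mapper
public interface DianjingDao {

    // Column names match the entity's fields (titles, dates, category, hrefs, photo, sources)
    @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
            + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
    int addDj(Dianjing dj);
}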
That's everything for this walkthrough of building a Java crawler with Spring Boot + webmagic, using both plain JDBC and MySQL via MyBatis. I hope it serves as a useful reference.
Original article: https://www.cnblogs.com/NCL--/p/8608336.html