只使用了一个 Java 文件,运行 main 方法即可。需要依赖的 jar 包是 com.alibaba.fastjson(版本 1.2.28)和 jsoup(版本 1.10.2)。
如果使用 Maven(pom.xml),对应的就是以下两个依赖:
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
完整的代码如下:
package com.tuniu.fcm.facade.ipproxy;
import com.alibaba.fastjson.JSONObject;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Crawls free-proxy listing pages and collects the proxies that pass a live
 * connectivity check.
 *
 * <p>Requires com.alibaba.fastjson (JSONObject) and jsoup on the classpath.
 * Per-call state (wanted count and result list) is held in thread-locals, so
 * each calling thread works on its own values.
 *
 * <p>NOTE(review): the all-lowercase identifiers and header values in this
 * listing look like an extraction artifact of the article; declared names are
 * kept as-is here so the sibling classes in this file still link.
 */
public class proxycralwerunusedvpn {
    /** Matches "ip port" pairs in page text: a dotted quad, one space, 1-6 digits. */
    private static final Pattern IP_PORT_PATTERN =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}");

    ThreadLocal<Integer> localwantednumber = new ThreadLocal<Integer>();
    ThreadLocal<List<proxyinfo>> localproxyinfos = new ThreadLocal<List<proxyinfo>>();

    public static void main(String[] args) {
        proxycralwerunusedvpn proxycrawler = new proxycralwerunusedvpn();
        // Number of proxies to collect, chosen by the caller.
        // Asking for many proxies makes the call return more slowly.
        proxycrawler.startcrawler(1);
    }

    /**
     * Entry point exposed to other modules.
     *
     * <p>Visits the configured listing sites in order until enough working
     * proxies have been found, prints the JSON result to standard output and
     * returns it.
     *
     * @param wantednumber number of proxies the caller would like to obtain
     * @return the response serialized as a JSON string
     */
    public String startcrawler(int wantednumber) {
        localwantednumber.set(wantednumber);
        // Start from an empty list so results left over from an earlier call on
        // this thread are neither counted nor returned again.
        List<proxyinfo> found = new ArrayList<proxyinfo>();
        localproxyinfos.set(found);
        try {
            kuaidailicom("http://www.xicidaili.com/nn/", 15);
            kuaidailicom("http://www.xicidaili.com/nt/", 15);
            kuaidailicom("http://www.xicidaili.com/wt/", 15);
            kuaidailicom("http://www.kuaidaili.com/free/inha/", 15);
            kuaidailicom("http://www.kuaidaili.com/free/intr/", 15);
            kuaidailicom("http://www.kuaidaili.com/free/outtr/", 15);

            // Build the response payload.
            proxyresponse response = new proxyresponse();
            response.setsuccess("true");
            Map<String, Object> datainfomap = new HashMap<String, Object>();
            datainfomap.put("numfound", found.size());
            datainfomap.put("pagenum", 1);
            datainfomap.put("proxy", found);
            response.setdata(datainfomap);

            String responsestring = JSONObject.toJSON(response).toString();
            System.out.println(responsestring);
            return responsestring;
        } finally {
            // Drop the per-thread state so reused (pooled) threads do not retain it.
            localwantednumber.remove();
            localproxyinfos.remove();
        }
    }

    /**
     * Fetches pages {@code baseurl + 1 + "/"} through {@code baseurl + totalpage + "/"},
     * extracts every "ip port" pair from the page text and keeps those that
     * pass {@link #checkproxy}. Stops as soon as the wanted count is reached.
     *
     * @param baseurl   listing URL prefix; the page number and "/" are appended
     * @param totalpage number of pages to visit, inclusive
     */
    private void kuaidailicom(String baseurl, int totalpage) {
        // "<=" so the last page is visited too; "<" stopped one page short of totalpage.
        for (int page = 1; page <= totalpage; page++) {
            if (getcurrentproxynumber() >= localwantednumber.get()) {
                return;
            }
            String pageUrl = baseurl + page + "/";
            try {
                // No explicit Host header: the previous hard-coded kuaidaili value was
                // wrong for the xicidaili URLs, and the HTTP client derives Host from
                // the request URL on its own.
                Document doc = Jsoup.connect(pageUrl)
                        .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("accept-encoding", "gzip, deflate, sdch")
                        .header("accept-language", "zh-cn,zh;q=0.8,en;q=0.6")
                        .header("cache-control", "max-age=0")
                        .header("user-agent", "mozilla/5.0 (macintosh; intel mac os x 10_11_4) applewebkit/537.36 (khtml, like gecko) chrome/51.0.2704.103 safari/537.36")
                        .header("cookie", "hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=ga1.2.1061361785.1462812244")
                        .header("referer", "http://www.kuaidaili.com/free/outha/")
                        .timeout(30 * 1000)
                        .get();
                Matcher m = IP_PORT_PATTERN.matcher(doc.text());
                while (m.find()) {
                    if (getcurrentproxynumber() >= localwantednumber.get()) {
                        break;
                    }
                    String[] parts = m.group().split(" ");
                    if (checkproxy(parts[0], Integer.parseInt(parts[1]))) {
                        System.out.println("获取到可用代理ip\t" + parts[0] + "\t" + parts[1]);
                        addproxy(parts[0], parts[1], "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: a page that cannot be fetched or parsed is reported
                // and skipped so the remaining pages are still tried.
                System.err.println("Failed to crawl " + pageUrl + ": " + e);
            }
        }
    }

    /**
     * Reports whether a test page can be fetched through the given proxy
     * within two seconds.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request succeeded, {@code false} on any failure
     */
    private static boolean checkproxy(String ip, Integer port) {
        try {
            // http://1212.ip138.com/ic.asp can be swapped for any reasonably fast page.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception ignored) {
            // Any failure (timeout, refused connection, invalid port) means "unusable".
            return false;
        }
    }

    /** Returns this thread's result list, creating and registering it if absent. */
    private List<proxyinfo> proxyList() {
        List<proxyinfo> proxyinfos = localproxyinfos.get();
        if (proxyinfos == null) {
            proxyinfos = new ArrayList<proxyinfo>();
            localproxyinfos.set(proxyinfos);
        }
        return proxyinfos;
    }

    /** Returns how many proxies the current thread has collected so far. */
    private int getcurrentproxynumber() {
        return proxyList().size();
    }

    /**
     * Appends a proxy to the current thread's result list.
     *
     * <p>Goes through {@link #proxyList()} so a freshly created list is stored
     * in the thread-local; previously a list created here was discarded.
     */
    private void addproxy(String ip, String port, String protocol) {
        proxyList().add(new proxyinfo(ip, port, protocol));
    }
}
/**
 * One proxy entry: address, port and protocol type, plus optional credentials.
 *
 * <p>NOTE(review): the all-lowercase accessor names look like an extraction
 * artifact of the article. JSON bean serializers generally discover properties
 * through JavaBean-style {@code getXxx} accessors, so confirm that the
 * serialized output still contains these fields.
 */
class proxyinfo {
    /** User name for the proxy; empty by default. */
    private String username = "";
    /** Proxy host address. */
    private String ip;
    /** Password for the proxy; empty by default. */
    private String password = "";
    /** Protocol type as passed to the constructor (the crawler passes "http"). */
    private String type;
    /** Proxy port, kept as text. */
    private String port;
    /** Defaults to 1; the meaning of the flag is not visible in this file. */
    private int is_internet = 1;

    /**
     * @param ip   proxy host address
     * @param port proxy port, as text
     * @param type protocol type
     */
    public proxyinfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getusername() {
        return username;
    }

    public void setusername(String username) {
        this.username = username;
    }

    public String getip() {
        return ip;
    }

    public void setip(String ip) {
        this.ip = ip;
    }

    public String getpassword() {
        return password;
    }

    public void setpassword(String password) {
        this.password = password;
    }

    public String gettype() {
        return type;
    }

    public void settype(String type) {
        this.type = type;
    }

    public String getport() {
        return port;
    }

    public void setport(String port) {
        this.port = port;
    }

    public int getis_internet() {
        return is_internet;
    }

    public void setis_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}
/**
 * Response envelope returned by the crawler: a success flag plus a data map.
 *
 * <p>NOTE(review): the all-lowercase accessor names look like an extraction
 * artifact of the article. JSON bean serializers generally discover properties
 * through JavaBean-style {@code getXxx} accessors, so confirm that the
 * serialized output still contains these fields.
 */
class proxyresponse {
    /** Success flag, kept as text (the crawler sets "true"). */
    private String success;
    /** Payload entries keyed by name. */
    private Map<String, Object> data;

    public String getsuccess() {
        return success;
    }

    public void setsuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getdata() {
        return data;
    }

    public void setdata(Map<String, Object> data) {
        this.data = data;
    }
}
以上这篇 Java 代理实现爬取代理 IP 的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/sdfiiiiii/article/details/70432060