JAVA爬虫入门学习

时间:2022-07-09 17:01:22

之前分享一个简单的获取数据流的方法,

今天分享下学习到的SpringBoot+Maven的框架

 

首先pom.xml

 <!-- Inherit dependency management and plugin defaults from the Spring Boot parent POM -->
 <parent>
     <groupId>org.springframework.boot</groupId>
     <artifactId>spring-boot-starter-parent</artifactId>
     <version>1.5.9.RELEASE</version>
 </parent>

 <dependencies>
     <!-- Spring Boot with the Jetty container: begin -->
     <dependency>
         <groupId>org.springframework.boot</groupId>
         <artifactId>spring-boot-starter-web</artifactId>
         <!-- Exclude the default Tomcat starter so that Jetty can be used instead -->
         <exclusions>
             <exclusion>
                 <groupId>org.springframework.boot</groupId>
                 <artifactId>spring-boot-starter-tomcat</artifactId>
             </exclusion>
         </exclusions>
     </dependency>
     <!-- Jetty container -->
     <dependency>
         <groupId>org.springframework.boot</groupId>
         <artifactId>spring-boot-starter-jetty</artifactId>
     </dependency>
     <dependency>
         <groupId>org.springframework.boot</groupId>
         <artifactId>spring-boot-starter-test</artifactId>
         <scope>test</scope>
     </dependency>
     <!-- Spring Boot with the Jetty container: end -->

     <!-- HTML parser used to extract values from the fetched pages -->
     <dependency>
         <groupId>org.jsoup</groupId>
         <artifactId>jsoup</artifactId>
         <version>1.10.3</version>
     </dependency>

 </dependencies>
其中Jetty是一个开源的servlet容器,它为基于Java的web内容(例如JSP和servlet)提供运行环境。
Jetty是使用JAVA编写的,它的API以一组JAR包的形式发布。
开发人员可以将Jetty容器实例化成一个对象,从而迅速为一些独立运行(stand-alone)的Java应用提供网络和web连接。
Jetty还提供了HttpClient,其中包含一些用于发送HTTP请求的方法。
下面是控制器内写的接口
 1 @Controller  /*返回视图页名称*/
2 //@RestController 返回json数据类型
3 public class MainController {
4
5 @Autowired//根据类型匹配需要配合Qualifier
6 //@Qualifier//名称
7 //@Resource//根据名称自动匹配,区别于@Autowired
8 public GetCaptcha Captcha;
9
10 @GetMapping("Captcha")//等同于下面
11 //@RequestMapping(value = "name",method = RequestMethod.GET)
12 public @ResponseBody String index(HttpServletRequest request){//错误需要另外处理
13
14 //初始化
15 HttpClient httpClient = Captcha.GetHttpClient();
16 CaptchaData captchaData = new CaptchaData();
17 captchaData.setHttpClient(httpClient);
18
19 if(httpClient==null){
20 return "创建对象错误,检查代理是否到期";
21 }
22
23 CaptchaData imgValue = Captcha.GetCaptcha(captchaData);
24
25 request.getSession().setAttribute("captchaData",imgValue);
26
27 String htmlText = imgValue.getImg()+"<form action=\"/json\" method=\"POST\">\n" +
28 "请输入用户名:<br>\n" +
29 "<input type=\"text\" name=\"name\" value=\"张三\">\n" +
30 "<br>\n" +
31 "身份证号:<br>\n" +
32 "<input type=\"text\" name=\"cardId\" value=\"123456789098765432\">\n" +
33 "<br>\n" +
34 "验证码:<br>\n" +
35 "<input type=\"text\" name=\"captcha\" value=\"\">\n" +
36 "<br><br>\n" +
37 "<input type=\"submit\" value=\"提交\">\n" +
38 "</form> ";
39 return htmlText;//返回视图名称
40 }
41
42 @PostMapping("json")
43 public @ResponseBody String resp(Userdb user,HttpServletRequest request){
44
45 CaptchaData captchaData = (CaptchaData) request.getSession().getAttribute("captchaData");
46 HttpClient httpClient = captchaData.getHttpClient();
47
48 String returnValue = "";
49 try {
50 returnValue = Captcha.PostData(user,captchaData);
51 }catch (Exception e){
52 returnValue = "数据传输出现错误,请检查数据信息";
53 }finally {
54 try{
55 httpClient.stop();
56 }
57 catch (Exception e){
58 returnValue += " \n HttpClient关闭异常";
59 }
60 }
61 return returnValue;
62 }
63 }

在控制器中调用的方法分别包括:

获取验证码、发送数据到目标地址、初始化httpClient对象

代码如下:

 1 public CaptchaData GetCaptcha(CaptchaData captchaData) {
2 HttpClient httpClient = GetHttpClient();
3 try {
4 ContentResponse response1 = httpClient.newRequest("https://www.baidu.com").send();
5 //System.out.println(response1.getContentAsString());
6 //首页
7 String TOKEN;
8 response1 = httpClient.newRequest("XXXXXXXX/index1.do")
9 .method(HttpMethod.GET)
10 .header(HttpHeader.REFERER, "https://XXXXXXXX/")
11 .send();
12
13 //点击进入页面
14 response1 = httpClient.newRequest("XXXXXXXX/xxxxx.xxx?xxxx=initLogin")
15 .method(HttpMethod.GET)
16 .header(HttpHeader.REFERER, "https://XXXXXXXX/index1.do")
17 .send();
18
19 //点击注册页面
20 response1 = httpClient.newRequest("https://XXXXXXXX/xxxxx?method=initReg")
21 .method(HttpMethod.POST)
22 .header("Referer", "https://XXXXXXXX/loginreg.jsp")
23 .send();
24
25 //获取TOKEN
26 Document documentUrlOne = Jsoup.parse(response1.getContentAsString());
27 Elements firstNode = documentUrlOne.select("div.white-con");
28 Elements formNode = firstNode.select("form[action]");
29 Element inputNode = formNode.select("input").first();
30 TOKEN = inputNode.attr("value");
31 //存入
32 captchaData.setTOKEN(TOKEN);
33 //TOKEN需要在注册提交页面生成的才有效
34 //System.out.println(TOKEN+"---获取org.apache.struts.taglib.html.TOKEN----");
35
36 //获取验证码。
37 String timeString = String.valueOf(new Date().getTime());
38 String ImgStr = "https://XXXXXXXX/imgrc.do?a=" + timeString;
39
40 response1 = httpClient.newRequest(ImgStr)
41 .method(HttpMethod.GET)
42 .header("Referer", "https://XXXXXXXXXXXXXXXX.do")
43 .send();
44 //图片类型转换
45 String encodeBase64 = Base64.getEncoder().encodeToString(response1.getContent());
46 //<img src=''/>
47 //String filepath = "XXXXXXXX\a.png";
48 //convertBase64DataToImage(encodeBase64, filepath);
49 //System.out.printf("图片下载成功")
50 captchaData.setImg("<img src='data:image/png;base64," + encodeBase64 + "'/>");
51 captchaData.setResponse1(response1);
52 return captchaData;
53 } catch (Exception e) {
54 e.printStackTrace();
55 } finally {
56 return captchaData;
57 }
58 }

 

/**
 * Submits the user's identity data to the target site and reports the outcome.
 *
 * <p>The request is sent through the HttpClient carried by {@code captchaData}. The result page is
 * parsed for the text of {@code span[id=_error_field_]} inside {@code div.erro_div1}; empty text
 * is treated as success.
 *
 * @param User        name, card id and captcha text entered by the user
 * @param captchaData holder of the form TOKEN and the HttpClient to use
 * @return "身份信息正确---END" when no error text is found, the error text followed by
 *         "cw---END" otherwise, or "异常哎" when the request or parsing fails
 */
public String PostData(Userdb User, CaptchaData captchaData) {

    String name = User.getName();
    String cardId = User.getCardId();
    String captcha = User.getCaptcha();
    String token = captchaData.getTOKEN();
    HttpClient httpClient = captchaData.getHttpClient();

    try {
        // Values are URL-encoded so that '&', '=', spaces or non-ASCII characters typed by the
        // user cannot corrupt the query string.
        String postdata
                = "org.apache.struts.taglib.html.TOKEN=" + encodeQueryValue(token)
                + "&method=checkIdentity"
                + "&userInfoVO.name=" + encodeQueryValue(name)
                + "&userInfoVO.certType=0"
                + "&userInfoVO.certNo=" + encodeQueryValue(cardId)
                + "&_%40IMGRC%40_=" + encodeQueryValue(captcha)
                + "&1=on";

        ContentResponse response1
                = httpClient.newRequest("https://xxxxxxxxxdo?" + postdata)
                .method(HttpMethod.POST)
                .header("Referer", "https:/xxxxxxxx.do")
                .send();

        // Look for an error message in the result page.
        Document documentPage = Jsoup.parse(response1.getContentAsString());
        String errorValue = documentPage.select("div.erro_div1")
                .select("span[id=_error_field_]")
                .text();
        if (errorValue.isEmpty()) {
            return "身份信息正确---END";
        }
        return errorValue + "cw---END";
    } catch (Exception e) {
        // Keep the cause visible; the caller only receives the generic message below.
        e.printStackTrace();
        return "异常哎";
    }

}

/**
 * URL-encodes one query-string value; a null value is sent as the text "null", as before.
 * NOTE(review): assumes the target site decodes parameters as UTF-8 — confirm.
 */
private String encodeQueryValue(String value) throws java.io.UnsupportedEncodingException {
    return java.net.URLEncoder.encode(String.valueOf(value), "UTF-8");
}

 



/**
 * Creates and starts a Jetty HttpClient that follows redirects, uses a 10 second connect
 * timeout, sends a fixed User-Agent and routes its traffic through an HTTP proxy.
 *
 * @return the started client, or {@code null} when it could not be started
 */
public HttpClient GetHttpClient() {
    // trustAll disables certificate validation, so every HTTPS server is accepted.
    SslContextFactory sslContextFactory = new SslContextFactory();
    sslContextFactory.setTrustAll(true);

    HttpClient httpClient = new HttpClient(sslContextFactory);
    httpClient.setConnectTimeout(10000);
    httpClient.setFollowRedirects(true);
    httpClient.setUserAgentField(
            new HttpField("User-Agent", "xxxxxxxxxxxxxxxxxxxxxxxxxx"));

    ProxyConfiguration proxyConfig = httpClient.getProxyConfiguration();
    HttpProxy proxy = new HttpProxy("xxxxxx", xxx);
    proxyConfig.getProxies().add(proxy);

    try {
        httpClient.start();
        return httpClient;
    } catch (Exception e) {
        // Report why the client failed to start; callers only see the null result.
        e.printStackTrace();
        try {
            // Release whatever a partially completed start() may have allocated.
            httpClient.stop();
        } catch (Exception ignored) {
            // Nothing more can be done for a client that never started.
        }
        return null;
    }

}
 
1 //下载图片
2 public void convertBase64DataToImage(String base64ImgData, String filePath) throws IOException {
3 BASE64Decoder d = new BASE64Decoder();
4 byte[] bs = d.decodeBuffer(base64ImgData);
5 FileOutputStream os = new FileOutputStream(filePath);
6 os.write(bs);
7 os.close();
8 }

 

最后是application.yml配置
 # Embedded server
 server:
   port: 8080

 spring:
   # Request/response character encoding
   http:
     encoding:
       charset: utf-8
       enabled: true
       force: true
   # Thymeleaf template engine
   thymeleaf:
     mode: HTML5
     encoding: UTF-8
     content-type: text/html
     cache: false
     prefix: classpath:/templates/
     suffix: .html
   # Static resources
   resources:
     static-locations: classpath:/static/
 

 

这样,一个以服务接口形式提供的完整爬虫就实现了。