Java强大的网络数据抓取能力 -- 解析DOM获取网络数据
通过请求 获取网页内容
Java通过HTTP请求获取页面内容的 两种方式
-
基于Apache 的 HttpClient包实现
通过HttpResponse实例获得请求返回的数据体,具体数据封装在HttpEntity对象中。 -
基于Java的net工具包实现
通过HttpURLConnection 对象设置网络连接参数,建立网络连接,并获得请求返回的网络数据输入流,再从中读取数据。对于JSON格式的数据体,可进一步封装为JSONObject。
1. 基于Apache Httpclient实现Java网络访问工具
前面已经介绍了,这种方式的主要思路是通过HttpResponse实例获得请求返回的数据体,然后具体数据封装在HttpEntity对象中返回解析。
第一步,先引入Apache的Httpclient工具包,这里使用的是Httpclient-4.5.5
的版本
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
第二步,创建 ApacheHttpUtil
工具类
package com.yxh.demo;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* @author yangxiaohui
* @Date: Create by 2018-10-31 12:23
* @Description: 基于Apache的HttpClient实现的网络访问工具类
*/
public class ApacheHttpUtil {

    /** Shared client; kept public/static for backward compatibility with existing callers. */
    public static CloseableHttpClient httpClient = HttpClientBuilder.create().build();

    /**
     * Performs an HTTP GET request and returns the response body decoded as UTF-8.
     * Returning a plain string keeps later DOM/JSON parsing convenient.
     *
     * @param url request URL
     * @return response body; empty string when the request fails
     */
    public static String getHttpContent(String url) {
        StringBuilder result = new StringBuilder();
        HttpGet httpGet = new HttpGet(url);
        try {
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Decode at the Reader level so multi-byte UTF-8 sequences are handled
                // correctly, and append only the chars actually read — the old loop
                // appended the whole buffer, padding the result with NUL chars on
                // short reads.
                try (InputStreamReader reader =
                         new InputStreamReader(entity.getContent(), Charset.forName("UTF-8"))) {
                    char[] buffer = new char[1024];
                    int len;
                    while ((len = reader.read(buffer)) > 0) {
                        result.append(buffer, 0, len);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release exactly once; the old code also released a second time after
            // the finally block, which was redundant.
            httpGet.releaseConnection();
        }
        return result.toString();
    }

    /**
     * Performs an HTTP POST with the given form fields and returns the response body
     * decoded as UTF-8.
     *
     * @param url  request URL
     * @param data form fields to send (may be null for an empty body)
     * @return response body; empty string when the request fails
     */
    public static String postHttpContent(String url, Map<String, String> data) {
        StringBuilder sb = new StringBuilder();
        HttpPost httpPost = new HttpPost(url);
        List<NameValuePair> valuePairs = new ArrayList<NameValuePair>();
        if (null != data) {
            for (Map.Entry<String, String> entry : data.entrySet()) {
                valuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        try {
            // UrlEncodedFormEntity with an explicit charset both encodes the fields as
            // UTF-8 (avoiding mojibake) and sets the
            // application/x-www-form-urlencoded Content-Type header, so no manual
            // header or throwaway StringEntity is needed.
            httpPost.setEntity(new UrlEncodedFormEntity(valuePairs, "UTF-8"));
            HttpResponse response = httpClient.execute(httpPost);
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity != null) {
                // Read via a Reader instead of decoding fixed 128-byte chunks: chunked
                // byte decoding corrupts multi-byte UTF-8 characters that straddle a
                // chunk boundary.
                try (InputStreamReader reader =
                         new InputStreamReader(httpEntity.getContent(), Charset.forName("UTF-8"))) {
                    char[] buffer = new char[1024];
                    int len;
                    while ((len = reader.read(buffer)) > 0) {
                        sb.append(buffer, 0, len);
                    }
                }
            }
        } catch (IOException e) {
            // UnsupportedEncodingException is an IOException, so one catch suffices.
            e.printStackTrace();
        } finally {
            httpPost.releaseConnection();
        }
        return sb.toString();
    }
}
测试代码
// Demo entry point: exercises both helpers against the same endpoint and prints
// the raw response bodies for manual inspection (network access required).
public static void main(String[] args){
// GET with no parameters.
String getStr=ApacheHttpUtil.getHttpContent("https://stock.tuchong.com/creative");
// POST with an empty form body.
String postStr=ApacheHttpUtil.postHttpContent("https://stock.tuchong.com/creative",new HashMap<String, String>());
System.out.println(getStr);
System.out.println(postStr);
}
测试结果
2. 基于net 工具包 实现Java网络访问工具
基于java内置的net工具包开发,不需要引入第三方包
创建NetHttpUtil
工具类
package com.yxh.demo;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
/**
* @author yangxiaohui
* @Date: Create by 2018-10-31 16:49
* @Description: 基于java net工具包实现的网络访问工具类
*/
public class NetHttpUtil {

    /** Connect/read timeout in milliseconds, applied per connection. */
    private static final int TIMEOUT_MS = 30000;

    /**
     * Performs an HTTP GET request and returns the response body decoded as UTF-8.
     *
     * @param url request URL
     * @return response body, or null when the request fails (matches the original contract)
     */
    public static String getHttpContent(String url) {
        HttpURLConnection http = null;
        InputStream is = null;
        try {
            URL urlGet = new URL(url);
            http = (HttpURLConnection) urlGet.openConnection();
            http.setRequestMethod("GET");
            // Note: no setDoOutput(true) here — enabling output on a GET makes
            // HttpURLConnection silently send a POST instead.
            // Per-connection timeouts instead of the sun.net.* system properties,
            // which are JVM-global and affect every other connection.
            http.setConnectTimeout(TIMEOUT_MS);
            http.setReadTimeout(TIMEOUT_MS);
            http.connect();
            is = http.getInputStream();
            // Read until EOF. The old code sized its buffer with is.available(),
            // which only reports bytes readable without blocking and routinely
            // truncates network responses.
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            byte[] buffer = new byte[4096];
            int len;
            while ((len = is.read(buffer)) > 0) {
                bos.write(buffer, 0, len);
            }
            return bos.toString("UTF-8");
        } catch (Exception e) {
            // Surface the failure instead of swallowing it silently.
            e.printStackTrace();
            return null;
        } finally {
            if (null != http) http.disconnect();
            try {
                if (null != is) is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Performs an HTTP POST with the given url-encoded body and returns the response
     * body decoded as UTF-8.
     *
     * @param url  request URL
     * @param data url-encoded request body (e.g. "a=1&amp;b=2")
     * @return response body (newlines stripped, as before), or null when the request fails
     */
    public static String postHttpContent(String url, String data) {
        HttpURLConnection http = null;
        PrintWriter out = null;
        BufferedReader reader = null;
        try {
            // Open and configure the connection.
            URL urlPost = new URL(url);
            http = (HttpURLConnection) urlPost.openConnection();
            http.setDoOutput(true);
            http.setDoInput(true);
            http.setRequestMethod("POST");
            http.setUseCaches(false);
            http.setInstanceFollowRedirects(true);
            http.setRequestProperty("Content-Type",
                    "application/x-www-form-urlencoded; charset=utf-8");
            http.setConnectTimeout(TIMEOUT_MS);
            http.setReadTimeout(TIMEOUT_MS);
            http.connect();
            // Send the body as UTF-8.
            out = new PrintWriter(new OutputStreamWriter(http.getOutputStream(), "UTF-8"));
            out.print(data);
            out.flush();
            out.close();
            out = null;
            // Decode the response as UTF-8 at the Reader; the old
            // new String(line.getBytes(), "utf-8") round-trip went through the
            // platform default charset and could corrupt non-ASCII text.
            reader = new BufferedReader(new InputStreamReader(http.getInputStream(), "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            reader.close();
            reader = null;
            return sb.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (null != http) http.disconnect();
            if (null != out) out.close();
            try {
                if (null != reader) reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
JAVA 基于 Jsoup 对抓取到的网页对象进行DOM解析
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
1. 使用Jsoup 对字符串进行处理,并解析DOM
第一步,引入Jsoup 包。这里使用的是jsoup-1.11.2
版本
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
第二步,创建工具类DomPareUtil
- 在解析这个网页前,我们需要确认一下,最后需要获得这个网页的哪些信息
打开F12开发者工具,使用鼠标选择元素工具选中我们需要得到的内容
由此处我们可以得出,我们只需要所有的a标签里的href,就能进而获取到所有的图片详情地址了
package com.yxh.demo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @author yangxiaohui
* @Date: Create by 2018-10-31 16:59
* @Description:
*/
public class DomPareUtil {

    /**
     * Parses a search-result page, collects the detail-page links from each
     * {@code .new-search-works-item} block, fetches every detail page to extract the
     * real image URL ({@code .works-img}'s {@code src}), and downloads the images.
     *
     * @param str raw HTML of the search-result page
     */
    public static void getInfo(String str) {
        Document doc = Jsoup.parse(str);
        // Detail-page URLs harvested from the listing.
        List<String> imgSrcs = new ArrayList<String>();
        Elements rows = doc.select(".new-search-works-item");
        for (Element element : rows) {
            String imgHref = element.select("a").attr("href");
            // Keep only links that point at a detail (.html) page.
            if (imgHref.indexOf("html") > 0) {
                imgSrcs.add(imgHref);
            }
        }
        // Was a raw List; use the parameterized type.
        List<String> imgs = new ArrayList<String>();
        for (String imgSrc : imgSrcs) {
            String imgInfoStr = ApacheHttpUtil.getHttpContent(imgSrc);
            Document imgDoc = Jsoup.parse(imgInfoStr);
            // selectFirst returns null when nothing matches; the old code NPE'd on
            // .attr() in that case (its later img == null check could never fire).
            Element imgEl = imgDoc.selectFirst(".works-img");
            if (imgEl == null) {
                continue;
            }
            String img = imgEl.attr("src");
            imgs.add(img);
            System.out.println(img);
        }
        // Download every discovered image.
        downloadPicture(imgs);
    }

    /**
     * Downloads each URL in the list to a sequentially numbered .jpg file in the
     * working directory. Failures are logged and skipped.
     *
     * @param urlList image URLs to download
     */
    private static void downloadPicture(List<String> urlList) {
        int imageNumber = 0;
        for (String urlString : urlList) {
            // try-with-resources closes both streams even when a read/write throws;
            // the old code leaked them on any exception.
            try (DataInputStream in = new DataInputStream(new URL(urlString).openStream());
                 FileOutputStream fileOut = new FileOutputStream(new File(imageNumber + ".jpg"))) {
                byte[] buffer = new byte[1024];
                int length;
                while ((length = in.read(buffer)) > 0) {
                    fileOut.write(buffer, 0, length);
                }
                // Only advance the counter after a successful download, as before.
                imageNumber++;
            } catch (IOException e) {
                // MalformedURLException is an IOException, so one catch covers both.
                e.printStackTrace();
            }
        }
    }
}
测试代码
// Demo entry point: fetches the image-listing page via GET, then parses it for
// picture links and downloads them (network access required).
public static void main(String[] args){
// Test: scrape the listing page via GET and hand it to the DOM parser.
String getStr=ApacheHttpUtil.getHttpContent("http://soso.nipic.com/?q=%E7%BE%8E%E5%A5%B3&or=0&y=40&g=1");
DomPareUtil.getInfo(getStr);
// // Test: scrape via POST (disabled).
// String postStr=ApacheHttpUtil.postHttpContent("https://stock.tuchong.com/creative",new HashMap<String, String>());
// System.out.println(getStr);
// System.out.println(postStr);
}
抓取结果: