HttpClient连接网页,Jsoup解析网页

时间:2022-11-03 13:08:44

  这两天在爬取某个医疗网站的信息。一开始没有用 HttpClient,全程用的都是 Jsoup:用 Jsoup 连接并解析网页。后来上网搜了一下,发现 HttpClient 和 Jsoup 结合起来也可以获取内容(HttpClient 负责连接并下载网页,Jsoup 负责解析),所以把我 GitHub 上的一份代码又翻修了一下。
  

package GetMedicineName_001;

import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import java.io.*;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.lang.String;
/**
* Created by panlu on 15-8-10.
*/


/**
 * Entry point of the crawler.
 *
 * <p>{@link #main(String[])} builds one pooled HTTP client, creates one
 * {@code GetThread} per index page, starts all of them, waits until every
 * worker has finished and finally closes the client.
 */
public class medicineName {
    // NOTE(review): these fields are assigned but never read anywhere in this class.
    private String HTML = null;
    private String URL = null;
    private String masterURL = null;
    private List<String> medicineNames;
    private List<String> medicineLinks;

    /** Initialises the base URL and the (currently unused) result lists. */
    medicineName() {
        URL = "http://jib.xywy.com/html/";
        medicineNames = new LinkedList<String>();
        medicineLinks = new LinkedList<String>();
    }

    /**
     * Downloads every index page concurrently, one worker thread per page.
     *
     * @param args command-line arguments; ignored
     */
    public static void main(String[] args) {
        // Create a connection pool that is shared by all worker threads.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        CloseableHttpClient httpclient1 = HttpClients.custom().setConnectionManager(cm).build();

        // NOTE(review): the list jumps from n.html to p.html and stops there;
        // confirm whether the remaining letters are omitted on purpose.
        String[] urlToGet = {"http://jib.xywy.com/html/a.html",
                "http://jib.xywy.com/html/b.html",
                "http://jib.xywy.com/html/c.html",
                "http://jib.xywy.com/html/d.html",
                "http://jib.xywy.com/html/e.html",
                "http://jib.xywy.com/html/f.html",
                "http://jib.xywy.com/html/g.html",
                "http://jib.xywy.com/html/h.html",
                "http://jib.xywy.com/html/i.html",
                "http://jib.xywy.com/html/j.html",
                "http://jib.xywy.com/html/k.html",
                "http://jib.xywy.com/html/l.html",
                "http://jib.xywy.com/html/m.html",
                "http://jib.xywy.com/html/n.html",
                "http://jib.xywy.com/html/p.html"};

        boolean interrupted = false;
        try {
            GetThread[] threads = new GetThread[urlToGet.length];
            for (int i = 0; i < threads.length; i++) {
                threads[i] = new GetThread(httpclient1, new HttpGet(urlToGet[i]));
            }

            for (int j = 0; j < threads.length; j++) {
                threads[j].start();
            }

            for (int j = 0; j < threads.length; j++) {
                // Keep waiting for the same worker even if we are interrupted:
                // the client must not be closed while a worker is still using it.
                // The interrupt is remembered and restored below.
                while (true) {
                    try {
                        threads[j].join();
                        break;
                    } catch (InterruptedException e) {
                        interrupted = true;
                    }
                }
            }
        } finally {
            // Release the client (and the pooled connections it owns) once all
            // workers are done; previously it was never closed.
            try {
                httpclient1.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            if (interrupted) {
                // Restore the interrupt status that join() cleared.
                Thread.currentThread().interrupt();
            }
        }
    }

}

线程类(GetThread):

package GetMedicineName_001;

import org.apache.commons.httpclient.HttpStatus;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.xml.ws.spi.http.HttpContext;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
* Created by panlu on 15-8-11.
*/

/**
 * Worker thread that downloads one index page with the shared HTTP client,
 * extracts the detail-page links from it with Jsoup, then fetches every
 * detail page and prints the first comma-separated part of its title.
 *
 * <p>Failures are reported on standard error and never propagate out of
 * {@link #run()}: a failed index download ends the run early, a failed
 * detail download only skips that one page.
 */
public class GetThread extends Thread {
    /** Prefix prepended to every link found on the index page. */
    private static final String SITE_ROOT = "http://jib.xywy.com";
    /** Charset used to decode the index page body. */
    private static final String PAGE_CHARSET = "gb2312";
    /** CSS class of the elements that contain the detail-page links. */
    private static final String LINK_CONTAINER_CLASS = "ks-ill-txt";
    /** Separator after which the rest of a page title is discarded. */
    private static final String TITLE_SEPARATOR = ",";

    private final CloseableHttpClient client;
    private final HttpClientContext context;
    private final HttpGet httpget;
    /** Body of the index page; stays {@code null} when the download failed. */
    public String html;
    private List<String> medicineLinks;
    private List<String> medicineNames;
    /** URL of the detail page that was requested most recently. */
    public String url;

    /**
     * Creates a worker for a single index page.
     *
     * @param chc HTTP client used to execute the request
     * @param hg  GET request for the index page
     */
    public GetThread(CloseableHttpClient chc, HttpGet hg) {
        this.client = chc;
        this.context = HttpClientContext.create();
        this.httpget = hg;
        medicineLinks = new LinkedList<String>();
        medicineNames = new LinkedList<String>();
    }

    /**
     * Downloads the index page, collects its links and prints one name per
     * detail page.
     */
    @Override
    public void run() {
        html = fetchIndexHtml();
        if (html == null) {
            // Without a body there is nothing to parse; handing null to Jsoup
            // would only end this thread with an exception.
            System.err.println("Skipping " + httpget.getURI() + ": index page could not be downloaded");
            return;
        }

        collectLinks(html);

        for (String link : medicineLinks) {
            url = SITE_ROOT + link;
            try {
                String name = extractName(Jsoup.connect(url).get().title());
                medicineNames.add(name);
                System.out.println(name);
            } catch (IOException e) {
                // One unreachable detail page must not stop the remaining ones.
                e.printStackTrace();
            }
        }
    }

    /**
     * Executes the GET request and returns the decoded body.
     *
     * @return the page body, or {@code null} if the request failed, the
     *         status was not 200 or the response carried no entity
     */
    private String fetchIndexHtml() {
        try {
            CloseableHttpResponse response = client.execute(httpget, context);
            try {
                int status = response.getStatusLine().getStatusCode();
                if (status != HttpStatus.SC_OK) {
                    System.err.println("Unexpected HTTP status " + status + " for " + httpget.getURI());
                    return null;
                }
                HttpEntity entity = response.getEntity();
                return entity == null ? null : EntityUtils.toString(entity, PAGE_CHARSET);
            } finally {
                // Close on every path, including non-200 responses, so the
                // connection is always handed back to the pool.
                response.close();
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so this covers both.
            e.printStackTrace();
            return null;
        }
    }

    /** Appends the href of every anchor inside the link containers to {@code medicineLinks}. */
    private void collectLinks(String pageHtml) {
        Document doc = Jsoup.parse(pageHtml);
        Elements anchors = doc.getElementsByClass(LINK_CONTAINER_CLASS).select("a");
        for (Element anchor : anchors) {
            medicineLinks.add(anchor.attr("href"));
        }
    }

    /**
     * Returns the part of {@code title} before the first separator, or the
     * whole title when it contains none. Unlike {@code split(",")[0]} this
     * cannot fail on a title made up only of separators.
     */
    private static String extractName(String title) {
        int cut = title.indexOf(TITLE_SEPARATOR);
        return cut < 0 ? title : title.substring(0, cut);
    }
}