Java实现网络爬虫

时间:2022-01-16 20:43:36

1、pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- A POM without a <parent> must declare its own groupId. NOTE(review): this value is
       inferred from the package used by the sources (com.gbits.platform.webspider);
       confirm it before publishing the artifact. -->
  <groupId>com.gbits.platform</groupId>
  <artifactId>webspider</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>webspider</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <repositories>
    <repository>
      <id>maven center</id>
      <!-- Maven Central is served over HTTPS only; the old central.maven.org host is retired. -->
      <url>https://repo.maven.apache.org/maven2/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.geronimo.specs</groupId>
      <artifactId>geronimo-servlet_2.5_spec</artifactId>
      <version>1.2</version>
    </dependency>

    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-core-asl</artifactId>
      <version>1.9.10</version>
    </dependency>

    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.9.10</version>
    </dependency>

    <!-- HTTP transport used by WebSpider -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.2.4</version>
    </dependency>

    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>3.6.2</version>
    </dependency>

    <!-- Node filtering used by HtmlParser -->
    <dependency>
      <groupId>org.htmlparser</groupId>
      <artifactId>htmlparser</artifactId>
      <version>2.1</version>
    </dependency>

    <!-- CSS-selector extraction used by HtmlParser.main -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.directory.studio</groupId>
      <artifactId>org.apache.commons.io</artifactId>
      <version>2.4</version>
    </dependency>
  </dependencies>
</project>

2、WebSpider.java

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class WebSpider {
private String url;
private String content;
private byte[] contentByte;
private String charSet = "";
private static WebSpider webSpider;
private BasicNameValuePair username;
private BasicNameValuePair password;
private Map<String, String> header = new HashMap<String, String>();

public BasicNameValuePair getUsername() {
return username;
}

public void setUsername(BasicNameValuePair username) {
this.username = username;
}

public BasicNameValuePair getPassword() {
return password;
}

public void setPassword(BasicNameValuePair password) {
this.password = password;
}

public static WebSpider newInstance() {
if(webSpider == null) {
webSpider = new WebSpider();
}
return webSpider;
}

private WebSpider() {
}

public String getUrl() {
return url;
}

public void setUrl(String url) {
this.url = url;
}

public String getContent() {
this.spider();
return this.content;
}

public String getContent(String defaultCharSet){
DefaultHttpClient httpclient = new DefaultHttpClient();
HttpGet httpget = new HttpGet(this.url);

httpget.setHeader("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.2)");
httpget.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httpget.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");

try {
HttpResponse response = httpclient.execute(httpget);

int statusCode = response.getStatusLine().getStatusCode();
if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
|| (statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
|| (statusCode == HttpStatus.SC_SEE_OTHER)
|| (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {

String newUri = response.getLastHeader("Location").getValue();
httpclient = new DefaultHttpClient();
httpget = new HttpGet(newUri);
response = httpclient.execute(httpget);
}

HttpEntity entity = response.getEntity();

if (entity != null) {

this.contentByte = EntityUtils.toByteArray(entity);
if (contentByte.length > 0) {

this.charSet = EntityUtils.getContentCharSet(entity);

if (this.charSet == "" || this.charSet == null) {
this.charSet = defaultCharSet;
}
this.content = new String(contentByte, this.charSet);
}
}
} catch (ClientProtocolException e) {
//e.printStackTrace();
} catch (IOException e) {
//e.printStackTrace();
} finally {
httpclient.getConnectionManager().shutdown();
}
return this.content;
}

private void simulateLogin(HttpGet httpget){
if(this.username == null)
{
return;
}
HttpClient httpclient = new DefaultHttpClient();
// 设置登录参数
List<NameValuePair> formparams = new ArrayList<NameValuePair>();
formparams.add(this.username);
formparams.add(this.password);
UrlEncodedFormEntity entity;
try {
entity = new UrlEncodedFormEntity(formparams, "UTF-8");
// 新建Http post请求
HttpPost httppost = new HttpPost(httpget.getURI());
httppost.setEntity(entity);

// 处理请求,得到响应
HttpResponse response = httpclient.execute(httppost);
String set_cookie = response.getFirstHeader("Set-Cookie").getValue();
// 根据获得的Cookie值,设置头信息
httpget.setHeader("Cookie",set_cookie.substring(0, set_cookie.indexOf(";")));
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClientProtocolException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

@SuppressWarnings("deprecation")
private void spider() {
DefaultHttpClient httpclient = new DefaultHttpClient();
HttpGet httpget = new HttpGet(this.url);
httpget.setHeader("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.2)");
httpget.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httpget.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");

// 如果设置了username,则执行模拟登录
simulateLogin(httpget);

Iterator it = header.entrySet().iterator();
Entry<String,String> entry;
while (it.hasNext()) {
entry = (Entry) it.next();
httpget.addHeader(entry.getKey(), entry.getValue());
}

try {
HttpResponse response = httpclient.execute(httpget);

int statusCode = response.getStatusLine().getStatusCode();
if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
|| (statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
|| (statusCode == HttpStatus.SC_SEE_OTHER)
|| (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {

String newUri = response.getLastHeader("Location").getValue();
httpclient = new DefaultHttpClient();
httpget = new HttpGet(newUri);
response = httpclient.execute(httpget);
}

HttpEntity entity = response.getEntity();

if (entity != null) {

this.contentByte = EntityUtils.toByteArray(entity);
if (contentByte.length > 0) {

�ڴ˴���ȡ
this.charSet = EntityUtils.getContentCharSet(entity);

if (this.charSet == "" || this.charSet == null) {
this.charSet = "UTF-8";
}
this.content = new String(contentByte, this.charSet);
}
}
} catch (ClientProtocolException e) {
//e.printStackTrace();
} catch (IOException e) {
//e.printStackTrace();
} finally {
httpclient.getConnectionManager().shutdown();
}
}
public Map<String, String> getHeader() {
return header;
}

public void setHeader(Map<String, String> header) {
this.header = header;
}

}

3、HtmlParser.java
import java.util.ArrayList;
import java.util.List;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.gbits.platform.webspider.util.StringUtils;

/**
 * Extracts the HTML fragments of a page that match an HTMLParser {@link NodeFilter}.
 *
 * <p>Usage: set the markup with {@link #setContent(String)} and the filter with
 * {@link #setFilter(NodeFilter)}, then call {@link #getResult()}. Every call to
 * {@code getResult} parses the content again.
 *
 * <p>The instance returned by {@link #newInstance()} is shared and mutable and performs
 * no synchronization, so it must not be used from several threads at the same time.
 */
public class HtmlParser {
    private static HtmlParser htmlParser;

    private String content;
    private NodeFilter filter;
    private List<String> result;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared {@code HtmlParser}
     */
    public static HtmlParser newInstance() {
        if (htmlParser == null) {
            htmlParser = new HtmlParser();
        }
        return htmlParser;
    }

    /** Creates an independent parser with no content and no filter. */
    public HtmlParser() {
    }

    /** @return the markup that will be parsed */
    public String getContent() {
        return content;
    }

    /** @param content HTML markup to parse */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the filter selecting which nodes are returned */
    public NodeFilter getFilter() {
        return filter;
    }

    /** @param filter filter selecting which nodes are returned */
    public void setFilter(NodeFilter filter) {
        this.filter = filter;
    }

    /**
     * Parses the current content and returns the HTML of every node accepted by the
     * current filter.
     *
     * @return the matching fragments in document order; empty when the content is blank,
     *         no filter is set, nothing matches, or parsing fails
     */
    public List<String> getResult() {
        this.parser();
        return this.result;
    }

    /** Runs the filter over the content and stores the matching fragments in {@code result}. */
    private void parser() {
        // Start from a fresh list so results of an earlier parse are never returned again.
        List<String> matches = new ArrayList<String>();
        this.result = matches;

        if (this.filter == null || !StringUtils.hasText(this.content)) {
            return;
        }
        try {
            // new Parser(String) takes a resource locator (URL or file); createParser
            // always treats the string as markup. A null charset selects the default.
            Parser parser = Parser.createParser(this.content, null);
            NodeList nodes = parser.extractAllNodesThatMatch(this.filter);
            if (nodes != null) {
                for (int i = 0; i < nodes.size(); i++) {
                    matches.add(nodes.elementAt(i).toHtml());
                }
            }
        } catch (ParserException e) {
            System.err.println("HtmlParser: parsing failed: " + e);
        }
    }

    /**
     * Demo: fetches a listing page, selects the elements whose class attribute is
     * "item" or "item disable", and prints the text of the first
     * {@code ul.colthree li} element found inside each of them.
     */
    public static void main(String args[]) {
        WebSpider w = WebSpider.newInstance();
        w.setUrl("http://tool.5173.com/AccountRent/index.aspx?gm=f3823b6683834acdbfbd82f67b394b88&ga=&gs=&key=&p=1");
        String content = w.getContent();

        NodeFilter itemFilter = new HasAttributeFilter("class", "item");
        NodeFilter itemDisableFilter = new HasAttributeFilter("class", "item disable");
        NodeFilter filter = new OrFilter(itemFilter, itemDisableFilter);

        HtmlParser parser = HtmlParser.newInstance();
        parser.setContent(content);
        parser.setFilter(filter);

        List<String> result = parser.getResult();
        if (result != null && result.size() > 0) {
            for (String tmp : result) {
                Document doc = Jsoup.parse(tmp);
                Element elt = doc.select("ul.colthree li").first();
                // first() returns null when the fragment has no matching element.
                if (elt != null) {
                    System.out.println(elt.text());
                }
            }
        } else {
            System.out.println("parser error");
        }
    }
}