[File attributes]
File name: 爬虫例子源代码 (crawler example source code)
File size: 16.22 MB
File format: ZIP
Last updated: 2016-06-13 09:49:43
Spider
package test;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.StringNode; // assumed import; the package location of StringNode depends on the htmlparser version
import org.htmlparser.Tag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class Spider implements Runnable {
boolean search_key_words = false;
int count = 0;
int limitsite = 10;
int countsite = 1;
String keyword = "中国";
Parser parser = new Parser();
// List linkList=new ArrayList();
String startsite = "";
SearchResultBean srb;
List<SearchResultBean> resultList = new ArrayList<SearchResultBean>();
List<String> searchsite = new ArrayList<String>();
Queue<String> linklist = new LinkedList<String>();
HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>(); // assumed mapping: host -> robots.txt Disallow paths
public Spider(String keyword, String startsite) {
this.keyword = keyword;
this.startsite = startsite;
linklist.add(startsite);
srb = new SearchResultBean();
}
public void run() {
// Assumed entry point: drain the seed queue (this body is empty in the listing).
search(linklist);
}
public void search(Queue<String> queue) {
while (!queue.isEmpty()) {
String url = queue.peek();
try {
// Visit only pages that have not been crawled yet and that robots.txt allows.
if (!isSearched(searchsite, url)) {
if (isRobotAllowed(new URL(url))) {
processHtml(url);
}
} else {
System.out.println("already visited: " + url);
}
} catch (Exception e) {
e.printStackTrace();
}
queue.remove();
}
}
private void processHtml(String url) throws Exception {
searchsite.add(url);
count = 0; // per-page keyword counter
System.out.println("searching..." + url);
try {
parser.setURL(url);
parser.setEncoding("gbk");
URLConnection uc = parser.getConnection();
uc.connect();
uc.getLastModified();
NodeIterator nit = parser.elements();
while (nit.hasMoreNodes()) {
Node node = nit.nextNode();
parserNode(node);
}
// Use a fresh bean per page so earlier entries in resultList are not overwritten.
srb = new SearchResultBean();
srb.setKeyWords(keyword);
srb.setUrl(url);
srb.setCount_key_words(count);
resultList.add(srb);
System.out.println("keywords:" + count);
} catch (ParserException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
public void dealTag(Tag tag) throws Exception {
NodeList list = tag.getChildren();
if (list != null) {
NodeIterator it = list.elements();
while (it.hasMoreNodes()) {
Node node = it.nextNode();
parserNode(node);
}
}
}
private void parserNode(Node node) throws Exception {
if (node instanceof StringNode) {
// Count keyword occurrences in this text node (body assumed; the listing leaves it empty).
String text = ((StringNode) node).getText();
for (int i = text.indexOf(keyword); i != -1; i = text.indexOf(keyword, i + keyword.length()))
count++;
} else if (node instanceof Tag) {
dealTag((Tag) node); // recurse into the tag's children
}
}
private boolean isRobotAllowed(URL url) {
// Stub: always allow. A real check would fetch /robots.txt, cache its Disallow rules in disallowListCache, and test the URL path (see the sketch after the class).
return true;
}
private boolean isSearched(List<String> searchsite2, String url) {
// A URL counts as visited once processHtml() has added it to searchsite.
return searchsite2.contains(url);
}
}
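The Spider class stores its results in a SearchResultBean that is not shown on this page. A minimal sketch that satisfies the three setters called above might look like this; the field names and getters are assumptions:
package test;

// Minimal result bean sketch: only the setters used by Spider are known from the listing.
public class SearchResultBean {
    private String keyWords;
    private String url;
    private int count_key_words;

    public void setKeyWords(String keyWords) { this.keyWords = keyWords; }
    public void setUrl(String url) { this.url = url; }
    public void setCount_key_words(int count) { this.count_key_words = count; }

    public String getKeyWords() { return keyWords; }
    public String getUrl() { return url; }
    public int getCount_key_words() { return count_key_words; }
}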
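isRobotAllowed() is only a stub even though the class already declares a disallowListCache field. The sketch below shows one simplified way such a check could work; it is an assumption rather than part of the download, the class name RobotsCheckSketch is illustrative, and it deliberately ignores User-agent grouping and honours every Disallow line it finds.
package test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;

// Illustrative robots.txt check (assumption, not part of the original example).
public class RobotsCheckSketch {
    // Per-host cache of Disallow path prefixes parsed from robots.txt.
    private final HashMap<String, ArrayList<String>> disallowListCache =
            new HashMap<String, ArrayList<String>>();

    public boolean isRobotAllowed(URL url) {
        String host = url.getHost().toLowerCase();
        ArrayList<String> disallowList = disallowListCache.get(host);
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(robotsUrl.openStream()));
                String line;
                while ((line = reader.readLine()) != null) {
                    String trimmed = line.trim();
                    // Collect every Disallow line, regardless of User-agent section.
                    if (trimmed.toLowerCase().startsWith("disallow:")) {
                        String path = trimmed.substring("disallow:".length()).trim();
                        if (path.length() > 0) {
                            disallowList.add(path);
                        }
                    }
                }
                reader.close();
            } catch (IOException e) {
                // If robots.txt cannot be fetched, treat the host as allowed.
            }
            disallowListCache.put(host, disallowList);
        }
        // Blocked if the requested path starts with any cached Disallow prefix.
        for (String path : disallowList) {
            if (url.getFile().startsWith(path)) {
                return false;
            }
        }
        return true;
    }
}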
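Finally, a small hypothetical driver for trying the class out; the seed URL is a placeholder:
package test;

// Hypothetical driver, not part of the download.
public class SpiderMain {
    public static void main(String[] args) {
        // Seed the crawler with a keyword and a start URL, then let run() drain the queue.
        Spider spider = new Spider("中国", "http://www.example.com/");
        new Thread(spider).start();
    }
}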