爬虫例子源代码

时间:2016-06-13 09:49:43
【文件属性】:
文件名称:爬虫例子源代码
文件大小:16.22MB
文件格式:ZIP
更新时间:2016-06-13 09:49:43
Spider package test; import java.io.IOException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Queue; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.Tag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class Spider implements Runnable { boolean search_key_words = false; int count = 0; int limitsite = 10; int countsite = 1; String keyword = "中国"; Parser parser = new Parser(); // List linkList=new ArrayList(); String startsite = ""; SearchResultBean srb; List resultList = new ArrayList(); List searchsite = new ArrayList(); Queue linklist = new LinkedList(); HashMap> disallowListCade = new HashMap>(); public Spider(String keyword, String startsite) { this.keyword = keyword; this.startsite = startsite; linklist.add(startsite); srb = new SearchResultBean(); } public void run() { } public void search(Queue queue) { String url = ""; while (!queue.isEmpty()) { url = queue.peek().toString(); try { if (!isSearched(searchsite, url)) { if (isRobotAllowed(new URL(url))) { processHtml(url); } } else { System.out.println("thissssssss"); } } catch (Exception e) { e.printStackTrace(); } queue.remove(); } } private void processHtml(String url) throws Exception { searchsite.add(url); count = 0; System.out.println("searching..." + url); try { parser.setURL(url); parser.setEncoding("gbk"); URLConnection uc = parser.getConnection(); uc.connect(); uc.getLastModified(); NodeIterator nit = parser.elements(); while (nit.hasMoreNodes()) { Node node = nit.nextNode(); parserNode(node); } srb.setKeyWords(keyword); srb.setUrl(url); srb.setCount_key_words(count); resultList.add(srb); System.out.println("keywords:" + count); } catch (ParserException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void dealTag(Tag tag) throws Exception { NodeList list = tag.getChildren(); if (list != null) { NodeIterator it = list.elements(); while (it.hasMoreNodes()) { Node node = it.nextNode(); parserNode(node); } } } private void parserNode(Node node) throws Exception { if (node instanceof StringNode) { StringNode n = (StringNode) node; } } private boolean isRobotAllowed(URL url) { return false; } private boolean isSearched(List searchsite2, String url) { return false; } }

网友评论

  • 参考了一下 还不错
  • 没有做过,看看别人怎么做的,3Q
  • 没有文档,不知道从哪个函数开始运行
  • 不错,非常有用的代码