项目的目录结构
核心源码:
package cn.edu.zyt.spider; import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties; import cn.edu.zyt.spider.model.SpiderParams;
import cn.edu.zyt.spider.queue.UrlQueue;
import cn.edu.zyt.spider.worker.SpiderWorker; public class SpiderStarter { public static void main(String[] args){ System.setProperty("java.net.useSystemProxies", "true");
System.setProperty("http.proxyHost", "113.128.9.37");
System.setProperty("http.proxyPort", "9999");
System.setProperty("https.proxyHost", "113.128.9.37");
System.setProperty("https.proxyPort", "9999"); // 初始化配置参数
initializeParams(); // 初始化爬取队列
initializeQueue(); // 创建worker线程并启动
for(int i = 1; i <= SpiderParams.WORKER_NUM; i++){
new Thread(new SpiderWorker(i)).start();
}
} /**
* 初始化配置文件参数
*/
private static void initializeParams(){
InputStream in;
try {
in = new BufferedInputStream(new FileInputStream("conf/spider.properties"));
Properties properties = new Properties();
properties.load(in); // 从配置文件中读取参数
SpiderParams.WORKER_NUM = Integer.parseInt(properties.getProperty("spider.threadNum"));
SpiderParams.DEYLAY_TIME = Integer.parseInt(properties.getProperty("spider.fetchDelay")); in.close();
}
catch (FileNotFoundException e) {
e.printStackTrace();
}
catch (IOException e) {
e.printStackTrace();
}
} /**
* 准备初始的爬取链接
*/
private static void initializeQueue(){
// 例如,需要抓取天下粮仓信息,根据链接规则生成URLs放入带抓取队列http://www.cofeed.com/national_1.html for(int i = 0; i < 3; i += 1){
UrlQueue.addElement("http://www.cofeed.com/national_" + i+".html");
}
}
}
实现效果图:
由于页面代码较多,这里就不一一粘贴了;如需获取完整源码,可在博客下方留言。