Java实现多线程使用多个代理IP的方式爬取网页页面内容

项目的目录结构

核心源码：

package cn.edu.zyt.spider;

import java.io.BufferedInputStream;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.InputStream;

import java.util.Properties;

import cn.edu.zyt.spider.model.SpiderParams;

import cn.edu.zyt.spider.queue.UrlQueue;

import cn.edu.zyt.spider.worker.SpiderWorker;

/**
 * Entry point of the spider: configures the JVM proxy properties, loads the
 * crawl parameters from {@code conf/spider.properties}, seeds the URL queue and
 * starts one {@link SpiderWorker} thread per configured worker.
 */
public class SpiderStarter {

    public static void main(String[] args){
        // Route HTTP and HTTPS traffic through a fixed proxy via the standard
        // JVM networking properties.
        // NOTE(review): the proxy address is hard-coded here; the surrounding
        // article talks about using several proxy IPs, so per-worker proxy
        // selection presumably happens elsewhere — confirm.
        System.setProperty("java.net.useSystemProxies", "true");
        System.setProperty("http.proxyHost", "113.128.9.37");
        System.setProperty("http.proxyPort", "9999");
        System.setProperty("https.proxyHost", "113.128.9.37");
        System.setProperty("https.proxyPort", "9999");

        // Load configuration parameters.
        initializeParams();

        // Seed the crawl queue.
        initializeQueue();

        // Create and start the worker threads; worker ids are 1-based.
        for(int i = 1; i <= SpiderParams.WORKER_NUM; i++){
            new Thread(new SpiderWorker(i)).start();
        }
    }

    /**
     * Loads the crawl parameters from {@code conf/spider.properties} into
     * {@link SpiderParams}.
     *
     * <p>If the file is missing or unreadable, the stack trace is printed and
     * the values already present in {@link SpiderParams} are left untouched.
     *
     * @throws NumberFormatException if {@code spider.threadNum} or
     *         {@code spider.fetchDelay} is missing or is not a valid integer
     */
    private static void initializeParams(){
        // try-with-resources closes the stream even when loading or parsing fails.
        try (InputStream in = new BufferedInputStream(new FileInputStream("conf/spider.properties"))) {
            Properties properties = new Properties();
            properties.load(in);

            // Read the parameters from the configuration file.
            SpiderParams.WORKER_NUM = readIntProperty(properties, "spider.threadNum");
            SpiderParams.DEYLAY_TIME = readIntProperty(properties, "spider.fetchDelay");
        }
        catch (IOException e) {
            // Also covers FileNotFoundException (a subclass of IOException).
            e.printStackTrace();
        }
    }

    /**
     * Reads a required integer property.
     *
     * @param properties the loaded configuration
     * @param key the property name to look up
     * @return the parsed integer value
     * @throws NumberFormatException if the property is absent or not an integer
     */
    private static int readIntProperty(Properties properties, String key){
        String raw = properties.getProperty(key);
        if (raw == null) {
            throw new NumberFormatException("Missing required property: " + key);
        }
        try {
            // Properties.load keeps trailing whitespace in values, so trim before parsing.
            return Integer.parseInt(raw.trim());
        }
        catch (NumberFormatException e) {
            NumberFormatException detailed = new NumberFormatException(
                    "Invalid integer for property " + key + ": \"" + raw + "\"");
            detailed.initCause(e);
            throw detailed;
        }
    }

    /**
     * Prepares the initial links to crawl by adding them to {@link UrlQueue}.
     */
    private static void initializeQueue(){
        // For example, to crawl Cofeed pages, generate the URLs from the link
        // pattern and put them into the pending queue,
        // e.g. http://www.cofeed.com/national_1.html
        // NOTE(review): the loop produces national_0 .. national_2, while the
        // example URL above starts at 1 — confirm the intended start index.
        for(int i = 0; i < 3; i += 1){
            UrlQueue.addElement("http://www.cofeed.com/national_" + i+".html");
        }
    }

}

java实现多线程使用多个代理ip的方式爬取网页页面内容

实现效果图：

java实现多线程使用多个代理ip的方式爬取网页页面内容

由于项目代码较多，这里不再一一粘贴；如需获取完整源码，可在博客下方留言。

秒客网

java实现多线程使用多个代理ip的方式爬取网页页面内容

相关文章