PHP 简单爬虫:爬取一万条免费代理 IP

时间:2024-03-17 17:05:11

目标站:http://www.xicidaili.com/

代码:

<?php
require 'lib/phpQuery.php';
require 'lib/QueryList.php';
require "db/shared/ez_sql_core.php";
require "db/mysql/ez_sql_mysql.php";
require "public/function.php";
use QL\QueryList;

// Crawl the proxy listing pages of xicidaili.com and hand every table row to
// getIpInfo(), which stores it in the `ip` table.
// NOTE(review): ezSQL_mysql's constructor is documented as
// (user, password, database name, host) - confirm against the bundled version.
$db = new ezSQL_mysql('root', 'root', 'spider', 'localhost');

// Pages 1..100 with up to 100 rows each give at most 10,000 records.
for ($j = 1; $j <= 100; $j++) {
    $gurl = "http://www.xicidaili.com/nn/" . $j;
    $html = curl_request($gurl);

    // NOTE(review): curl_request() is defined in public/function.php, which is
    // not visible here; presumably it returns false or an empty string when
    // the request fails. Skip such pages instead of parsing nothing 100 times.
    if (empty($html)) {
        echo "" . $j . "页抓取失败,已跳过" . PHP_EOL;
        continue;
    }

    // Row indices start at 1; index 0 is presumably the table header row.
    for ($i = 1; $i <= 100; $i++) {
        getIpInfo($html, $i, $db);
    }
    echo "" . $j . "页完成" . PHP_EOL;
}




/**
 * Extract one row of the "#ip_list" table from a listing page and insert it
 * into the `ip` table.
 *
 * Rows that cannot be found in the HTML (or that yield an empty IP cell) are
 * skipped silently, so nothing is inserted for them.
 *
 * @param string $html Raw HTML of one listing page.
 * @param int    $t    Zero-based row index used in the "tr:eq($t)" selector.
 * @param object $db   Database handle; the caller passes an ezSQL_mysql
 *                     instance. Its escape() and query() methods are used.
 * @return void
 */
function getIpInfo($html,$t,$db){
    $row = "#ip_list tr:eq($t)";
    $rules = array(
        // Each rule is: CSS selector, then which part of the match to collect.
        'ip' => array("$row td:eq(1)",'text'),        // IP address
        'port' => array("$row td:eq(2)",'text'),      // port
        'area' => array("$row td:eq(3)",'text'),      // location
        'anonymous' => array("$row td:eq(4)",'text'), // anonymity level
        'type' => array("$row td:eq(5)",'text'),      // protocol type
        // Speed is derived from the cell's inner HTML: the number between the
        // first ':' and the following '%' (presumably a CSS "width:NN%" on the
        // speed bar - confirm against the live markup).
        'speed' => array("$row td:eq(6)",'html','',function($content){
            $parts = explode(':', (string) $content);
            if (!isset($parts[1])) {
                // No percentage found: same label the comparison chain below
                // would give for a missing value.
                return "较慢";
            }
            // Cast so the comparisons are numeric on every PHP version.
            $pieces = explode('%', $parts[1]);
            $num = (float) $pieces[0];
            if($num >= 80){
                return "很快";
            }
            if($num >= 60){
                return "一般";
            }
            return "较慢";
        }),
        // td:eq(7) is intentionally not collected.
        'chtime' => array("$row td:eq(8)",'text'),    // uptime
        'yztime' => array("$row td:eq(9)",'text'),    // last verified time
    );
    $data = QueryList::Query($html,$rules)->data;

    // The requested row may not exist (short page, header-only table, failed
    // fetch). Do not insert a blank record in that case.
    if (empty($data[0]) || !isset($data[0]['ip']) || trim($data[0]['ip']) === '') {
        return;
    }
    print_r($data);

    // The values come from a remote page, so escape each one before it is
    // placed into the SQL text.
    $columns = array('ip', 'port', 'area', 'anonymous', 'type', 'speed', 'chtime', 'yztime');
    $values = array();
    foreach ($columns as $column) {
        $value = isset($data[0][$column]) ? $data[0][$column] : '';
        $values[] = "'" . $db->escape($value) . "'";
    }

    $db->query("INSERT INTO ip (ip, port,area,anonymous,type,speed,chtime,yztime) 
        VALUES (" . implode(',', $values) . ")");
}

结果:

完整项目下载:https://files.cnblogs.com/files/wordblog/spider2.rar