PHP 结合 cURL 实现并发抓取(基于 curl_multi,即常说的“多线程”抓取)
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
<?php
/*
curl 多线程抓取
*/
/**
 * Fetch several URLs concurrently using the curl_multi API.
 *
 * Each URL gets its own easy handle; all handles are driven together by one
 * multi handle, so the total wall time is roughly that of the slowest request.
 *
 * @param array $array   URLs to fetch; the keys are preserved in the result arrays
 * @param int   $timeout Per-request timeout in seconds (passed to CURLOPT_TIMEOUT)
 * @return array array(
 *                   'diff_time' => float, elapsed wall time in seconds,
 *                   'return'    => array of response bodies, keyed like $array,
 *                   'header'    => array of curl_getinfo() results, keyed like $array
 *               )
 */
function Curl_http( $array , $timeout ){
    $res    = array();
    $header = array(); // initialised up front so an empty URL list still yields an array
    $conn   = array();
    $mh     = curl_multi_init(); // multi handle that drives all easy handles
    $startime = microtime(true);

    foreach ( $array as $k => $url ) {
        // Surrounding whitespace is never part of a valid URL; strip it before use.
        $conn[$k] = curl_init( trim( $url ) );
        curl_setopt( $conn[$k], CURLOPT_TIMEOUT, $timeout ); // overall request timeout
        curl_setopt( $conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)' );
        curl_setopt( $conn[$k], CURLOPT_MAXREDIRS, 7 ); // maximum number of redirects to follow
        curl_setopt( $conn[$k], CURLOPT_HEADER, 0 ); // do not include response headers in the body
        curl_setopt( $conn[$k], CURLOPT_FOLLOWLOCATION, 1 ); // follow 3xx redirects
        curl_setopt( $conn[$k], CURLOPT_RETURNTRANSFER, 1 ); // return the body instead of printing it
        curl_multi_add_handle( $mh, $conn[$k] );
    }

    // Kick off the transfers.
    $active = null;
    do {
        $mrc = curl_multi_exec( $mh, $active );
    } while ( $mrc == CURLM_CALL_MULTI_PERFORM );

    // Drive the transfers until none is active, blocking in curl_multi_select()
    // between rounds so the loop does not burn CPU.
    while ( $active && $mrc == CURLM_OK ) {
        if ( curl_multi_select( $mh ) == -1 ) {
            // select could not wait on anything; pause briefly and still call
            // curl_multi_exec() below, otherwise $active never changes and the
            // loop would spin forever.
            usleep( 1000 );
        }
        do {
            $mrc = curl_multi_exec( $mh, $active );
        } while ( $mrc == CURLM_CALL_MULTI_PERFORM );
    }

    foreach ( $conn as $k => $ch ) {
        $res[$k]    = curl_multi_getcontent( $ch ); // response body
        $header[$k] = curl_getinfo( $ch );          // transfer info (URL, HTTP code, timings, ...)
        // Detach the handle from the multi handle before closing it.
        curl_multi_remove_handle( $mh, $ch );
        curl_close( $ch );
    }
    curl_multi_close( $mh );

    $endtime   = microtime(true);
    $diff_time = $endtime - $startime;

    return array(
        'diff_time' => $diff_time,
        'return'    => $res,
        'header'    => $header
    );
}
/**
 * Return the current Unix timestamp with microsecond precision.
 *
 * @return float seconds since the Unix epoch, including the fractional part
 */
function getmicrotime() {
    // microtime(true) returns the float directly, so there is no need to
    // split and re-add the "usec sec" string form.
    return microtime(true);
}
// Demo: fetch three sites concurrently.
$array = array (
    "http://www.weibo.com/",
    "http://www.renren.com/",
    "http://www.qq.com/"
);
$data = Curl_http( $array , 10 ); // 10-second timeout per request
var_dump( $data ); // dump elapsed time, response bodies and transfer info
// Note: when a POST body is larger than 1024 bytes, curl does not send the
// body immediately; it first sends an "Expect: 100-continue" header.
// To avoid that, send an empty Expect header with the request:
//   curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));
?>
|
我们再来看几个例子
(1)下面这段代码并发抓取多个 URL,并将抓取到的页面源码直接写入指定的文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
// Example 1: fetch several pages concurrently and let curl stream each
// response straight into one output file.
$urls = array (
    'http://www.zzvips.com/',
    'http://www.google.com/',
    'http://www.example.com/'
); // pages to fetch
$save_to = '/test.txt' ; // fetched page source is appended to this file
$st = fopen ( $save_to , "a" );
if ( $st === false ) {
    // Without a valid stream CURLOPT_FILE cannot be set; stop early.
    throw new RuntimeException( "Unable to open {$save_to} for appending" );
}
$mh = curl_multi_init();
$conn = array (); // start clean in case an earlier snippet left handles behind
foreach ( $urls as $i => $url ) {
    $conn [ $i ] = curl_init( $url );
    curl_setopt( $conn [ $i ], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)" );
    curl_setopt( $conn [ $i ], CURLOPT_HEADER , 0);
    curl_setopt( $conn [ $i ], CURLOPT_CONNECTTIMEOUT, 60);
    // Write the response body directly to the file.
    // NOTE(review): all handles share one stream, so data from different
    // responses may interleave in the file - confirm this is acceptable.
    curl_setopt( $conn [ $i ], CURLOPT_FILE, $st );
    curl_multi_add_handle ( $mh , $conn [ $i ]);
} // set-up
$active = null;
do {
    $status = curl_multi_exec( $mh , $active );
    // Block until there is socket activity instead of busy-looping; if select
    // cannot wait (-1), pause briefly so the loop does not spin.
    if ( $active && $status === CURLM_OK && curl_multi_select( $mh ) === -1 ) {
        usleep( 1000 );
    }
} while ( $active && ( $status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM ) ); // run transfers
foreach ( $conn as $ch ) {
    curl_multi_remove_handle( $mh , $ch );
    curl_close( $ch );
} // clean-up
curl_multi_close( $mh );
fclose( $st );
|
(2)下面这段代码与上一个例子功能类似,区别在于先将抓取到的页面源码保存到变量中,再把变量内容写入指定的文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
// Example 2: fetch several pages concurrently, collect each response body as
// a string, then append the bodies to one output file in URL order.
$urls = array (
    'http://www.zzvips.com/',
    'http://www.google.com/',
    'http://www.example.com/'
);
$save_to = '/test.txt' ; // fetched page source is appended to this file
$st = fopen ( $save_to , "a" );
if ( $st === false ) {
    // Nothing can be saved without the output file; stop early.
    throw new RuntimeException( "Unable to open {$save_to} for appending" );
}
$mh = curl_multi_init();
$conn = array (); // start clean in case an earlier snippet left handles behind
foreach ( $urls as $i => $url ) {
    $conn [ $i ] = curl_init( $url );
    curl_setopt( $conn [ $i ], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)" );
    curl_setopt( $conn [ $i ], CURLOPT_HEADER , 0);
    curl_setopt( $conn [ $i ], CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt( $conn [ $i ], CURLOPT_RETURNTRANSFER, true); // return the body as a string instead of printing it
    curl_multi_add_handle ( $mh , $conn [ $i ]);
}
$active = null;
do {
    $status = curl_multi_exec( $mh , $active );
    // Block until there is socket activity instead of busy-looping; if select
    // cannot wait (-1), pause briefly so the loop does not spin.
    if ( $active && $status === CURLM_OK && curl_multi_select( $mh ) === -1 ) {
        usleep( 1000 );
    }
} while ( $active && ( $status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM ) );
foreach ( $conn as $ch ) {
    $data = curl_multi_getcontent( $ch ); // response body as a string
    if ( is_string( $data ) && fwrite( $st , $data ) === false ) {
        trigger_error( "Failed to write fetched data to {$save_to}", E_USER_WARNING );
    }
    curl_multi_remove_handle( $mh , $ch );
    curl_close( $ch );
} // save each body, then release its handle
curl_multi_close( $mh );
fclose( $st );
|
(3)下面这段代码利用 PHP 的 cURL 函数(curl_multi 系列)并发下载多个文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
// Example 3: download several files concurrently, each into its own file
// under $save_to. A URL whose target file already exists is skipped.
$urls = array (
    'http://www.zzvips.com/5w.zip',
    'http://www.zzvips.com/5w.zip',
    'http://www.zzvips.com/5w.zip'
);
$save_to = './home/' ; // destination directory (must already exist)
$mh = curl_multi_init();
$conn = array (); // only URLs that are actually downloaded get an entry here
$fp = array ();
foreach ( $urls as $i => $url ) {
    $g = $save_to . basename ( $url ); // local path: directory + last URL segment
    if ( is_file ( $g ) ) {
        continue; // already present (or claimed by an earlier URL with the same name)
    }
    $handle = fopen ( $g , "w" );
    if ( $handle === false ) {
        // CURLOPT_FILE needs a valid stream; skip this URL rather than crash.
        trigger_error( "Unable to open {$g} for writing", E_USER_WARNING );
        continue;
    }
    $fp [ $i ] = $handle;
    $conn [ $i ] = curl_init( $url );
    curl_setopt( $conn [ $i ], CURLOPT_USERAGENT, "Mozilla/4.0(compatible; MSIE 7.0; Windows NT 6.0)" );
    curl_setopt( $conn [ $i ], CURLOPT_FILE, $fp [ $i ]); // stream the body into the file
    curl_setopt( $conn [ $i ], CURLOPT_HEADER , 0);
    curl_setopt( $conn [ $i ], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle( $mh , $conn [ $i ]);
}
$active = null;
do {
    $n = curl_multi_exec( $mh , $active );
    // Block until there is socket activity instead of busy-looping; if select
    // cannot wait (-1), pause briefly so the loop does not spin.
    if ( $active && $n === CURLM_OK && curl_multi_select( $mh ) === -1 ) {
        usleep( 1000 );
    }
} while ( $active && ( $n === CURLM_OK || $n === CURLM_CALL_MULTI_PERFORM ) );
// Iterate over the handles that were actually created, not over $urls:
// skipped URLs have no entry in $conn / $fp.
foreach ( $conn as $i => $ch ) {
    curl_multi_remove_handle( $mh , $ch );
    curl_close( $ch );
    fclose( $fp [ $i ]);
}
curl_multi_close( $mh );
|
以上所述就是本文的全部内容了,希望大家能够喜欢。