做采集的时候,可以使用file_get_contents()去获取网页源代码,但是使用file_get_contents采集,速度慢,而且超时时间,不好控制。如果采集的页面不存在,需要等待的时间很长。一般来说,curl的速度最快,其次是socket,最后是file_get_contents。 现在跟大家分享一个很强大的采集类,会根据你的服务器当前的配置,自动选择最快的方式。已经封装了curl和socket,file_get_contents
用法很简单: 1,采用get方法请求 Http::doGet(网址);//超市时间可忽略,默认是5秒 Http::doGet(网址,超时时间); 如echo Http::doGet('http://www.baidu.com');
2,采用post方法请求 Http::doPost(网址,数据,超时时间);
如 $url='http://www.canphp.com/test.php'; $data['name']='单骑'; $data['email']='admin@canphp.com'; Http::doPost($url,$data,10);
test.php页面接收数据 $_POST['name']; $_POST['email'];
这个http类不仅可以用来采集,还有一个很强大的作用,模拟php异步多进程。 比如有index.php和a.php, b.php, c.php 在index.php中 Http::doGet('http://www.canphp.com/a.php',1); Http::doGet('http://www.canphp.com/b.php',1); Http::doGet('http://www.canphp.com/c.php',1);
a.php, b.php, c.php程序分别在头部加上ignore_user_abort(true); 那么就可以实现多进程了。
原理: 通过curl或socket发送请求给a.php, b.php, c.php,由于超时时间比较短,只是触发了a.php, b.php, c.php三个页面,不需要等待数据返回,连接已中断,但是a.php, b.php, c.php程序中加上了ignore_user_abort(true);忽略客户端连接,还会继续执行。
具体案例可以观看很邪恶很强大的av程序(http://www.canphp.com/bbs/thread-295-1-1.html)
- <?php
- //数据采集,doGET,doPOST
- class Http
- {//类定义开始
- //通过get方式获取数据
- static public function doGet($url,$timeout=5)
- {
- $code=self::getSupport();
- switch($code)
- {
- case 1:return self::curl($url,'',$timeout);break;
- case 2:return self::socketGet($url,$timeout);break;
- case 3:return @file_get_contents($url);break;
- default:return false;
- }
- }
- //通过POST方式发送数据
- static public function doPost($url,$data,$timeout=5)
- {
- $code=self::getSupport();
- switch($code)
- {
- case 1:return self::curl($url,$data,$timeout);break;
- case 2:return self::socketPost($url,$data,$timeout);break;
- default:return false;
- }
- }
- //获取支持读取远程文件的方式
- static public function getSupport()
- {
- if(function_exists('curl_init'))//curl方式
- {
- return 1;
- }
- else if(function_exists('fsockopen'))//socket
- {
- return 2;
- }
- else if(function_exists('file_get_contents'))//php系统函数file_get_contents
- {
- return 3;
- }
- else if(ini_get('allow_url_fopen')&&function_exists('fopen'))//php系统函数fopen
- {
- return 4;
- }
- else
- {
- return 0;
- }
- }
- static public function GetHttpContent($fsock=null) {
- $out = null;
- while($buff = @fgets($fsock, 2048)){
- $out .= $buff;
- }
- fclose($fsock);
- $pos = strpos($out, "\r\n\r\n");
- $head = substr($out, 0, $pos); //http head
- $status = substr($head, 0, strpos($head, "\r\n")); //http status line
- $body = substr($out, $pos + 4, strlen($out) - ($pos + 4));//page body
- if(preg_match("/^HTTP\/\d\.\d\s([\d]+)\s.*$/", $status, $matches)){
- if(intval($matches[1]) / 100 == 2){
- return $body;
- }else{
- return false;
- }
- }else{
- return false;
- }
- }
- static public function socketGet($url,$timeout=5){
- $url2 = parse_url($url);
- $url2["path"] = isset($url2["path"])? $url2["path"]: "/" ;
- $url2["port"] = isset($url2["port"])? $url2["port"] : 80;
- $url2["query"] = isset($url2["query"])? "?".$url2["query"] : "";
- $host_ip = @gethostbyname($url2["host"]);
- $fsock_timeout = $timeout; //超时时间
- if(($fsock = fsockopen($host_ip, $url2['port'], $errno, $errstr, $fsock_timeout)) < 0){
- return false;
- }
- $request = $url2["path"] .$url2["query"];
- $in = "GET " . $request . " HTTP/1.1\r\n";
- $in .= "Accept: */*\r\n";
- // $in .= "User-Agent: Payb-Agent\r\n";
- $in .= "Host: " . $url2["host"] . "\r\n";
- $in .= "Connection: Close\r\n\r\n";
- if(!@fwrite($fsock, $in, strlen($in))){
- @fclose($fsock);
- return false;
- }
- return self::GetHttpContent($fsock);
- }
- static public function socketPost($url,$post_data=array(),$timeout=5){
- $url2 = parse_url($url);
- $url2["path"] = ($url2["path"] == "" ? "/" : $url2["path"]);
- $url2["port"] = ($url2["port"] == "" ? 80 : $url2["port"]);
- $host_ip = @gethostbyname($url2["host"]);
- $fsock_timeout = $timeout; //超时时间
- if(($fsock = fsockopen($host_ip, $url2['port'], $errno, $errstr, $fsock_timeout)) < 0){
- return false;
- }
- $request = $url2["path"].($url2["query"] ? "?" . $url2["query"] : "");
- $post_data2 = http_build_query($post_data);
- $in = "POST " . $request . " HTTP/1.1\r\n";
- $in .= "Accept: */*\r\n";
- $in .= "Host: " . $url2["host"] . "\r\n";
- // $in .= "User-Agent: Lowell-Agent\r\n";
- $in .= "Content-type: application/x-www-form-urlencoded\r\n";
- $in .= "Content-Length: " . strlen($post_data2) . "\r\n";
- $in .= "Connection: Close\r\n\r\n";
- $in .= $post_data2 . "\r\n\r\n";
- unset($post_data2);
- if(!@fwrite($fsock, $in, strlen($in))){
- @fclose($fsock);
- return false;
- }
- return self::GetHttpContent($fsock);
- }
- static public function curl($url, $data=array(), $timeout=5)
- {
- $ch = curl_init();
- if (is_array($data) && $data)
- {
- $formdata = http_build_query($data);
- curl_setopt($ch, CURLOPT_POST, true);
- curl_setopt($ch, CURLOPT_POSTFIELDS, $formdata);
- }
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
- curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
- $result = curl_exec($ch);
- curl_close($ch);
- return $result;
- }
- }//类定义结束
- ?>
复制代码
|
<?php
//数据采集,doGET,doPOST
class Http
{//类定义开始
//通过get方式获取数据
static public function doGet($url,$timeout=5)
{
$code=self::getSupport();
switch($code)
{
case 1:return self::curl($url,'',$timeout);break;
case 2:return self::socketGet($url,$timeout);break;
case 3:return @file_get_contents($url);break;
default:return false;
}
}
//通过POST方式发送数据
static public function doPost($url,$data,$timeout=5)
{
$code=self::getSupport();
switch($code)
{
case 1:return self::curl($url,$data,$timeout);break;
case 2:return self::socketPost($url,$data,$timeout);break;
default:return false;
}
}
//获取支持读取远程文件的方式
static public function getSupport()
{
if(function_exists('curl_init'))//curl方式
{
return 1;
}
else if(function_exists('fsockopen'))//socket
{
return 2;
}
else if(function_exists('file_get_contents'))//php系统函数file_get_contents
{
return 3;
}
else if(ini_get('allow_url_fopen')&&function_exists('fopen'))//php系统函数fopen
{
return 4;
}
else
{
return 0;
}
}
static public function GetHttpContent($fsock=null) {
$out = null;
while($buff = @fgets($fsock, 2048)){
$out .= $buff;
}
fclose($fsock);
$pos = strpos($out, "\r\n\r\n");
$head = substr($out, 0, $pos); //http head
$status = substr($head, 0, strpos($head, "\r\n")); //http status line
$body = substr($out, $pos + 4, strlen($out) - ($pos + 4));//page body
if(preg_match("/^HTTP\/\d\.\d\s([\d]+)\s.*$/", $status, $matches)){
if(intval($matches[1]) / 100 == 2){
return $body;
}else{
return false;
}
}else{
return false;
}
}
static public function socketGet($url,$timeout=5){
$url2 = parse_url($url);
$url2["path"] = isset($url2["path"])? $url2["path"]: "/" ;
$url2["port"] = isset($url2["port"])? $url2["port"] : 80;
$url2["query"] = isset($url2["query"])? "?".$url2["query"] : "";
$host_ip = @gethostbyname($url2["host"]);
$fsock_timeout = $timeout; //超时时间
if(($fsock = fsockopen($host_ip, $url2['port'], $errno, $errstr, $fsock_timeout)) < 0){
return false;
}
$request = $url2["path"] .$url2["query"];
$in = "GET " . $request . " HTTP/1.1\r\n";
$in .= "Accept: */*\r\n";
// $in .= "User-Agent: Payb-Agent\r\n";
$in .= "Host: " . $url2["host"] . "\r\n";
$in .= "Connection: Close\r\n\r\n";
if(!@fwrite($fsock, $in, strlen($in))){
@fclose($fsock);
return false;
}
return self::GetHttpContent($fsock);
}
static public function socketPost($url,$post_data=array(),$timeout=5){
$url2 = parse_url($url);
$url2["path"] = ($url2["path"] == "" ? "/" : $url2["path"]);
$url2["port"] = ($url2["port"] == "" ? 80 : $url2["port"]);
$host_ip = @gethostbyname($url2["host"]);
$fsock_timeout = $timeout; //超时时间
if(($fsock = fsockopen($host_ip, $url2['port'], $errno, $errstr, $fsock_timeout)) < 0){
return false;
}
$request = $url2["path"].($url2["query"] ? "?" . $url2["query"] : "");
$post_data2 = http_build_query($post_data);
$in = "POST " . $request . " HTTP/1.1\r\n";
$in .= "Accept: */*\r\n";
$in .= "Host: " . $url2["host"] . "\r\n";
// $in .= "User-Agent: Lowell-Agent\r\n";
$in .= "Content-type: application/x-www-form-urlencoded\r\n";
$in .= "Content-Length: " . strlen($post_data2) . "\r\n";
$in .= "Connection: Close\r\n\r\n";
$in .= $post_data2 . "\r\n\r\n";
unset($post_data2);
if(!@fwrite($fsock, $in, strlen($in))){
@fclose($fsock);
return false;
}
return self::GetHttpContent($fsock);
}
static public function curl($url, $data=array(), $timeout=5)
{
$ch = curl_init();
if (is_array($data) && $data)
{
$formdata = http_build_query($data);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $formdata);
}
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
$result = curl_exec($ch);
curl_close($ch);
return $result;
}
}//类定义结束
?>