想要采集的某个站，貌似被“反采集”了，懂采集的朋友帮看看

http://price.pcauto.com.cn/dictionary/category/

这个站，我用了以下三种方式，都无法采集：



// Attempt 1: read whatever $url points at into a string with a single call.
// NOTE(review): $url is not assigned anywhere in this excerpt; it must be set
// before this line runs.
$fcontents = file_get_contents($url);



	##获取目标页面文件流并转换成字符串形式采集

	/**
	 * Read the whole resource at $url and return it as a single string.
	 *
	 * Terminates the script with die() when the resource cannot be opened.
	 *
	 * @param string $url Path or URL readable through PHP's stream wrappers.
	 * @return string The complete contents; an empty resource yields "".
	 */
	function openfile($url)
	{
		// Read once. Calling file() in the condition and again in the body
		// would fetch a remote page twice.
		$lines = file($url);

		// Only a real failure is fatal; an empty array just means the
		// resource was empty.
		if ($lines === false) {
			die("文件打开失败");
		}

		// file() keeps the line terminators, so joining with '' restores the
		// original byte sequence.
		return implode('', $lines);
	}



	#通过获取文件流，采集

	/**
	 * Issue a bare HTTP/1.0 request over a raw socket and return the reply.
	 *
	 * The URL scheme is ignored: the connection is always plain TCP to the
	 * URL's port, or to port 80 when none is given. No request body is sent.
	 *
	 * @param string $url    Absolute URL; it must contain a host.
	 * @param string $method HTTP method placed on the request line.
	 * @return string|null   Everything the server sent after the status line
	 *                       (response headers followed by the body) when the
	 *                       status is 200; null for any other status; a
	 *                       human-readable message string when the URL is
	 *                       unusable or the connection fails.
	 */
	function request_url($url, $method = 'POST')
	{
		$parts = parse_url($url);
		// Without a host there is nothing to connect to.
		if (!$parts || empty($parts['host'])) {
			return '链接不完整';
		}

		$host  = $parts['host'];
		$port  = !empty($parts['port']) ? (int) $parts['port'] : 80;
		// A URL such as "http://example.com" has no path component.
		$path  = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
		$query = isset($parts['query']) ? $parts['query'] : '';
		$target = $path . ($query !== '' ? '?' . $query : '');

		$fp = fsockopen($host, $port);
		if (!$fp) {
			return '不能连接' . $host . '服务器';
		}

		// HTTP header lines are terminated by CRLF, not a bare LF.
		$hostHeader = ($port === 80) ? $host : $host . ':' . $port;
		$request = "$method $target HTTP/1.0\r\n"
			. "Host: $hostHeader\r\n"
			. "Content-type: application/x-www-form-urlencoded\r\n";
		// No body is sent, so say so explicitly for methods that normally
		// carry one; some servers reject a POST without Content-Length.
		if (!in_array(strtoupper($method), array('GET', 'HEAD'), true)) {
			$request .= "Content-Length: 0\r\n";
		}
		$request .= "Connection: close\r\n\r\n";
		fputs($fp, $request);

		// The first line is the status line; only a 200 reply is accepted.
		// preg_match replaces eregi(), which no longer exists in PHP 7+.
		$status = fgets($fp, 1024);
		if (!preg_match('/^HTTP\/1\.. 200/i', (string) $status)) {
			fclose($fp); // do not leak the socket on the early return
			return;
		}

		// "Connection: close" means the reply ends at EOF.
		$results = stream_get_contents($fp);
		fclose($fp);

		return ($results === false) ? '' : $results;
	}

用第一种，得到的是乱码，无论如何编码转换，都是乱码；
用后面两种，得到的均是空白（第三种能看到部分文件头）。

我把页面源文件复制下来，放入一个文件内，并采集这个文件，都可以的，所以，想来是这个页面做了“反采集”。

跪求高手，谢谢。

12 个解决方案

#1

还没搞定，继续求助

#2

要采集就用curl.

#3

确实做了“反采集”,无法抓取！

#4

不过是 gzip 压缩了而已

$url = 'http://price.pcauto.com.cn/dictionary/category/';

// Fetch the raw response body; stop if the request itself failed.
$s = file_get_contents($url);
if ($s === false) {
    die("文件打开失败");
}

// Decode only when the body starts with the gzip magic bytes (1f 8b), so an
// uncompressed reply is printed unchanged instead of being rejected.
echo (strncmp($s, "\x1f\x8b", 2) === 0) ? gzdecode($s) : $s;


if (!function_exists('_gzdecode_rfc1952')) {
    /**
     * Pure-PHP decoder for a single gzip member (RFC 1952).
     *
     * @param string $data Raw gzip bytes.
     * @return string|false|null The decompressed payload. null when $data is
     *                           not gzip, has reserved flag bits set, or has
     *                           no room for a body. false when the header is
     *                           malformed, the method is unknown, inflation
     *                           fails, or the size/CRC trailer does not match.
     */
    function _gzdecode_rfc1952($data)
    {
        $len = strlen($data);
        // Smallest member: 10-byte fixed header plus 8-byte trailer.
        if ($len < 18 || strncmp($data, "\x1f\x8b", 2) !== 0) {
            return null;  // Not GZIP format (See RFC 1952)
        }

        $method = ord($data[2]);  // Compression method
        $flags  = ord($data[3]);  // Flags

        // Parenthesised on purpose: "!=" binds tighter than "&".
        if (($flags & 0x1f) !== $flags) {
            // Reserved bits are set -- NOT ALLOWED by RFC 1952
            return null;
        }

        // Bytes 4-9 (MTIME, XFL, OS) are not needed for decoding.
        $headerlen = 10;

        if ($flags & 0x04) {
            // FEXTRA: 2-byte little-endian length, then that many bytes.
            if ($len - $headerlen - 2 < 8) {
                return false;    // Invalid format
            }
            $extralen = unpack('v', substr($data, $headerlen, 2));
            $headerlen += 2 + $extralen[1];
            if ($len - $headerlen < 8) {
                return false;    // Invalid format
            }
        }

        if ($flags & 0x08) {
            // FNAME: NUL-terminated string starting at the current offset.
            $nul = strpos($data, "\0", $headerlen);
            if ($nul === false || $len - ($nul + 1) < 8) {
                return false;    // Invalid format
            }
            $headerlen = $nul + 1;
        }

        if ($flags & 0x10) {
            // FCOMMENT: NUL-terminated string starting at the current offset.
            $nul = strpos($data, "\0", $headerlen);
            if ($nul === false || $len - ($nul + 1) < 8) {
                return false;    // Invalid header format
            }
            $headerlen = $nul + 1;
        }

        if ($flags & 0x02) {
            // FHCRC (bit 1; bit 0 is FTEXT): low 16 bits of the CRC32 of all
            // header bytes that precede this field.
            if ($len - $headerlen - 2 < 8) {
                return false;    // Invalid format
            }
            $expected = pack('v', crc32(substr($data, 0, $headerlen)) & 0xffff);
            if (substr($data, $headerlen, 2) !== $expected) {
                return false;    // Bad header CRC
            }
            $headerlen += 2;
        }

        // Whatever lies between the header and the 8-byte trailer is the body.
        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            return null;
        }

        if ($method !== 8) {
            return false;        // Only "deflate" is defined by RFC 1952
        }

        $inflated = gzinflate(substr($data, $headerlen, $bodylen));
        if ($inflated === false) {
            return false;
        }

        // Trailer: CRC32 then ISIZE (length mod 2^32), both little-endian.
        // Comparing packed bytes avoids signed/unsigned integer differences
        // between 32-bit and 64-bit PHP builds.
        if (pack('V', crc32($inflated)) !== substr($data, -8, 4)
            || pack('V', strlen($inflated)) !== substr($data, -4)
        ) {
            return false;        // Length or CRC doesn't match
        }

        return $inflated;
    }
}

// PHP >= 5.4 with zlib already provides gzdecode(); declaring it again is a
// fatal error, so the fallback is only defined when it is missing.
// Conditional definitions are not hoisted: on a PHP build without the
// built-in, this code must run before the first call to gzdecode().
if (!function_exists('gzdecode')) {
    /**
     * Decode a gzip-compressed string.
     *
     * @param string $data Raw gzip bytes.
     * @return string|false|null See _gzdecode_rfc1952().
     */
    function gzdecode($data)
    {
        return _gzdecode_rfc1952($data);
    }
}

#5

确实被GZIP压缩，解压后正常！
     /// <summary>
     /// Downloads the page at <paramref name="url"/> and returns its text,
     /// decoded as GB2312.
     /// </summary>
     /// <param name="url">Absolute HTTP URL of the page to fetch.</param>
     /// <returns>The response body as a string.</returns>
     private static string DownloadWebPage(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Credentials = CredentialCache.DefaultCredentials;
            // Let the framework advertise and undo gzip/deflate. Wrapping the
            // stream in GZipStream unconditionally throws whenever the server
            // answers without compression.
            request.AutomaticDecompression =
                DecompressionMethods.GZip | DecompressionMethods.Deflate;

            // using-blocks release the response and streams even when reading
            // throws part-way through.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream dataStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(dataStream, Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }
        /// <summary>
        /// Program entry point: fetches the category page once. The downloaded
        /// text is not used further.
        /// </summary>
        /// <param name="args">Command-line arguments (ignored).</param>
        static void Main(string[] args)
        {
            const string categoryPageUrl = "http://price.pcauto.com.cn/dictionary/category/";
            DownloadWebPage(categoryPageUrl);
        }

#6

NIU

#7

引用 4 楼 xuzuning 的回复:

不过是 gzip 压缩了而已
PHP code
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$s = file_get_contents($url);
echo gzdecode($s);

function gzdecode($data) {
$len = strlen($data);
if ($len <……

太牛了

#8

不过是 gzip 压缩了而已

#9

引用 4 楼 xuzuning 的回复:

不过是 gzip 压缩了而已

PHP code
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$s = file_get_contents($url);
echo gzdecode($s);

function gzdecode($data) {
$len = strlen($data);
if ……

版主很强大，请教一下：是如何看出来是经过 gzip 压缩的？以后学习留作参考。

#10

膜拜

#11

呵呵，谢谢了，又学了一招

我那问题已经搞定了，不过，是用 .NET 采集的

#12

如果被“反采集”，用file_get_contents采集到的应该是空白，但是我采集的是乱码，想必就因为这个，才判断是gzip压缩吧

#1

还没搞定，继续求助

#2

要采集就用curl.

#3

确实做了“反采集”,无法抓取！

#4

不过是 gzip 压缩了而已

$url = 'http://price.pcauto.com.cn/dictionary/category/';

// Fetch the raw response body; stop if the request itself failed.
$s = file_get_contents($url);
if ($s === false) {
    die("文件打开失败");
}

// Decode only when the body starts with the gzip magic bytes (1f 8b), so an
// uncompressed reply is printed unchanged instead of being rejected.
echo (strncmp($s, "\x1f\x8b", 2) === 0) ? gzdecode($s) : $s;


if (!function_exists('_gzdecode_rfc1952')) {
    /**
     * Pure-PHP decoder for a single gzip member (RFC 1952).
     *
     * @param string $data Raw gzip bytes.
     * @return string|false|null The decompressed payload. null when $data is
     *                           not gzip, has reserved flag bits set, or has
     *                           no room for a body. false when the header is
     *                           malformed, the method is unknown, inflation
     *                           fails, or the size/CRC trailer does not match.
     */
    function _gzdecode_rfc1952($data)
    {
        $len = strlen($data);
        // Smallest member: 10-byte fixed header plus 8-byte trailer.
        if ($len < 18 || strncmp($data, "\x1f\x8b", 2) !== 0) {
            return null;  // Not GZIP format (See RFC 1952)
        }

        $method = ord($data[2]);  // Compression method
        $flags  = ord($data[3]);  // Flags

        // Parenthesised on purpose: "!=" binds tighter than "&".
        if (($flags & 0x1f) !== $flags) {
            // Reserved bits are set -- NOT ALLOWED by RFC 1952
            return null;
        }

        // Bytes 4-9 (MTIME, XFL, OS) are not needed for decoding.
        $headerlen = 10;

        if ($flags & 0x04) {
            // FEXTRA: 2-byte little-endian length, then that many bytes.
            if ($len - $headerlen - 2 < 8) {
                return false;    // Invalid format
            }
            $extralen = unpack('v', substr($data, $headerlen, 2));
            $headerlen += 2 + $extralen[1];
            if ($len - $headerlen < 8) {
                return false;    // Invalid format
            }
        }

        if ($flags & 0x08) {
            // FNAME: NUL-terminated string starting at the current offset.
            $nul = strpos($data, "\0", $headerlen);
            if ($nul === false || $len - ($nul + 1) < 8) {
                return false;    // Invalid format
            }
            $headerlen = $nul + 1;
        }

        if ($flags & 0x10) {
            // FCOMMENT: NUL-terminated string starting at the current offset.
            $nul = strpos($data, "\0", $headerlen);
            if ($nul === false || $len - ($nul + 1) < 8) {
                return false;    // Invalid header format
            }
            $headerlen = $nul + 1;
        }

        if ($flags & 0x02) {
            // FHCRC (bit 1; bit 0 is FTEXT): low 16 bits of the CRC32 of all
            // header bytes that precede this field.
            if ($len - $headerlen - 2 < 8) {
                return false;    // Invalid format
            }
            $expected = pack('v', crc32(substr($data, 0, $headerlen)) & 0xffff);
            if (substr($data, $headerlen, 2) !== $expected) {
                return false;    // Bad header CRC
            }
            $headerlen += 2;
        }

        // Whatever lies between the header and the 8-byte trailer is the body.
        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            return null;
        }

        if ($method !== 8) {
            return false;        // Only "deflate" is defined by RFC 1952
        }

        $inflated = gzinflate(substr($data, $headerlen, $bodylen));
        if ($inflated === false) {
            return false;
        }

        // Trailer: CRC32 then ISIZE (length mod 2^32), both little-endian.
        // Comparing packed bytes avoids signed/unsigned integer differences
        // between 32-bit and 64-bit PHP builds.
        if (pack('V', crc32($inflated)) !== substr($data, -8, 4)
            || pack('V', strlen($inflated)) !== substr($data, -4)
        ) {
            return false;        // Length or CRC doesn't match
        }

        return $inflated;
    }
}

// PHP >= 5.4 with zlib already provides gzdecode(); declaring it again is a
// fatal error, so the fallback is only defined when it is missing.
// Conditional definitions are not hoisted: on a PHP build without the
// built-in, this code must run before the first call to gzdecode().
if (!function_exists('gzdecode')) {
    /**
     * Decode a gzip-compressed string.
     *
     * @param string $data Raw gzip bytes.
     * @return string|false|null See _gzdecode_rfc1952().
     */
    function gzdecode($data)
    {
        return _gzdecode_rfc1952($data);
    }
}

#5

#6

NIU

#7

引用 4 楼 xuzuning 的回复:

不过是 gzip 压缩了而已
PHP code
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$s = file_get_contents($url);
echo gzdecode($s);

function gzdecode($data) {
$len = strlen($data);
if ($len <……

太牛了

#8

不过是 gzip 压缩了而已

#9

引用 4 楼 xuzuning 的回复:

不过是 gzip 压缩了而已

PHP code
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$s = file_get_contents($url);
echo gzdecode($s);

function gzdecode($data) {
$len = strlen($data);
if ……

版主很强大，请教一下：是如何看出来是经过 gzip 压缩的？以后学习留作参考。

#10

膜拜

#11

呵呵，谢谢了，又学了一招

我那问题已经搞定了，不过，是用 .NET 采集的

#12

如果被“反采集”，用file_get_contents采集到的应该是空白，但是我采集的是乱码，想必就因为这个，才判断是gzip压缩吧

秒客网

想要采集的某个站，貌似被“反采集”了，懂采集的朋友帮看看

12 个解决方案

#1

#2

#3

#4

#5

#6

#7

#8

#9

#10

#11

#12

#1

#2

#3

#4

#5

#6

#7

#8

#9

#10

#11

#12

相关文章