I have tried the following three methods on this site, and none of them can scrape it:
$fcontents = file_get_contents($url); // method 1: file_get_contents
## Method 2: read the target page with file() and join the lines into a string
function openfile($url)
{
    $str = file($url); // read the page into an array of lines
    if ($str)
    {
        $file = '';
        $count = count($str);
        for ($i = 0; $i < $count; $i++)
        {
            $file .= $str[$i]; // join the lines back into one string
        }
        return $file;
    }
    else
    { die("Failed to open the file"); }
}
# Method 3: fetch the page over a raw socket (fsockopen)
function request_url($url, $method = 'POST')
{
    $url = parse_url($url); // split the URL into host, path, port, query, ...
    if (!$url) return 'Incomplete URL';
    if (!isset($url['port']))  { $url['port'] = ''; }
    if (!isset($url['query'])) { $url['query'] = ''; }
    // connect to the server
    $fp = fsockopen($url['host'], $url['port'] ? $url['port'] : 80);
    if (!$fp) return 'Cannot connect to server ' . $url['host'];
    // send the request (HTTP header lines must end with CRLF)
    fputs($fp, sprintf("$method %s%s%s HTTP/1.0\r\n", $url['path'], $url['query'] ? "?" : "", $url['query']));
    fputs($fp, "Host: $url[host]\r\n");
    fputs($fp, "Content-type: application/x-www-form-urlencoded\r\n");
    fputs($fp, "Connection: close\r\n\r\n");
    // read the status line; bail out unless it is a 200 response
    $line = fgets($fp, 1024);
    if (!preg_match('#^HTTP/1\.. 200#i', $line)) return;
    // read the rest of the response
    $results = "";
    while (!feof($fp)) {
        $line = fgets($fp, 1024);
        $results .= $line;
    }
    fclose($fp);
    return $results;
}
With the first method I get garbled text, and no amount of encoding conversion fixes it;
with the other two I get a blank result (the third at least shows part of the response headers).
If I copy the page's source into a local file and scrape that file instead, everything works fine, so I assume the page has some kind of "anti-scraping" protection.
Begging the experts for help, thanks.
12 solutions
#1
Still not solved; still looking for help.
#2
For scraping, just use cURL.
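A sketch of that suggestion (my own code, not part of the original reply): with CURLOPT_ENCODING set to an empty string, libcurl advertises the content encodings it supports and decompresses a gzip response body automatically, so no manual decoding step is needed.
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body as a string instead of printing it
curl_setopt($ch, CURLOPT_ENCODING, '');         // '' = accept all supported encodings and auto-decompress
curl_setopt($ch, CURLOPT_TIMEOUT, 30);          // don't hang forever on a slow server
$html = curl_exec($ch);
if ($html === false) {
    die('cURL error: ' . curl_error($ch));
}
curl_close($ch);
echo $html;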
#3
The site really does have "anti-scraping" in place; it can't be fetched!
#4
It's just gzip-compressed, nothing more.
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$s = file_get_contents($url);
echo gzdecode($s);
function gzdecode($data) {
$len = strlen($data);
if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
return null; // Not GZIP format (See RFC 1952)
}
$method = ord(substr($data,2,1)); // Compression method
$flags = ord(substr($data,3,1)); // Flags
if (($flags & 31) != $flags) {
// Reserved bits are set -- NOT ALLOWED by RFC 1952
return null;
}
// NOTE: $mtime may be negative (PHP integer limitations)
$mtime = unpack("V", substr($data,4,4));
$mtime = $mtime[1];
$xfl = substr($data,8,1); // Extra flags (byte 8)
$os = substr($data,9,1); // Operating system (byte 9)
$headerlen = 10;
$extralen = 0;
$extra = "";
if ($flags & 4) {
// 2-byte length prefixed EXTRA data in header
if ($len - $headerlen - 2 < 8) {
return false; // Invalid format
}
$extralen = unpack("v",substr($data,8,2));
$extralen = $extralen[1];
if ($len - $headerlen - 2 - $extralen < 8) {
return false; // Invalid format
}
$extra = substr($data,10,$extralen);
$headerlen += 2 + $extralen;
}
$filenamelen = 0;
$filename = "";
if ($flags & 8) {
// C-style string file NAME data in header
if ($len - $headerlen - 1 < 8) {
return false; // Invalid format
}
$filenamelen = strpos(substr($data,8+$extralen),chr(0));
if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
return false; // Invalid format
}
$filename = substr($data,$headerlen,$filenamelen);
$headerlen += $filenamelen + 1;
}
$commentlen = 0;
$comment = "";
if ($flags & 16) {
// C-style string COMMENT data in header
if ($len - $headerlen - 1 < 8) {
return false; // Invalid format
}
$commentlen = strpos(substr($data,8+$extralen+$filenamelen),chr(0));
if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
return false; // Invalid header format
}
$comment = substr($data,$headerlen,$commentlen);
$headerlen += $commentlen + 1;
}
$headercrc = "";
if ($flags & 1) {
// 2-bytes (lowest order) of CRC32 on header present
if ($len - $headerlen - 2 < 8) {
return false; // Invalid format
}
$calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
$headercrc = unpack("v", substr($data,$headerlen,2));
$headercrc = $headercrc[1];
if ($headercrc != $calccrc) {
return false; // Bad header CRC
}
$headerlen += 2;
}
// GZIP FOOTER - these values may be negative due to PHP's integer limitations
$datacrc = unpack("V",substr($data,-8,4));
$datacrc = $datacrc[1];
$isize = unpack("V",substr($data,-4));
$isize = $isize[1];
// Perform the decompression:
$bodylen = $len-$headerlen-8;
if ($bodylen < 1) {
// This should never happen - IMPLEMENTATION BUG!
return null;
}
$body = substr($data,$headerlen,$bodylen);
$data = "";
if ($bodylen > 0) {
switch ($method) {
case 8:
// Currently the only supported compression method:
$data = gzinflate($body);
break;
default:
// Unknown compression method
return false;
}
} else {
// I'm not sure if zero-byte body content is allowed.
// Allow it for now... Do nothing...
}
// Verify decompressed size and CRC32:
// NOTE: This may fail with large data sizes depending on how
// PHP's integer limitations affect strlen() since $isize
// may be negative for large sizes.
if ($isize != strlen($data) || crc32($data) != $datacrc) {
// Bad format! Length or CRC doesn't match!
return false;
}
return $data;
}
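A side note, not from the original reply: PHP 5.4 and later ship a built-in gzdecode() in the zlib extension, so on a newer interpreter the user-land function above is unnecessary (and would actually collide with it, since they share a name). Assuming such a PHP build, the whole fix reduces to:
// Minimal sketch, assuming PHP >= 5.4 with the zlib extension (built-in gzdecode)
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$raw = file_get_contents($url);
echo gzdecode($raw); // unwraps the RFC 1952 gzip container, same as the function above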
#5
Confirmed: the response is gzip-compressed; after decompressing it reads fine!
private static string DownloadWebPage(string url)
{
    WebRequest request = WebRequest.Create(url);
    request.Credentials = CredentialCache.DefaultCredentials;
    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
    //Console.WriteLine(response.StatusDescription);
    Stream dataStream = response.GetResponseStream();
    // the body is gzip-compressed, so unwrap it before reading
    GZipStream gzip = new GZipStream(dataStream, CompressionMode.Decompress);
    // the page itself is encoded as GB2312
    StreamReader reader = new StreamReader(gzip, Encoding.GetEncoding("GB2312"));
    string responseFromServer = reader.ReadToEnd();
    reader.Close();
    dataStream.Close();
    response.Close();
    return responseFromServer;
}
static void Main(string[] args)
{
    DownloadWebPage("http://price.pcauto.com.cn/dictionary/category/");
}
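A side note, not from the original reply: on .NET 2.0 and later you can usually get the same result by casting the request to HttpWebRequest and setting its AutomaticDecompression property to DecompressionMethods.GZip, in which case the framework unwraps the response and the explicit GZipStream is no longer needed.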
#6
Awesome.
#7
That's seriously impressive.
#8
It's just gzip-compressed, nothing more.
#9
The moderator is really good. A question, please: how could you tell the content had been gzip-compressed? I'd like to keep this for future reference.
#10
Respect.
#11
Haha, thanks, I've learned another trick.
My problem is solved now, though in the end I did the scraping with .NET.
#12
If the site were really blocking scrapers, file_get_contents would most likely return a blank page; what I got instead was garbled bytes, and presumably that's exactly how one could tell it was gzip compression.
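For reference, a small detection sketch of my own (not from the thread), assuming allow_url_fopen is enabled: you can confirm the gzip suspicion either from the Content-Encoding response header or from the gzip magic bytes "\x1f\x8b" that the gzdecode() code in #4 also checks for (RFC 1952).
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$body = file_get_contents($url);
// 1) the http wrapper fills $http_response_header with the raw response headers
foreach ($http_response_header as $header) {
    if (stripos($header, 'Content-Encoding: gzip') === 0) {
        echo "Content-Encoding header says gzip\n";
    }
}
// 2) a gzip stream always starts with the two magic bytes \x1f\x8b
if (substr($body, 0, 2) === "\x1f\x8b") {
    echo "Body starts with the gzip magic number\n";
}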