1、推荐的方法:用 PHP 判断访问者是搜索引擎蜘蛛(爬虫)还是真人的代码,摘自 Discuz! X3.2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
<?php
/**
 * Decide whether a User-Agent belongs to a search-engine crawler rather than a browser.
 *
 * Keyword heuristic on the lower-cased User-Agent:
 *   1. If the string carries no info URL and contains a browser keyword, it is
 *      treated as a browser and the result is false.
 *   2. Otherwise the result is true when a spider keyword is present.
 *
 * @param string $useragent User-Agent to classify. When empty (per empty()),
 *                          the current request's HTTP_USER_AGENT header is used.
 * @return bool true when a spider keyword matched; false otherwise, including
 *              when no User-Agent is available at all.
 */
function checkrobot( $useragent = '' ){
    static $kw_spiders = array('bot', 'crawl', 'spider', 'slurp', 'sohu-search', 'lycos', 'robozilla');
    static $kw_browsers = array('msie', 'netscape', 'opera', 'konqueror', 'mozilla');

    if (empty($useragent)) {
        // The header is optional (CLI runs, scripted clients), so do not read
        // an undefined index; an absent header is classified as "not a robot".
        $useragent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    }
    $useragent = strtolower($useragent);

    // Crawlers usually embed an info URL while still announcing "mozilla".
    // Both URL schemes are checked so that a crawler advertising an https://
    // URL is not short-circuited into the browser branch.
    $hasUrl = strpos($useragent, 'http://') !== false
        || strpos($useragent, 'https://') !== false;

    if (!$hasUrl && dstrpos($useragent, $kw_browsers)) {
        return false;
    }

    return (bool) dstrpos($useragent, $kw_spiders);
}
/**
 * Check whether a string contains any of several needles.
 *
 * @param string       $string      Haystack to search.
 * @param array|string $arr         Needle or list of needles (cast to array).
 * @param bool         $returnvalue When truthy, return the first needle found
 *                                  instead of true.
 * @return bool|string false when $string is empty (per empty()) or nothing
 *                     matches; otherwise true, or the matching needle when
 *                     $returnvalue is truthy.
 */
function dstrpos( $string , $arr , $returnvalue = false) {
    if (!empty($string)) {
        $needles = (array) $arr;
        foreach ($needles as $needle) {
            if (strpos($string, $needle) === false) {
                continue;
            }
            // First hit wins; later needles are never examined.
            if ($returnvalue) {
                return $needle;
            }
            return true;
        }
    }
    return false;
}
// Demo: classify the current request and print the verdict.
echo checkrobot() ? '机器人爬虫' : '人';
?>
|
实际应用中可以这样判断:只有当访问者不是搜索引擎蜘蛛时才执行操作
1
2
3
4
5
|
<?php
// Usage pattern: run the guarded work only when checkrobot() does not
// classify the current request as a crawler.
if (!checkrobot()){
//do something
}
?>
|
2、第二种方法:
使用PHP实现蜘蛛访问日志统计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
// Spider access log: identify the crawler from the User-Agent and append one
// entry per crawler request to bot.txt.

// The header is optional, so fall back to an empty string instead of reading
// an undefined index.
$useragent = addslashes(strtolower(isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : ''));

// Ordered keyword => label table. The first keyword found wins, so a specific
// keyword must come before any shorter keyword it contains ('msnbot' before
// 'msn', named bots before the generic 'bot'). Keywords are lower-case
// because the User-Agent is lower-cased above.
$botSignatures = array(
    'googlebot'            => 'Google',
    'mediapartners-google' => 'Google Adsense',
    'baiduspider'          => 'Baidu',
    'sogou spider'         => 'Sogou',
    'sogou web'            => 'Sogou web',
    'sosospider'           => 'SOSO',
    '360spider'            => '360Spider',
    'yahoo'                => 'Yahoo',
    'msnbot'               => 'msnbot',
    'msn'                  => 'MSN',
    'sohu'                 => 'Sohu',
    'yodaobot'             => 'Yodao',
    'twiceler'             => 'Twiceler',
    'ia_archiver'          => 'Alexa_',
    'iaarchiver'           => 'Alexa',
    'slurp'                => '雅虎',
    'bot'                  => '其它蜘蛛',
);

foreach ($botSignatures as $keyword => $label) {
    if (strpos($useragent, $keyword) !== false) {
        $bot = $label;
        break;
    }
}

if (isset($bot)) {
    // Logging is best-effort: if the file cannot be opened the request
    // proceeds without a log entry instead of failing inside fwrite()/fclose().
    $fp = @fopen('bot.txt', 'a');
    if ($fp !== false) {
        // Each entry ends with a line break so that entries do not run
        // together on a single line.
        fwrite($fp, date('Y-m-d H:i:s') . " " . $_SERVER["REMOTE_ADDR"] . " " . $bot . " " . 'http://' . $_SERVER['SERVER_NAME'] . $_SERVER["REQUEST_URI"] . PHP_EOL);
        fclose($fp);
    }
}
|
3、第三种方法:
我们可以通过 HTTP_USER_AGENT 来判断访问者是否是蜘蛛。搜索引擎的蜘蛛都有自己独特的标识,下面列举了一部分。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
/**
 * Tell whether the current request's User-Agent names a known search-engine crawler.
 *
 * @return bool true when HTTP_USER_AGENT contains one of the crawler keywords
 *              (compared case-insensitively); false otherwise, including when
 *              the header is absent or empty.
 */
function is_crawler() {
    // The header is optional; without it there is nothing to match against.
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }
    $userAgent = $_SERVER['HTTP_USER_AGENT'];

    $spiders = array(
        'Googlebot',    // Google crawler
        'Baiduspider',  // Baidu crawler
        'Yahoo! Slurp', // Yahoo crawler
        'YodaoBot',     // Youdao crawler
        'msnbot',       // Bing crawler
        // add more crawler keywords here
    );

    foreach ($spiders as $spider) {
        // stripos() does the case-insensitive comparison directly, so neither
        // side has to be lower-cased first.
        if (stripos($userAgent, $spider) !== false) {
            return true;
        }
    }
    return false;
}
|
下面的php代码附带了更多的蜘蛛标识
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
/**
 * Tell whether the current request's User-Agent matches an extended list of
 * crawler / tool signatures.
 *
 * The function produces no output; it only inspects HTTP_USER_AGENT.
 *
 * @return bool true when the User-Agent contains one of the signatures
 *              (compared case-insensitively); false when nothing matches or
 *              the header is absent or empty.
 */
function isCrawler() {
    // The header is optional; without it there is nothing to match against.
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }
    $agent = $_SERVER['HTTP_USER_AGENT'];

    // NOTE(review): several entries look like descriptive labels rather than
    // User-Agent tokens (e.g. "Java (Often spam bot)", "Perl tool"), and short
    // ones such as "Ask" or "legs" can match unrelated agents. The list is
    // kept exactly as it was; confirm before pruning.
    $spiderSite = array(
        "TencentTraveler",
        "Baiduspider+",
        "BaiduGame",
        "Googlebot",
        "msnbot",
        "Sosospider+",
        "Sogou web spider",
        "ia_archiver",
        "Yahoo! Slurp",
        "YoudaoBot",
        "Yahoo Slurp",
        "MSNBot",
        "Java (Often spam bot)",
        "BaiDuSpider",
        "Voila",
        "Yandex bot",
        "BSpider",
        "twiceler",
        "Sogou Spider",
        "Speedy Spider",
        "Google AdSense",
        "Heritrix",
        "Python-urllib",
        "Alexa (IA Archiver)",
        "Ask",
        "Exabot",
        "Custo",
        "OutfoxBot/YodaoBot",
        "yacy",
        "SurveyBot",
        "legs",
        "lwp-trivial",
        "Nutch",
        "StackRambler",
        "The web archive (IA Archiver)",
        "Perl tool",
        "MJ12bot",
        "Netcraft",
        "MSIECrawler",
        "WGet tools",
        "larbin",
        "Fish search",
    );

    foreach ($spiderSite as $val) {
        if (stripos($agent, $val) !== false) {
            return true;
        }
    }

    // Explicit negative result, so callers always receive a boolean.
    return false;
}
// Demo: greet the visitor according to isCrawler()'s verdict.
echo isCrawler() ? "你好蜘蛛精!" : "你不是蜘蛛精啊!";
|
4、第四种方法:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
<?php
// Gate: let the script continue only for recognised search-engine crawlers.
$flag = false;

// The header is optional, so fall back to an empty string instead of reading
// an undefined index.
$tmp = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

// Case-sensitive User-Agent substrings that identify a crawler.
$crawlerMarkers = array(
    'Googlebot',
    'Baiduspider',
    'Yahoo! Slurp',
    'msnbot',
    'Sosospider',
    'YodaoBot',
    'OutfoxBot',
    'Sogou web spider',
    'Sogou Orion spider',
    'fast-webcrawler',
    'Gaisbot',
    'ia_archiver',
    'altavista',
    'lycos_spider',
    'Inktomi slurp',
);

foreach ($crawlerMarkers as $marker) {
    // Compare against false rather than "> 0": a marker found at offset 0
    // (a User-Agent that begins with "Baiduspider") is still a match.
    if (strpos($tmp, $marker) !== false) {
        $flag = true;
        break;
    }
}

if (!$flag) {
    // Automatically redirect to the corresponding page on http://www.zzvips.com
    // $_SERVER['REQUEST_URI'] is the path after the domain name
    // Alternatively: header("Location: http://www.zzvips.com/tags.html");
    // NOTE(review): no redirect is issued here; non-crawler requests simply stop.
    exit();
}
?>
|