PHP屏蔽蜘蛛访问代码:
常用搜索引擎名与 HTTP_USER_AGENT 对应值:
百度:baiduspider
谷歌:googlebot
搜狗:sogou
腾讯SOSO:sosospider
雅虎:slurp
有道:youdaobot
Bing:bingbot
MSN:msnbot
Alexa:ia_archiver
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
/**
 * Tell whether the current request was made by a known search-engine crawler.
 *
 * Reads $_SERVER['HTTP_USER_AGENT'] and performs a case-insensitive substring
 * match against a fixed list of crawler tokens.
 *
 * @return bool true when the User-Agent contains one of the tokens; false
 *              otherwise, including when no User-Agent header was sent.
 */
function is_crawler() {
    // The header is optional: a missing or empty value means "not a crawler"
    // and must not trigger an undefined-index notice.
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }
    $userAgent = (string) $_SERVER['HTTP_USER_AGENT'];

    $spiders = array(
        'Googlebot',    // Google crawler
        'Baiduspider',  // Baidu crawler
        'Yahoo! Slurp', // Yahoo crawler
        'YodaoBot',     // Youdao crawler
        'bingbot',      // Bing crawler
        'msnbot',       // MSN crawler (older Bing/MSN token)
        // add more crawler keywords here
    );

    foreach ($spiders as $spider) {
        // stripos() is case-insensitive, so neither side needs lowercasing.
        if (stripos($userAgent, $spider) !== false) {
            return true;
        }
    }

    return false;
}
|
下面的php代码附带了更多的蜘蛛标识
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
/**
 * Tell whether the current request's User-Agent matches a known crawler token.
 *
 * Reads $_SERVER['HTTP_USER_AGENT'] and performs a case-insensitive substring
 * match against the token list below. The function produces no output.
 *
 * @return bool true on the first matching token; false when nothing matches
 *              or when the User-Agent header is missing or empty.
 */
function isCrawler() {
    // A missing or empty header cannot match anything.
    if (empty($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }
    $agent = (string) $_SERVER['HTTP_USER_AGENT'];

    // NOTE(review): several entries (e.g. "Java (Often spam bot)",
    // "Perl tool", "WGet tools") read like descriptive labels rather than
    // literal User-Agent tokens; they are kept as-is — verify before relying
    // on them.
    $spiderSite = array(
        "TencentTraveler",
        "Baiduspider+",
        "BaiduGame",
        "Googlebot",
        "msnbot",
        "Sosospider+",
        "Sogou web spider",
        "ia_archiver",
        "Yahoo! Slurp",
        "YoudaoBot",
        "Yahoo Slurp",
        "MSNBot",
        "Java (Often spam bot)",
        "BaiDuSpider",
        "Voila",
        "Yandex bot",
        "BSpider",
        "twiceler",
        "Sogou Spider",
        "Speedy Spider",
        "Google AdSense",
        "Heritrix",
        "Python-urllib",
        "Alexa (IA Archiver)",
        "Ask",
        "Exabot",
        "Custo",
        "OutfoxBot/YodaoBot",
        "yacy",
        "SurveyBot",
        "legs",
        "lwp-trivial",
        "Nutch",
        "StackRambler",
        "The web archive (IA Archiver)",
        "Perl tool",
        "MJ12bot",
        "Netcraft",
        "MSIECrawler",
        "WGet tools",
        "larbin",
        "Fish search",
    );

    foreach ($spiderSite as $val) {
        // stripos() is case-insensitive, so neither side needs lowercasing.
        if (stripos($agent, $val) !== false) {
            return true;
        }
    }

    // No token matched: return an explicit boolean instead of falling off
    // the end of the function.
    return false;
}
// Demo: print one of two messages depending on whether isCrawler() reports
// the current visitor as a crawler.
echo isCrawler() ? "你好蜘蛛精!" : "你不是蜘蛛精啊!";
|
使用PHP实现蜘蛛访问日志统计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
// Crawler visit logger: classifies the request's User-Agent and, when it
// looks like a crawler, appends one tab-separated line to bot.txt:
//   timestamp <TAB> client IP <TAB> crawler label <TAB> requested URL

// The header is optional, so fall back to an empty string when it is absent.
// addslashes() is kept from the original; none of the tokens below contain
// quotes or backslashes, so it does not affect matching.
$useragent = addslashes(strtolower(isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : ''));

// Ordered token => label table; the first token found in the User-Agent wins.
// All tokens are lowercase because $useragent has been lowercased.
// Order matters: specific tokens come before the broader ones that contain
// them ('msnbot' before 'msn'), and the generic 'bot' catch-all is last.
$botSignatures = array(
    'googlebot'            => 'Google',
    'mediapartners-google' => 'Google Adsense',
    'baiduspider'          => 'Baidu',
    'sogou spider'         => 'Sogou',
    'sogou web'            => 'Sogou web',
    'sosospider'           => 'SOSO',
    '360spider'            => '360Spider',
    'yahoo'                => 'Yahoo',
    'msnbot'               => 'msnbot',
    'msn'                  => 'MSN',
    'sohu'                 => 'Sohu',
    'yodaobot'             => 'Yodao',
    'twiceler'             => 'Twiceler',
    'ia_archiver'          => 'Alexa_',
    'iaarchiver'           => 'Alexa',
    'slurp'                => '雅虎',
    'bot'                  => '其它蜘蛛',
);

// Reset explicitly so a $bot left over from earlier code cannot cause a
// bogus log entry.
$bot = null;
foreach ($botSignatures as $needle => $label) {
    if (strpos($useragent, $needle) !== false) {
        $bot = $label;
        break;
    }
}

if (isset($bot)) {
    $line = date('Y-m-d H:i:s') . "\t" . $_SERVER["REMOTE_ADDR"] . "\t" . $bot . "\t" . 'http://' . $_SERVER['SERVER_NAME'] . $_SERVER["REQUEST_URI"] . "\r\n";

    // Logging is best-effort: if the file cannot be opened the entry is
    // skipped rather than passing an invalid handle to fwrite()/fclose().
    $fp = @fopen('bot.txt', 'a');
    if ($fp !== false) {
        fwrite($fp, $line);
        fclose($fp);
    }
}
|