This article implements crawler logging in PHP: first create the crawler table, then have robot.php identify each visiting crawler and insert a record into the table, after which every crawler visit can be read straight back out of the database. The code is as follows:
Database design
create table crawler
(
    crawler_ID bigint(20) unsigned not null auto_increment primary key,
    crawler_category varchar(50) not null,
    crawler_date datetime not null,
    crawler_url varchar(255) not null,
    crawler_IP varchar(45) not null
) default charset=utf8;
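The listing page shown later sorts by crawler_date, so once the table grows it may be worth adding an index on that column. This is an optional addition of mine, not part of the original schema:

-- Optional: speeds up "order by crawler_date desc" on a large table
alter table crawler add index idx_crawler_date (crawler_date);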
The following file, robot.php, identifies visiting crawlers and writes their details to the database:
<?php
// Rebuild the full URL of the page being visited
$ServerName = $_SERVER["SERVER_NAME"];
$ServerPort = $_SERVER["SERVER_PORT"];
$ScriptName = $_SERVER["SCRIPT_NAME"];
$QueryString = $_SERVER["QUERY_STRING"];
$serverip = $_SERVER["REMOTE_ADDR"];
$Url = "http://" . $ServerName;
if ($ServerPort != "80")
{
    $Url = $Url . ":" . $ServerPort;
}
$Url = $Url . $ScriptName;
if ($QueryString != "")
{
    $Url = $Url . "?" . $QueryString;
}
$GetLocationURL = $Url;
// Identify the crawler from its User-Agent string
$agent = isset($_SERVER["HTTP_USER_AGENT"]) ? strtolower($_SERVER["HTTP_USER_AGENT"]) : "";
$Bot = "";
// Generic fallback first; the more specific matches below overwrite it
if (strpos($agent, "bot") !== false)
{
    $Bot = "Other Crawler";
}
if (strpos($agent, "googlebot") !== false)
{
    $Bot = "Google";
}
if (strpos($agent, "mediapartners-google") !== false)
{
    $Bot = "Google Adsense";
}
if (strpos($agent, "baiduspider") !== false)
{
    $Bot = "Baidu";
}
if (strpos($agent, "sogou spider") !== false)
{
    $Bot = "Sogou";
}
if (strpos($agent, "yahoo") !== false)
{
    $Bot = "Yahoo!";
}
if (strpos($agent, "msn") !== false)
{
    $Bot = "MSN";
}
if (strpos($agent, "ia_archiver") !== false || strpos($agent, "iaarchiver") !== false)
{
    $Bot = "Alexa";
}
if (strpos($agent, "sohu") !== false)
{
    $Bot = "Sohu";
}
if (strpos($agent, "sqworm") !== false)
{
    $Bot = "AOL";
}
if (strpos($agent, "yodaobot") !== false)   // $agent is lowercased, so match lowercase
{
    $Bot = "Yodao";
}
if (strpos($agent, "iaskspider") !== false)
{
    $Bot = "Iask";
}
// Only record identified crawlers, not ordinary visitors
if ($Bot != "")
{
    require("./dbinfo.php");
    date_default_timezone_set('PRC');
    $shijian = date("Y-m-d H:i:s");   // H (24-hour), not h, so afternoon visits sort correctly
    // Connect to the MySQL server
    $connection = mysql_connect($host, $username, $password);
    if (!$connection)
    {
        die('Not connected : ' . mysql_error());
    }
    // Select the working database
    $db_selected = mysql_select_db($database, $connection);
    if (!$db_selected)
    {
        die('Can\'t use db : ' . mysql_error());
    }
    // Escape the request-derived value before interpolating it into SQL
    $GetLocationURL = mysql_real_escape_string($GetLocationURL, $connection);
    // Insert the visit record
    $query = "insert into crawler (crawler_category, crawler_date, crawler_url, crawler_IP) values ('$Bot','$shijian','$GetLocationURL','$serverip')";
    $result = mysql_query($query);
    if (!$result)
    {
        die('Invalid query: ' . mysql_error());
    }
}
?>
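One caveat: the mysql_* functions above were deprecated in PHP 5.5 and removed in PHP 7. On a newer PHP, the same insert can be done with mysqli and a prepared statement. Here is a minimal sketch, assuming the same $host/$username/$password/$database variables from dbinfo.php and the $Bot/$shijian/$GetLocationURL/$serverip values built above:

<?php
require("./dbinfo.php");
$mysqli = new mysqli($host, $username, $password, $database);
if ($mysqli->connect_errno) {
    die('Not connected : ' . $mysqli->connect_error);
}
$stmt = $mysqli->prepare(
    "insert into crawler (crawler_category, crawler_date, crawler_url, crawler_IP) values (?, ?, ?, ?)"
);
// Bound parameters make manual escaping unnecessary
$stmt->bind_param("ssss", $Bot, $shijian, $GetLocationURL, $serverip);
$stmt->execute();
$stmt->close();
$mysqli->close();
?>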
It works: query the database and you can see when each spider visited, from which IP, and which of your pages it crawled.
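For example, two queries I find handy against the table defined above:

-- Visits per crawler, most active first
select crawler_category, count(*) as visits
from crawler
group by crawler_category
order by visits desc;

-- The 20 most recent visits
select crawler_date, crawler_category, crawler_IP, crawler_url
from crawler
order by crawler_date desc
limit 20;

The listing page below wraps the same kind of query in a paginated HTML table (page.Class.php is the project's pagination class and conn_new.php supplies the $mysql wrapper):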
<?php
include './robot.php';                  // log this visit too, if it is a crawler
include '../library/page.Class.php';    // pagination class from this project
include '../library/conn_new.php';      // DB wrapper that provides $mysql
$count = $mysql->num_rows($mysql->query("select * from crawler"));
// The page size was lost from the original listing; 10 rows per page is assumed here
$pages = new PageClass($count, 10, $_GET['page'], $_SERVER['PHP_SELF'] . '?page={page}');
$sql = "select * from crawler order by ";
$sql .= "crawler_date desc limit " . $pages->page_limit . "," . $pages->myde_size;
$result = $mysql->query($sql);
?>
<table width="100%">
<thead>
<tr>
<td bgcolor="#CCFFFF"></td>
<td bgcolor="#CCFFFF" align="center">Visit time</td>
<td bgcolor="#CCFFFF" align="center">Crawler</td>
<td bgcolor="#CCFFFF" align="center">Crawler IP</td>
<td bgcolor="#CCFFFF" align="center">URL visited</td>
</tr>
</thead>
<?php
while ($myrow = $mysql->fetch_array($result)) {
?>
<tr>
<td><img src="../images/topicnew.gif" /></td>
<td style="font-family:Georgia"><?php echo $myrow["crawler_date"] ?></td>
<td><?php echo $myrow["crawler_category"] ?></td>
<td><?php echo $myrow["crawler_IP"] ?></td>
<td><?php echo htmlspecialchars($myrow["crawler_url"]) ?></td>
</tr>
<?php
}
?>
</table>
<?php
echo $pages->myde_write();
?>
That covers the full, and very handy, PHP implementation of crawler logging. I hope it helps.