This article walks through a simple crawler implemented in PHP, shared here for your reference. The details are as follows:
<?php
/**
 * Crawler -- prototype
 *
 * Fetch the HTML content of the given URL.
 * Note: reading remote URLs with fopen() requires allow_url_fopen
 * to be enabled in php.ini.
 *
 * @param string $url
 * @return string|false
 */
function _getUrlContent($url) {
    $handle = @fopen($url, "r");
    if ($handle) {
        // Read at most 1 MB of the response body
        $content = stream_get_contents($handle, 1024 * 1024);
        fclose($handle);
        return $content;
    }
    return false;
}
/**
 * Extract the href targets of all <a> tags from the HTML content.
 *
 * @param string $web_content
 * @return array
 */
function _filterUrl($web_content) {
    // Capture the href value of every <a> tag (case-insensitive)
    $reg_tag_a = '/<a\s.*?href=[\'"]?([^>\'" ]*).*?>/i';
    $result = preg_match_all($reg_tag_a, $web_content, $match_result);
    if ($result) {
        return $match_result[1];
    }
    return array();
}
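// For illustration (an assumption, not part of the original script):
// _filterUrl('<a href="/about">About</a>') returns array("/about").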
/**
 * Resolve relative URLs against the base URL.
 *
 * @param string $base_url
 * @param array $url_list
 * @return array
 */
function _reviseUrl($base_url, $url_list) {
    $url_info = parse_url($base_url);
    $base_url = $url_info["scheme"] . '://';
    if (isset($url_info["user"], $url_info["pass"])) {
        $base_url .= $url_info["user"] . ":" . $url_info["pass"] . "@";
    }
    $base_url .= $url_info["host"];
    if (isset($url_info["port"])) {
        $base_url .= ":" . $url_info["port"];
    }
    if (isset($url_info["path"])) {
        $base_url .= $url_info["path"];
    }
    if (!is_array($url_list)) {
        return;
    }
    $result = array();
    foreach ($url_list as $url_item) {
        if (preg_match('/^http/', $url_item)) {
            // Already an absolute URL
            $result[] = $url_item;
        } else {
            // Relative URL: prepend the base URL
            $result[] = rtrim($base_url, '/') . '/' . ltrim($url_item, '/');
        }
    }
    return $result;
}
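// For illustration (an assumption, not part of the original script):
// _reviseUrl("http://example.com/", array("news", "http://php.net/"))
// returns array("http://example.com/news", "http://php.net/").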
/**
 * Crawl one page: fetch it and return the resolved list of links.
 *
 * @param string $url
 * @return array|null
 */
function crawler($url) {
    $content = _getUrlContent($url);
    if ($content) {
        $url_list = _reviseUrl($url, _filterUrl($content));
        if ($url_list) {
            return $url_list;
        }
    }
    return;
}
/**
 * Test driver: crawl outward from a seed URL, using url.txt both
 * as the output log and as the queue of URLs to visit next.
 */
function main() {
    $current_url = "http://hao123.com/";   // seed URL
    $fp_puts = fopen("url.txt", "ab");     // append newly found URLs
    $fp_gets = fopen("url.txt", "r");      // read back URLs to crawl next
    do {
        $result_url_arr = crawler($current_url);
        if ($result_url_arr) {
            foreach ($result_url_arr as $url) {
                fputs($fp_puts, $url . "\r\n");
            }
        }
        // Fetch the next URL from the file, trimming the trailing
        // newline; the loop ends when the file is exhausted
    } while ($current_url = trim(fgets($fp_gets, 1024)));
    fclose($fp_puts);
    fclose($fp_gets);
}
main();
?>
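If allow_url_fopen is disabled on your server, the page fetch can be done with the cURL extension instead. The following is a minimal sketch of an alternative to _getUrlContent, assuming ext/curl is installed; it is an illustration, not part of the original script:

<?php
// Sketch: fetch a URL with cURL instead of fopen (assumes ext/curl is available)
function _getUrlContentCurl($url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the body as a string
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // follow HTTP redirects
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);           // avoid hanging on slow hosts
    $content = curl_exec($ch);
    curl_close($ch);
    return $content === false ? false : $content;
}
?>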
Hopefully this article is helpful to you in your PHP programming.