Generally speaking, there are two ways to use threads: one is to write the function the thread should execute and pass it to a Thread object, which runs it; the other is to subclass Thread and put the thread's code in the new class's run() method.
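As a minimal sketch of both patterns (written in Python 2 to match the listing below; the work function and Worker class are just placeholders):

import threading

# Pattern 1: pass a target function to a Thread object
def work(n):
    print 'working on', n

t1 = threading.Thread(target=work, args=(1,))
t1.start()
t1.join()

# Pattern 2: subclass Thread and put the code in run()
class Worker(threading.Thread):
    def __init__(self, n):
        threading.Thread.__init__(self)
        self.n = n
    def run(self):
        print 'working on', self.n

t2 = Worker(2)
t2.start()
t2.join()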
This post implements a multithreaded web crawler: it uses multiple threads plus a lock to coordinate shared state, and crawls pages breadth-first.
Let me briefly outline the approach first.
For a crawler that downloads pages by breadth-first traversal, the procedure (sketched in code right after this list) is:
1. Download the first page from the given entry URL.
2. Extract all new page URLs from that first page and put them into the download list.
3. Download every page whose URL is on the download list.
4. From the newly downloaded pages, find the URLs that have not been downloaded yet and use them to update the download list.
5. Repeat steps 3 and 4 until the updated download list is empty.
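Before adding threads, the loop itself fits in a few lines of single-threaded Python 2 (fetch and extract_urls are hypothetical stand-ins for the downloading and link-parsing done by the real code below):

def bfs_crawl(entry_url):
    queue = [entry_url]  # step 1: the download list starts with the entry URL
    seen = set()         # every URL downloaded so far
    while queue:         # step 5: stop once the updated list is empty
        pages = []
        for url in queue:             # step 3: download the whole list
            seen.add(url)
            pages.append(fetch(url))  # placeholder for the download code
        new_urls = set()
        for html in pages:            # steps 2/4: pull out the links
            new_urls.update(extract_urls(html))  # placeholder for the regex parsing
        queue = list(new_urls - seen)  # step 4: keep only unseen URLs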
The Python code (Python 2) is as follows:
#!/usr/bin/env python
#coding=utf-8
import threading
import urllib
import re

g_mutex = threading.Condition()
g_pages = []      # downloaded page contents, waiting to have their links parsed out
g_queueURL = []   # URLs waiting to be crawled
g_existURL = []   # URLs that have already been crawled
g_failedURL = []  # URLs that failed to download
g_totalcount = 0  # number of pages downloaded so far

class Crawler:
    def __init__(self, crawlername, url, threadnum):
        self.crawlername = crawlername
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = open("log.txt", 'w')

    def craw(self):
        global g_queueURL
        g_queueURL.append(self.url)  # fix: seed the queue with self.url, not a global
        depth = 0
        print self.crawlername + " started..."
        while len(g_queueURL) != 0:
            depth += 1
            print 'Searching depth', depth, '...\n\n'
            self.logfile.write("URL:" + g_queueURL[0] + "........")
            self.downloadAll()
            self.updateQueueURL()
            content = '\n>>>Depth ' + str(depth) + ':\n'
            self.logfile.write(content)
            i = 0
            while i < len(g_queueURL):
                content = str(g_totalcount + i) + '->' + g_queueURL[i] + '\n'
                self.logfile.write(content)
                i += 1

    def downloadAll(self):
        # download the whole queue, at most threadnum pages at a time
        global g_queueURL
        global g_totalcount
        i = 0
        while i < len(g_queueURL):
            j = 0
            while j < self.threadnum and i + j < len(g_queueURL):
                g_totalcount += 1
                threadresult = self.download(g_queueURL[i + j], str(g_totalcount) + '.html', j)
                if threadresult != None:
                    print 'Thread started:', i + j, '--File number =', g_totalcount
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []  # fix: the original assigned to a local name, never emptying the pool
        g_queueURL = []

    def download(self, url, filename, tid):
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread  # fix: return the thread so downloadAll's check works

    def updateQueueURL(self):
        # rebuild the queue from freshly parsed links, minus URLs already crawled
        global g_queueURL
        global g_existURL
        global g_pages
        newUrlList = []
        for content in g_pages:
            newUrlList += self.getUrl(content)
        g_pages = []  # fix: clear parsed pages so they are not parsed again at the next depth
        g_queueURL = list(set(newUrlList) - set(g_existURL))

    def getUrl(self, content):
        reg = r'"(http://.+?)"'  # fix: the pattern was missing; match quoted http links
        regob = re.compile(reg, re.DOTALL)
        urllist = regob.findall(content)
        return urllist

class CrawlerThread(threading.Thread):
    def __init__(self, url, filename, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid

    def run(self):
        global g_mutex
        global g_failedURL
        global g_queueURL
        try:
            page = urllib.urlopen(self.url)
            html = page.read()
            fout = open(self.filename, 'w')
            fout.write(html)
            fout.close()
        except Exception, e:
            # record the failure under the lock, then bail out
            g_mutex.acquire()
            g_existURL.append(self.url)
            g_failedURL.append(self.url)
            g_mutex.release()
            print 'Failed downloading and saving', self.url
            print e
            return None
        # publish the page content under the lock
        g_mutex.acquire()
        g_pages.append(html)
        g_existURL.append(self.url)
        g_mutex.release()

if __name__ == "__main__":
    url = raw_input("Enter the entry URL:\n")
    threadnum = int(raw_input("Set the number of threads: "))
    crawlername = "little crawler"
    crawler = Crawler(crawlername, url, threadnum)
    crawler.craw()
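Note that the listing targets Python 2 (print statements, raw_input, urllib.urlopen). Porting it to Python 3 is mostly mechanical; as a sketch, the download step alone would look like this (fetch_and_save is a made-up helper name, not part of the code above):

from urllib.request import urlopen  # Python 3: urllib.urlopen moved here

def fetch_and_save(url, filename):
    html = urlopen(url).read()          # read() returns bytes in Python 3
    with open(filename, 'wb') as fout:  # so the file must be opened in binary mode
        fout.write(html)
    return html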
That is the Python implementation of a multithreaded web crawler I wanted to share; I hope you find it useful.