Crawling Han Han's Sina blog with multiple threads (and processes); this example does not require a simulated login:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import os
import re
import time
from threading import Thread
from multiprocessing import Process

def downloadURL(urls, dirpath):
    # urls is one chunk of the link list assembled earlier
    for url in urls:
        if len(url) > 0:
            content = urllib.urlopen(url).read()
            # create the output directory if needed, then write the html document
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)
            with open(dirpath + r'/' + url[-26:], 'wb') as f:
                f.write(content)

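# Note on downloadURL's file naming (an assumption about Sina's URL scheme,
# not stated in the original post): each post URL appears to end in a fixed
# 26-character slug like blog_xxxxxxxxxxxxxxxx.html, so url[-26:] yields a
# unique file name per article.
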
def parseTarget(url):
    root_url = url
    urls = []
    # fetch the article-list page; each match below is the link to one post
    content = urllib.urlopen(root_url).read()

    pattern = r'<a title="(.*?)" href="(.*?)">'

    # re.findall returns (title, href) tuples; keep only the hrefs
    hrefs = re.findall(pattern, content)

    for href in hrefs:
        urls.append(href[1])

    return urls

def thread_or_process_job(n, thread_or_process, url_lists, job):
    local_time = time.time()
    # args are the arguments handed to the job function; the output directory
    # name encodes n plus 'Thread' or 'Process'
    Thread_or_Process = [thread_or_process(target=job,
                                           args=(url_lists[i], str(n) + thread_or_process.__name__))
                         for i in xrange(n)]

    for t in Thread_or_Process:
        t.start()

    for t in Thread_or_Process:
        t.join()

    print n, thread_or_process.__name__, 'finished the job in', time.time() - local_time, 's'

if __name__ == '__main__':
    t = time.time()
    urls = []
    # collect post links from the first five pages of the article list
    for i in xrange(5):
        urls.extend(parseTarget('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(i + 1) + '.html'))
    url_len = len(urls)
    print 'urls_len:', url_len

    for n in [2, 4, 6, 8]:
        url_list = []
        url_split_len = url_len // n
        # split the full url list into n chunks to be processed concurrently;
        # the last chunk also takes the remainder
        for i in xrange(n):
            if i == n - 1:
                url_list.append(urls[i * url_split_len:url_len])
            else:
                url_list.append(urls[i * url_split_len:(i + 1) * url_split_len])

        thread_or_process_job(n, Thread, url_list, downloadURL)
        thread_or_process_job(n, Process, url_list, downloadURL)

    print 'All done in', time.time() - t, 's'
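
The listing above is Python 2 (urllib.urlopen, xrange, print statements). As a minimal sketch of the same idea on Python 3, under the assumption that the Sina article-list pages are still reachable, the download and parse steps could look like this, with concurrent.futures standing in for the hand-rolled Thread/Process lists:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Minimal Python 3 sketch, not the original author's code: urllib.request
# replaces urllib.urlopen, and a ThreadPoolExecutor replaces the manual
# Thread/Process bookkeeping.
import os
import re
import urllib.request
from concurrent.futures import ThreadPoolExecutor

def download(url, dirpath):
    content = urllib.request.urlopen(url).read()
    os.makedirs(dirpath, exist_ok=True)  # no separate exists() check needed
    with open(os.path.join(dirpath, url[-26:]), 'wb') as f:
        f.write(content)

def parse_target(url):
    # the charset is an assumption; 'ignore' keeps decoding errors non-fatal
    html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    # same regex as above: keep only the href from each (title, href) pair
    return [href for _, href in re.findall(r'<a title="(.*?)" href="(.*?)">', html)]

if __name__ == '__main__':
    urls = parse_target('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html')
    with ThreadPoolExecutor(max_workers=8) as pool:
        for url in urls:
            pool.submit(download, url, 'py3_download')

Since the downloads are network-bound, CPython releases the GIL while waiting on I/O, which is why the thread timings in the comparison above can keep up with the process timings.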