This article describes a Python implementation for batch downloading Sina blog posts, shared here for your reference. The script fetches a blog's article index page with urllib2, follows each post link, uses BeautifulSoup to keep only the visible text of the article body, and saves every post as a text file in a directory named after the blog. The implementation is as follows:
# coding=utf-8
import urllib2
import sys, os
import re
import string
from BeautifulSoup import BeautifulSoup

def encode(s):
    return s.decode('utf-8').encode(sys.stdout.encoding, 'ignore')

def getHTML(url):
    #proxy_handler = urllib2.ProxyHandler({'http':'http://211.138.124.211:80'})
    #opener = urllib2.build_opener(proxy_handler)
    #urllib2.install_opener(opener)
    req = urllib2.Request(url)
    response = urllib2.urlopen(req, timeout=15)
    return BeautifulSoup(response, convertEntities=BeautifulSoup.HTML_ENTITIES)

def visible(element):
    '''Keep only visible text elements'''
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    elif element == u'\xa0':
        return False
    return True

def delReturn(element):
    '''Remove line breaks inside an element'''
    return re.sub('(?<!^)\n+(?!$)', ' ', str(element)).decode('utf-8')

def validFilename(filename):
    # windows
    return re.sub('[\/:*?<>"|\xa0]', '', filename)

def writeToFile(text, filename, dirname):
    if not os.path.exists(dirname):
        os.makedirs(dirname)
        print encode('保存到目录'), dirname
    filename = validFilename(filename)
    print encode('保存文章'), filename
    path = os.path.join(dirname, filename)
    if not os.path.exists(path):
        f = open(path, 'w')
        f.write(text)
        f.close()
    else:
        print filename, encode('已经存在')

def formatContent(url, title=''):
    '''Format the article content'''
    page = getHTML(url)
    content = page.find('div', {'class': 'articalContent'})
    art_id = re.search('blog_(\w+)\.html', url).group(1)
    blog_name = page.find('span', id='blognamespan').string
    if title == '':
        title = page.find('h2', id=re.compile('^t_')).string
    temp_data = filter(visible, content.findAll(text=True))  # drop invisible elements
    temp_data = ''.join(map(delReturn, temp_data))  # remove line breaks inside each element
    temp_data = temp_data.strip()  # strip blank lines at the start and end of the article
    temp_data = re.sub('\n{2,}', '\n\n', temp_data)  # collapse excessive blank lines
    # write to file
    # handle encoding
    temp_data = '本文地址:'.decode('utf-8') + url + '\n\n' + temp_data
    op_text = temp_data.encode('utf-8')
    op_file = title + '_' + art_id + '.txt'
    writeToFile(op_text, op_file, blog_name)

def articlelist(url):
    articles = {}
    page = getHTML(url)
    pages = page.find('ul', {'class': 'SG_pages'}).span.string
    page_num = int(re.search('(\d+)', pages).group(1))
    for i in range(1, page_num + 1):
        print encode('生成第%d页文章索引' % i)
        if i != 1:
            url = re.sub('(_)\d+(\.html)$', '\g<1>' + str(i) + '\g<2>', url)
            page = getHTML(url)
        article = page.findAll('span', {'class': 'atc_title'})
        for art in article:
            art_title = art.a['title']
            art_href = art.a['href']
            articles[art_title] = art_href
    return articles

def blog_dld(articles):
    if not isinstance(articles, dict):
        return False
    print encode('开始下载文章')
    for art_title, art_href in articles.items():
        formatContent(art_href, art_title)

if __name__ == '__main__':
    sel = raw_input(encode('你要下载的是(1)全部文章还是(2)单篇文章,输入1或者2: '))
    if sel == '1':
        #articlelist_url = 'http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html'
        articlelist_url = raw_input(encode('请输入博客文章目录链接: '))
        articles = articlelist(articlelist_url)
        blog_dld(articles)
    else:
        #article_url = 'http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html'
        article_url = raw_input(encode('请输入博客文章链接: '))
        formatContent(article_url)
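If you would rather drive the script programmatically instead of answering the prompts, a minimal sketch (assuming the functions above are already defined or imported) could look like the following. The example URLs are simply the ones left commented out in the script and may no longer resolve:

# coding=utf-8
# Minimal usage sketch (Python 2): call the functions above directly instead of
# going through the raw_input prompts. The URLs below are the placeholders left
# in the script's comments and are only illustrative.

# Download a single article:
formatContent('http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html')

# Or build the article index for a whole blog and download everything:
articles = articlelist('http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html')
blog_dld(articles)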
Hopefully this article has been helpful for your Python programming.