学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 reportlab 制作成 PDF。
crawler.py
代码如下:
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: getmain.py
Last modified: 2015-02-19 16:47
E-mail: anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# Python 2 only: make UTF-8 the process-wide default codec for implicit
# str <-> unicode conversions. reload(sys) is needed first because
# sys.setdefaultencoding is no longer reachable after interpreter start-up.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
# response = urllib2.urlopen('http://www.52duzhe.com/2015_01/duzh20150104.html')
response = urllib2.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)#.decode("utf-8").encode("gbk"))
#for i in soup.find_all('div'):
# print i,1
(注:crawler.py 的后续代码(包括下文用到的 getCatalog)在此处缺失。)
writetopdf.py
代码如下:
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: writetopdf.py
Last modified: 2015-02-20 19:19
E-mail: anemone@82flex.com
"""
#coding=utf-8
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler
def _derive_style(base, **overrides):
    """Return a deep copy of *base* with every keyword set as an attribute."""
    style = copy.deepcopy(base)
    for attr, value in overrides.items():
        setattr(style, attr, value)
    return style


def writePDF(issue, duzhe):
    """Render one magazine issue into the file ``duzhe<issue>.pdf``.

    Layout of the generated document:
      1. an issue heading followed by a contents listing (a rule line and the
         column name for every column, then that column's article titles);
      2. a page break;
      3. every article in turn -- bold title, a small writer/source line, the
         body paragraphs -- each article followed by its own page break.

    Args:
        issue: Issue identifier; interpolated into the heading and into the
            output file name.
        duzhe: Mapping of column name -> iterable of article dicts. Each
            article dict is read for the keys "title", "writer", "from" and
            "context".
    """
    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    # NOTE(review): both font files are referenced by bare name, so they must
    # be resolvable through reportlab's TrueType search path -- confirm on the
    # target machine.
    pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
    # (bold, italic) -> registered face: non-bold text stays in 'song',
    # anything inside <b>...</b> is drawn with 'hei'.
    for bold, italic, face in ((0, 0, 'song'), (0, 1, 'song'),
                               (1, 0, 'hei'), (1, 1, 'hei')):
        fonts.addMapping('song', bold, italic, face)

    base = getSampleStyleSheet()['Normal']
    normalStyle = _derive_style(base, fontName='song', fontSize=11,
                                leading=11, firstLineIndent=20)
    titleStyle = _derive_style(base, fontName='song', fontSize=15, leading=20)
    firstTitleStyle = _derive_style(base, fontName='song', fontSize=20,
                                    leading=20, firstLineIndent=50)
    smallStyle = _derive_style(base, fontName='song', fontSize=8, leading=8)

    # NOTE(review): Paragraph interprets its text as inline markup; scraped
    # text containing a raw '<' or '&' may fail to parse -- confirm what the
    # crawler delivers before relying on this for arbitrary articles.
    story = [Paragraph("<b>读者{0}期</b>".format(issue), firstTitleStyle)]

    # Contents section.
    for eachColumn in duzhe:
        story.append(Paragraph('__' * 28, titleStyle))
        story.append(Paragraph('<b>{0}</b>'.format(eachColumn), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(eachArticle["title"], normalStyle))
    story.append(flowables.PageBreak())

    # Article bodies.
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(
                "<b>{0}</b>".format(eachArticle["title"]), titleStyle))
            story.append(Paragraph(
                " {0} {1}".format(eachArticle["writer"], eachArticle["from"]),
                smallStyle))
            for eachPara in eachArticle["context"].split(" "):
                story.append(Paragraph(eachPara, normalStyle))
            story.append(flowables.PageBreak())

    # format() rather than '+' so a non-string issue (e.g. an int) also works,
    # matching how the heading above is built.
    doc = SimpleDocTemplate("duzhe{0}.pdf".format(issue))
    # Call form prints the same text on Python 2 and Python 3.
    print("Writing PDF...")
    doc.build(story)
def main(issue):
    """Fetch the catalogue for *issue* via the crawler and write it out as a PDF."""
    writePDF(issue, crawler.getCatalog(issue))
# Script entry point: prompt for the issue number and build that issue's PDF.
if __name__ == '__main__':
    main(raw_input("Enter issue(201501):"))
以上就是本文的全部内容了,希望大家能够喜欢。