本文实例讲述了Python实现抓取HTML网页并以PDF文件形式保存的方法。分享给大家供大家参考,具体如下:
一、前言
今天介绍将HTML网页抓取下来,然后以PDF保存,废话不多说直接进入教程。
今天的例子以廖雪峰老师的Python教程网站为例:http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000
二、准备工作
1. PyPDF2的安装使用(用来合并PDF):
PyPDF2版本:1.25.1
https://pypi.python.org/pypi/PyPDF2/1.25.1
或
https://github.com/mstamy2/PyPDF2
安装:
1
|
# Pin the release this article was written against: the PdfFileMerger class
# used below is no longer available in PyPDF2 3.x.
pip install PyPDF2==1.25.1
|
使用示例:
1
2
3
4
5
6
7
8
9
|
from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()
# The inputs are kept open until the merged document has been written; the
# ``with`` blocks then close every file handle (the original never closed them).
with open("hql_1_20.pdf", "rb") as input1, open("hql_21_40.pdf", "rb") as input2:
    merger.append(input1)
    merger.append(input2)
    # Write to an output PDF document
    with open("hql_all.pdf", "wb") as output:
        merger.write(output)
|
2. requests、beautifulsoup 是爬虫两大神器,requests 用于网络请求,beautifulsoup 用于操作 html 数据。有了这两把梭子,干起活来利索。scrapy 这样的爬虫框架我们就不用了,这样的小程序派上它有点杀鸡用牛刀的意思。此外,既然是把 html 文件转为 pdf,那么也要有相应的库支持,wkhtmltopdf 就是一个非常好的工具,它可以用于多平台的 html 到 pdf 的转换,pdfkit 是 wkhtmltopdf 的Python封装包。首先安装好下面的依赖包
1
2
3
|
# HTTP client used to download the pages
pip install requests
# HTML parser (imported in the code as bs4)
pip install beautifulsoup4
# Python wrapper around the wkhtmltopdf executable
pip install pdfkit
|
3. 安装 wkhtmltopdf
Windows平台直接在 http://wkhtmltopdf.org/downloads.html 下载稳定版的 wkhtmltopdf 进行安装,安装完成之后把该程序的执行路径加入到系统环境 $PATH 变量中,否则 pdfkit 找不到 wkhtmltopdf 就出现错误 “No wkhtmltopdf executable found”。Ubuntu 和 CentOS 可以直接用命令行进行安装
1
2
|
$ sudo apt-get install wkhtmltopdf  # ubuntu
$ sudo yum install wkhtmltopdf  # centos
|
三、数据准备
1. 获取每篇文章的url
1
2
3
4
5
6
7
8
9
10
11
12
13
|
def get_url_list():
    """Collect the absolute URL of every entry in the tutorial's chapter menu.

    The tutorial index page is downloaded and the second element whose class
    is ``uk-nav uk-nav-side`` is used as the menu. Every ``<li>`` in it that
    contains a link with an ``href`` contributes one URL.

    :return: list of absolute page URLs, in menu order
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000",
        timeout=30,
    )
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    # Skip list items without a usable link instead of failing on ``None``.
    return [
        "http://www.liaoxuefeng.com" + li.a.get('href')
        for li in menu_tag.find_all("li")
        if li.a is not None and li.a.get('href')
    ]
|
2. 通过文章url用模板保存每篇文章的HTML文件
html模板:
1
2
3
4
5
6
7
8
9
10
11
|
# Minimal HTML page skeleton; the ``{content}`` placeholder is filled with the
# article body through ``str.format``.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
|
进行保存:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
def parse_url_to_html(url, name):
    """Download one tutorial page and save its article body as an HTML file.

    The first element with class ``x-wiki-content`` is taken as the article
    body. The text of the first ``<h4>`` is inserted at the front of the body
    as a centered ``<h1>`` title, relative ``<img src>`` paths are rewritten
    to absolute URLs, and the result is wrapped in ``html_template`` and
    written to *name* encoded as UTF-8.

    :param url: URL of the page to fetch
    :param name: path of the HTML file to write
    :return: *name* on success, ``None`` if any step failed (the error is logged)
    """
    try:
        # A timeout keeps a stalled connection from blocking the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Article body
        body = soup.find_all(class_="x-wiki-content")[0]
        # Title
        title = soup.find('h4').get_text()
        # Put the title, centered, at the very front of the body
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(0, title_tag)
        body.insert(0, center_tag)
        html = str(body)

        # Rewrite relative img src paths in the body to absolute URLs.
        # Groups: 1 = everything up to and including src=", 2 = the src value,
        # 3 = the closing quote.
        pattern = r'(<img .*?src=")(.*?)(")'

        def absolutize(m):
            # The check must look at the src value (group 2); the original
            # tested group 3, the closing quote, so every src was prefixed.
            if not m.group(2).startswith("http"):
                return m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
            return m.group(1) + m.group(2) + m.group(3)

        html = re.compile(pattern).sub(absolutize, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception:
        # Best effort: one broken page is logged and reported as None.
        logging.error("解析错误", exc_info=True)
        return None
|
3. 把html转换成pdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
def save_pdf(htmls, file_name):
    """Render HTML file(s) to a PDF file with pdfkit.

    :param htmls: HTML file path(s), passed unchanged to ``pdfkit.from_file``
    :param file_name: path of the PDF file to write
    :return: None
    """
    page_margin = '0.75in'
    request_headers = [
        ('Accept-Encoding', 'gzip')
    ]
    # These name/value pairs are passed through exactly as given.
    request_cookies = [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ]
    options = {
        'page-size': 'Letter',
        'margin-top': page_margin,
        'margin-right': page_margin,
        'margin-bottom': page_margin,
        'margin-left': page_margin,
        'encoding': "UTF-8",
        'custom-header': request_headers,
        'cookie': request_cookies,
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
|
4. 把转换好的单个PDF合并为一个PDF
1
2
3
4
|
merger = PdfFileMerger()
# enumerate() supplies the running index for the progress message; the
# original printed ``i``, which this loop never defines.
for index, pdf in enumerate(pdfs):
    merger.append(open(pdf, 'rb'))
    print(u"合并完成第" + str(index) + '个pdf' + pdf)
|
完整源码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
# coding=utf-8
import os
import re
import time
import logging
import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger
# Minimal HTML page skeleton; the ``{content}`` placeholder is filled with the
# article body through ``str.format``.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
def parse_url_to_html(url, name):
    """Download one tutorial page and save its article body as an HTML file.

    The first element with class ``x-wiki-content`` is taken as the article
    body. The text of the first ``<h4>`` is inserted at the front of the body
    as a centered ``<h1>`` title, relative ``<img src>`` paths are rewritten
    to absolute URLs, and the result is wrapped in ``html_template`` and
    written to *name* encoded as UTF-8.

    :param url: URL of the page to fetch
    :param name: path of the HTML file to write
    :return: *name* on success, ``None`` if any step failed (the error is logged)
    """
    try:
        # A timeout keeps a stalled connection from blocking the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Article body
        body = soup.find_all(class_="x-wiki-content")[0]
        # Title
        title = soup.find('h4').get_text()
        # Put the title, centered, at the very front of the body
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(0, title_tag)
        body.insert(0, center_tag)
        html = str(body)

        # Rewrite relative img src paths in the body to absolute URLs.
        # Groups: 1 = everything up to and including src=", 2 = the src value,
        # 3 = the closing quote.
        pattern = r'(<img .*?src=")(.*?)(")'

        def absolutize(m):
            # The check must look at the src value (group 2); the original
            # tested group 3, the closing quote, so every src was prefixed.
            if not m.group(2).startswith("http"):
                return m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
            return m.group(1) + m.group(2) + m.group(3)

        html = re.compile(pattern).sub(absolutize, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception:
        # Best effort: one broken page is logged and reported as None.
        logging.error("解析错误", exc_info=True)
        return None
def get_url_list():
    """Collect the absolute URL of every entry in the tutorial's chapter menu.

    The tutorial index page is downloaded and the second element whose class
    is ``uk-nav uk-nav-side`` is used as the menu. Every ``<li>`` in it that
    contains a link with an ``href`` contributes one URL.

    :return: list of absolute page URLs, in menu order
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000",
        timeout=30,
    )
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    # Skip list items without a usable link instead of failing on ``None``.
    return [
        "http://www.liaoxuefeng.com" + li.a.get('href')
        for li in menu_tag.find_all("li")
        if li.a is not None and li.a.get('href')
    ]
def save_pdf(htmls, file_name):
    """Render HTML file(s) to a PDF file with pdfkit.

    :param htmls: HTML file path(s), passed unchanged to ``pdfkit.from_file``
    :param file_name: path of the PDF file to write
    :return: None
    """
    page_margin = '0.75in'
    request_headers = [
        ('Accept-Encoding', 'gzip')
    ]
    # These name/value pairs are passed through exactly as given.
    request_cookies = [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ]
    options = {
        'page-size': 'Letter',
        'margin-top': page_margin,
        'margin-right': page_margin,
        'margin-bottom': page_margin,
        'margin-left': page_margin,
        'encoding': "UTF-8",
        'custom-header': request_headers,
        'cookie': request_cookies,
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
def main():
    """Download every tutorial page, convert each to PDF and merge the PDFs.

    Steps: fetch the chapter URLs, save each page as ``<index>.html``, convert
    every saved page to ``liaoxuefeng_Python3_tutorial<index>.pdf``, merge the
    PDFs into ``廖雪峰Python_all.pdf``, delete the temporary HTML and PDF
    files, and print the elapsed time.
    """
    start = time.time()
    file_name = u"liaoxuefeng_Python3_tutorial"
    urls = get_url_list()

    # parse_url_to_html() returns None when a page could not be saved. Only
    # pages that were actually written are queued, and the count follows the
    # real URL list instead of a hard-coded 124.
    htmls = []
    pdfs = []
    for index, url in enumerate(urls):
        html = parse_url_to_html(url, str(index) + ".html")
        if html is None:
            continue
        htmls.append(html)
        pdfs.append(file_name + str(index) + '.pdf')

    if not pdfs:
        logging.error("no pages were downloaded; nothing to merge")
        return

    for i, (html, pdf) in enumerate(zip(htmls, pdfs)):
        save_pdf(html, pdf)
        print(u"转换完成第" + str(i) + '个html')

    merger = PdfFileMerger()
    handles = []
    try:
        for i, pdf in enumerate(pdfs):
            # The inputs must stay open until the merged file is written.
            handle = open(pdf, 'rb')
            handles.append(handle)
            merger.append(handle)
            print(u"合并完成第" + str(i) + '个pdf' + pdf)
        with open(u"廖雪峰Python_all.pdf", "wb") as output:
            merger.write(output)
        print(u"输出PDF成功!")
    finally:
        # Close every input before the temporary files are deleted; an open
        # handle would make os.remove() fail on Windows.
        for handle in handles:
            handle.close()

    for html in htmls:
        os.remove(html)
        print(u"删除临时文件" + html)
    for pdf in pdfs:
        os.remove(pdf)
        print(u"删除临时文件" + pdf)
    total_time = time.time() - start
    print(u"总共耗时:%f 秒" % total_time)
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/hubaoquanu/article/details/66973149