图片版pdf无法复制,转化成文字版的pdf后使用更方便.
我们需要用到Python3.6,PyPDF2,Ghostscript,PythonMagick,百度文字识别服务和pdfkit(pdfkit依赖wkhtmltopdf).
安装
安装python3.6 略
安装ghostscript
https://ghostscript.com/download/gsdnld.html
安装wkhtmltopdf
https://wkhtmltopdf.org/downloads.html
pip安装pypdf2,ghostscript,baidu-aip,pdfkit
1
2
3
4
|
pip install pypdf2
pip install ghostscript
pip install baidu-aip
pip install pdfkit
|
pip安装pythonmagick
https://www.lfd.uci.edu/~gohlke/pythonlibs/
1
2
|
cd 下载目录
pip install PythonMagick-0.9.13-cp36-cp36m-win_amd64.whl
|
pypdf2用于拆分和合并pdf
示例代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# Import PdfFileReader and PdfFileWriter (legacy PyPDF2 1.x names; the
# package name and its identifiers are case-sensitive).
from PyPDF2 import PdfFileReader, PdfFileWriter

# The source file stays open while its page is in use; the `with` blocks
# close both file handles once the new PDF has been written.
with open(r'pdf路径', 'rb') as src_file:
    # Get a PDF reader object
    pdf_input = PdfFileReader(src_file)
    # Get the number of pages in the PDF
    page_count = pdf_input.getNumPages()
    # Get the fourth page of the PDF (index 3)
    page = pdf_input.getPage(3)
    # Look up the page's '/Contents' entry (PDF dictionary keys are case-sensitive)
    page['/Contents']
    # Get a PDF writer object
    pdf_output = PdfFileWriter()
    # Add the page object to the PdfFileWriter
    pdf_output.addPage(page)
    # Save the new PDF
    with open(r'新pdf路径', 'wb') as dst_file:
        pdf_output.write(dst_file)
|
pythonmagick用于将单页pdf转化为jpg
百度云-文字识别-python sdk
每天有500次免费的识别
示例代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# Import the OCR client class from baidu-aip (the class name is case-sensitive)
from aip import AipOcr

# https://console.bce.baidu.com/#/index/overview
# Products & Services -> Artificial Intelligence -> Text Recognition -> Create Application
# Obtain the following three values from the created application
app_id = '??'
api_key = '??'
secret_key = '?? '
# Create an AipOcr client
client = AipOcr(app_id, api_key, secret_key)
# Helper for reading a local image
def get_file_content(filepath):
    """Return the raw bytes of the file at *filepath*.

    The file is opened in binary mode and is closed automatically when the
    ``with`` block exits.
    """
    with open(filepath, 'rb') as fp:
        return fp.read()
# Read a local image
image = get_file_content('p1.jpg')
# Optional parameters
options = {
    "language_type": "chn_eng",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
# General text recognition (SDK method names are camelCase and case-sensitive)
client.basicGeneral(image, options)
# Recognize an image that is available at a URL
url = "https://note.youdao.com/yws/public/resource/1577071c1ffa2b6bf4e238ef6dbcfbf5/xmlnote/e5a19bedfeba4879b217c5bbf53b0245/22138"
# Optional parameters
options = {
    "language_type": "chn_eng",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
# General text recognition on the URL
client.basicGeneralUrl(url, options)
# Helper for reading a local image of a table
def get_file_content(filepath):
    """Return the raw bytes of the file at *filepath*.

    The file is opened in binary mode and is closed automatically when the
    ``with`` block exits.
    """
    with open(filepath, 'rb') as fp:
        return fp.read()
# Read a local image of a table
image = get_file_content('p2.jpg')
# Optional parameters
options = {
    "language_type": "chn_eng",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
# General text recognition (SDK method names are camelCase and case-sensitive)
client.basicGeneral(image, options)
# Note: tables are segmented rather poorly by this recognition call!
|
pdfkit用于利用字符串生成pdf
示例代码如下:
1
2
3
4
5
6
7
|
# pdfkit must be imported before it can be configured
import pdfkit

# Install-location setting: this value is handed to pdfkit as the wkhtmltopdf path
path_wk = r'pdfkit安装位置设置'
pdfkit_config = pdfkit.configuration(wkhtmltopdf=path_wk)
# pdfkit options
pdfkit_options = {'encoding': 'utf-8'}
# Build the PDF. The output path is a raw string so that "\t" in the Windows
# path is a backslash followed by "t" rather than a tab character.
pdfkit.from_string('string', r'd:\test.pdf', configuration=pdfkit_config, options=pdfkit_options)
|
完整代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
# Import the required packages
# os, for working with files and paths
import os
# ExitStack, keeps the per-page PDF files open until the merged PDF is written
from contextlib import ExitStack

# ghostscript, simplifies the code
import ghostscript
# PyPDF2, for splitting and merging PDFs (legacy 1.x names, case-sensitive)
from PyPDF2 import PdfFileReader, PdfFileWriter
# PythonMagick, converts a single PDF page into an image
from PythonMagick import Image
# baidu-aip, Baidu text recognition
from aip import AipOcr
# pdfkit, builds a PDF from a string
import pdfkit

# Parameters
path = '??'
pdfname = '??'
dpi = '85'
# https://console.bce.baidu.com/#/index/overview
# Products & Services -> Artificial Intelligence -> Text Recognition -> Create Application
# Obtain the following three values from the created application
app_id = '??'
api_key = '??'
secret_key = '?? '
# Install-location setting: this value is handed to pdfkit as the wkhtmltopdf path
path_wk = r'pdfkit安装位置设置'
pdfkit_config = pdfkit.configuration(wkhtmltopdf=path_wk)
# pdfkit options
pdfkit_options = {'encoding': 'utf-8'}

# --- Convert the PDF into images ---
# Every file name below is relative to `path`, so change directory once here.
os.chdir(path)
with open(pdfname, 'rb') as pdf_file:
    pdf_input = PdfFileReader(pdf_file)
    # Detect the number of pages automatically
    page_count = pdf_input.getNumPages()
page_range = range(page_count)
# The pages to convert can also be set by hand:
# page_range = range(0, 100)
# Render each page "<pdfname>[<n>]" to "<n>.jpg" with PythonMagick
for page_num in page_range:
    im = Image()
    im.density(dpi)
    im.read(f'{pdfname}[{page_num}]')
    im.write(f'{page_num}.jpg')

# --- Convert the images into strings ---
# Create an AipOcr client
client = AipOcr(app_id, api_key, secret_key)


# Helper for reading a local image
def get_file_content(filepath):
    """Return the raw bytes of the file at *filepath*."""
    with open(filepath, 'rb') as fp:
        return fp.read()


# Optional parameters
options = {
    "language_type": "chn_eng",
    "detect_direction": "false",
    "detect_language": "false",
    "probability": "false",
}
allteststr = []
for page_num in page_range:
    # Read the local image produced for this page
    image = get_file_content(os.path.join(path, f'{page_num}.jpg'))
    # General text recognition; the result is a dict
    testjson = client.basicGeneral(image, options)
    # One recognized line per entry, each followed by a line-break tag
    teststr = ''.join(x['words'] + '</br>' for x in testjson['words_result'])
    allteststr.append(teststr)

# --- Write each page's string to its own PDF ---
for page_num in page_range:
    pdfkit.from_string(
        allteststr[page_num],
        f'{page_num}.pdf',
        configuration=pdfkit_config,
        options=pdfkit_options,
    )

# --- Merge the single-page PDFs ---
pdf_output = PdfFileWriter()
# The working directory is already `path`; repeating os.chdir(path) per page
# would fail for a relative `path`, so it is not called again here.
with ExitStack() as open_files:
    for page_num in page_range:
        # Keep each source file open until the merged file has been written,
        # since the writer may still read page data from it at write time.
        page_file = open_files.enter_context(open(f'{page_num}.pdf', 'rb'))
        pdf_input = PdfFileReader(page_file)
        page = pdf_input.getPage(0)
        pdf_output.addPage(page)
    # Closing the output file via `with` makes sure the merged PDF is flushed
    with open('newpdf.pdf', 'wb') as merged_file:
        pdf_output.write(merged_file)
|
以上就是如何使用Python3.6,PyPDF2,Ghostscript,PythonMagick,百度文字识别服务和pdfkit将图片版pdf转化为文字版pdf的介绍.
原文链接:https://blog.csdn.net/sqq513/article/details/79368243