自动化将 word 转为 pdf,再将pdf转为图片! - 海阔心

时间:2024-02-19 22:50:17

自动化将 word 转为 pdf,再将pdf转为图片!

参考:

https://blog.csdn.net/ynyn2013/article/details/49120731

https://www.jianshu.com/p/f57cc64b9f5e

 

一、将 doc 转为 pdf

1、install 依赖

pip install pywin32

 

2、直接调用win32com接口打开文件,另存为pdf。SaveAs中的参数17代表存为pdf格式(即下表中的 wdFormatPDF),完成后关闭文件,关闭word。

 1     def doc2pdf(self):
 2         try:
 3             w = Dispatch("Word.Application")
 4             doc = w.Documents.Open(self.docPath, ReadOnly=1)
 5             doc.SaveAs(self.pdfPath, 17)
 6         except:
 7             traceback.print_exc()
 8         finally:
 9             doc.Close()
10             w.Quit()
11         self.checkFile(self.pdfPath, \'pdf\')

以下为文件格式对应表

wdFormatDocument = 0
wdFormatDocument97 = 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18

  

 

二、将pdf转为图片

1、install 依赖

1.1、pip install PyMuPDF

 

1.2、Windows安装配置poppler(可选:下文代码使用的 PyMuPDF(fitz)本身不依赖 poppler,仅在改用 pdf2image 库时才需要此步骤)
Windows用户必须为Windows安装poppler (http://blog.alivate.com.au/poppler-windows/),然后将bin/文件夹添加到PATH(开始>输入env>编辑系统环境变量>环境变量...>系统变量>Path)
安装完poppler需重启系统后生效。
 
2、将pdf转为图片
 1     def pdf2image(self):
 2         # 建立图片文件夹
 3         if not os.path.exists(self.imgFold):
 4             os.mkdir(self.imgFold)
 5 
 6         # 转存图片
 7         pages = fitz.open(self.pdfPath)
 8         for page in pages:
 9             imgPath = os.path.join(self.imgFold, str(page.number)+\'.jpg\')
10             pix = page.getPixmap()
11             pix.writeImage(imgPath)
12         self.checkFile(imgPath, \'last img\')

 

三、直接将word转为图片

方法:结合1,2

代码如下:

 1 \'\'\'
 2 @Author: haikuoxin
 3 @Date: 2019-11-11 11:21:12
 4 @Last Modified by:   haikuoxin
 5 @Last Modified time: 2019-11-11 11:21:12
 6 \'\'\'
 7 import os
 8 import time
 9 import traceback
10 import fitz
11 from win32com.client import Dispatch
12 import comtypes.client
13 
14 
class Word2Pdf2Img():
    """Convert a Word document to PDF, then render each PDF page to a JPEG.

    All outputs are written next to the source document: ``<name>.pdf`` and
    a folder ``<name>/`` containing one ``<page index>.jpg`` per page.
    """

    def __init__(self, docPath):
        """Derive the output paths from ``docPath``.

        Args:
            docPath: Path of the Word document to convert.
        """
        self.docPath = docPath
        # splitext() drops only the final extension, so "report.v2.docx"
        # keeps the stem "report.v2" rather than being cut at the first dot
        # (which made differently named documents share the same outputs).
        self.fileName = os.path.splitext(os.path.basename(self.docPath))[0]
        self.fileFold = os.path.dirname(self.docPath)
        self.pdfPath = os.path.join(self.fileFold, self.fileName + '.pdf')
        self.imgFold = os.path.join(self.fileFold, self.fileName)

    @staticmethod
    def checkFile(filePath, fileType=''):
        """Print whether ``filePath`` is an existing file.

        Args:
            filePath: Path to check.
            fileType: Label inserted into the printed message.

        Returns:
            True if ``filePath`` is an existing regular file, else False.
        """
        exists = os.path.isfile(filePath)
        if exists:
            print('file {} existed!'.format(fileType))
        else:
            print('file {} not existed!'.format(fileType))
        return exists

    def doc2pdf(self):
        """Export ``self.docPath`` to ``self.pdfPath`` through Word's COM API.

        Exceptions from Word propagate to the caller, but the document and
        the Word application are always closed first.
        """
        w = comtypes.client.CreateObject('Word.Application')
        try:
            w.Visible = True
            # NOTE(review): presumably gives Word time to finish starting
            # before the first document is opened -- confirm it is needed.
            time.sleep(3)
            doc = w.Documents.Open(self.docPath)
            try:
                # 17 is wdFormatPDF (WdSaveFormat enumeration).
                doc.SaveAs(self.pdfPath, FileFormat=17)
            finally:
                doc.Close()
        finally:
            # Quit even on failure so a broken document does not leave a
            # Word process running.
            w.Quit()
        self.checkFile(self.pdfPath, 'pdf')

    def pdf2image(self):
        """Render every page of ``self.pdfPath`` to ``<imgFold>/<page index>.jpg``.

        Page files are named after ``page.number``. After rendering,
        ``checkFile`` reports whether the last written image exists.
        """
        # Create the image folder (and any missing parents); no error if it
        # is already there.
        os.makedirs(self.imgFold, exist_ok=True)

        imgPath = None
        pages = fitz.open(self.pdfPath)
        try:
            for page in pages:
                imgPath = os.path.join(self.imgFold, str(page.number) + '.jpg')
                # PyMuPDF renamed getPixmap -> get_pixmap and
                # Pixmap.writeImage -> Pixmap.save; prefer the new names and
                # fall back to the old ones so older installs keep working.
                render = getattr(page, 'get_pixmap', None) or page.getPixmap
                pix = render()
                write = getattr(pix, 'save', None) or pix.writeImage
                write(imgPath)
        finally:
            # Always release the PDF handle, even if a page fails to render.
            pages.close()

        # With no rendered page there is no "last image"; report it as
        # missing instead of failing on an unbound variable.
        self.checkFile(imgPath or '', 'last img')

    def doc2image(self):
        """Run the full pipeline: Word -> PDF -> one JPEG per page."""
        self.doc2pdf()
        self.pdf2image()
def run(dataFold=r'C:\Users\Administrator\Desktop\chatuClass\data\online'):
    """Convert every Word document found directly inside ``dataFold``.

    Only regular ``.doc`` / ``.docx`` files are processed. The converter
    writes its PDFs and image folders into this same directory, so without
    the filter a second run would hand those outputs to Word as well.
    Names starting with ``~$`` are skipped (presumably Word's temporary
    lock files -- verify for your environment).

    Args:
        dataFold: Directory to scan. Defaults to the author's original
            hard-coded folder.

    Returns:
        The number of documents whose conversion raised an exception.
    """
    docPaths = []
    for name in sorted(os.listdir(dataFold)):
        if name.startswith('~$'):
            continue
        if not name.lower().endswith(('.doc', '.docx')):
            continue
        path = os.path.join(dataFold, name)
        if os.path.isfile(path):
            docPaths.append(path)

    docCnt = len(docPaths)
    errorCnt = 0
    for i, docPath in enumerate(docPaths):
        try:
            cvter = Word2Pdf2Img(docPath)
            # Progress line: total, current index, failures so far, name.
            print(docCnt, i, errorCnt, cvter.fileName)
            cvter.doc2image()
        except Exception:
            # One bad document must not stop the batch; Exception (not a
            # bare except) keeps Ctrl+C working.
            errorCnt += 1
            traceback.print_exc()
    return errorCnt


if __name__ == "__main__":
    run()
    # Single-file example:
    # docPath = r'C:\Users\Administrator\Desktop\chatuClass\data\test\b.docx'
    # cvter = Word2Pdf2Img(docPath)
    # cvter.doc2pdf()