自动化将 word 转为 pdf,再将pdf转为图片!
参考:
https://blog.csdn.net/ynyn2013/article/details/49120731
https://www.jianshu.com/p/f57cc64b9f5e
一、将 doc 转为 pdf
1、install 依赖
pip install pywin32
2、直接调用win32com接口打开文件,另存为pdf。SaveAs中的参数17代表存为pdf格式(即 wdFormatPDF),完了关闭文件,关闭word。
def doc2pdf(self):
    """Save the Word document at ``self.docPath`` as a PDF at ``self.pdfPath``.

    Word is driven through its COM automation interface. Conversion
    errors are printed (not raised), and the method always finishes by
    reporting through ``self.checkFile`` whether the PDF exists.
    """
    word = None
    doc = None
    try:
        word = Dispatch("Word.Application")
        doc = word.Documents.Open(self.docPath, ReadOnly=1)
        # 17 is Word's wdFormatPDF save format.
        doc.SaveAs(self.pdfPath, 17)
    except Exception:
        # Best effort: report the failure and fall through to the check.
        traceback.print_exc()
    finally:
        # Only release what was actually acquired; if Dispatch() or Open()
        # failed, the corresponding name was never bound to a COM object.
        try:
            if doc is not None:
                doc.Close()
        finally:
            # Quit Word even when closing the document itself fails.
            if word is not None:
                word.Quit()
    self.checkFile(self.pdfPath, 'pdf')
以下为文件格式对应表
wdFormatDocument = 0
wdFormatDocument97 = 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18
二、将pdf转为图片
1、install 依赖
1.1、pip install PyMuPDF
1.2、Windows安装配置poppler
Windows用户必须为Windows安装poppler (http://blog.alivate.com.au/poppler-windows/),然后将bin/文件夹添加到PATH(开始>输入env>编辑系统环境变量>环境变量...>系统变量>Path)。注:poppler 是 pdf2image 库所需的依赖;下文代码使用的是 PyMuPDF(fitz),它本身不依赖 poppler。
安装完poppler需重启系统后生效。
2、将pdf转为图片
def pdf2image(self):
    """Render every page of ``self.pdfPath`` to a JPEG inside ``self.imgFold``.

    Images are named ``<page number>.jpg`` using PyMuPDF's zero-based
    ``page.number``. After rendering, ``self.checkFile`` reports whether
    the last image written exists.
    """
    # Create the image folder; makedirs tolerates an existing folder and
    # also creates missing parents.
    os.makedirs(self.imgFold, exist_ok=True)

    lastImgPath = None
    pdf = fitz.open(self.pdfPath)
    try:
        for page in pdf:
            lastImgPath = os.path.join(self.imgFold, str(page.number) + '.jpg')
            # PyMuPDF renamed getPixmap/writeImage to get_pixmap/save;
            # prefer the new names and fall back for older releases.
            render = getattr(page, 'get_pixmap', None) or page.getPixmap
            pix = render()
            write = getattr(pix, 'save', None) or pix.writeImage
            write(lastImgPath)
    finally:
        # Release the file handle even if rendering a page fails.
        pdf.close()

    # A PDF without pages writes no image; an empty path makes checkFile
    # report "not existed" instead of failing on an unbound variable.
    self.checkFile(lastImgPath or '', 'last img')
三、直接将word转为图片
方法:结合1,2
代码如下:
"""
@Author: haikuoxin
@Date: 2019-11-11 11:21:12
@Last Modified by:   haikuoxin
@Last Modified time: 2019-11-11 11:21:12

Convert Word documents to PDF through Word's COM automation interface,
then render each PDF page to a JPEG image with PyMuPDF.
"""
import os
import time
import traceback
import fitz
from win32com.client import Dispatch  # not used by the comtypes-based code below
import comtypes.client


# Word's wdFormatPDF save format.
WD_FORMAT_PDF = 17
# File extensions that run() hands to Word.
WORD_EXTENSIONS = ('.doc', '.docx')
# Folder processed by run() when no folder is given.
DEFAULT_DATA_FOLD = r'C:\Users\Administrator\Desktop\chatuClass\data\online'


class Word2Pdf2Img():
    """Convert one Word document to a PDF and to one JPEG per page.

    Outputs are written next to the source document: ``<name>.pdf`` and a
    folder ``<name>/`` holding ``0.jpg``, ``1.jpg``, ...
    """

    def __init__(self, docPath):
        """Derive all output paths from *docPath*.

        Args:
            docPath: Path of the Word document to convert.
        """
        # NOTE(review): made absolute because Word runs as a separate COM
        # server and presumably resolves relative paths against its own
        # working directory - confirm on the target machine.
        self.docPath = os.path.abspath(docPath)
        # splitext strips only the final extension, so "a.b.docx" keeps "a.b".
        self.fileName = os.path.splitext(os.path.basename(self.docPath))[0]
        self.fileFold = os.path.dirname(self.docPath)
        self.pdfPath = os.path.join(self.fileFold, self.fileName + '.pdf')
        self.imgFold = os.path.join(self.fileFold, self.fileName)

    @staticmethod
    def checkFile(filePath, fileType=''):
        """Print whether *filePath* is an existing file, labelled *fileType*."""
        if os.path.isfile(filePath):
            print('file {} existed!'.format(fileType))
        else:
            print('file {} not existed!'.format(fileType))

    def doc2pdf(self):
        """Save ``self.docPath`` as a PDF at ``self.pdfPath`` using Word.

        Any COM error propagates to the caller, but the document and the
        Word instance are released first so no Word process is left behind.
        """
        word = comtypes.client.CreateObject('Word.Application')
        doc = None
        try:
            word.Visible = True
            # NOTE(review): pause kept from the original; presumably it
            # gives Word time to finish starting - confirm it is needed.
            time.sleep(3)
            doc = word.Documents.Open(self.docPath)
            doc.SaveAs(self.pdfPath, FileFormat=WD_FORMAT_PDF)
        finally:
            try:
                if doc is not None:
                    doc.Close()
            finally:
                # Quit Word even when closing the document fails.
                word.Quit()
        self.checkFile(self.pdfPath, 'pdf')

    def pdf2image(self):
        """Render every page of ``self.pdfPath`` to a JPEG in ``self.imgFold``."""
        # Create the image folder; tolerates an existing folder.
        os.makedirs(self.imgFold, exist_ok=True)

        lastImgPath = None
        pdf = fitz.open(self.pdfPath)
        try:
            for page in pdf:
                lastImgPath = os.path.join(
                    self.imgFold, str(page.number) + '.jpg')
                # PyMuPDF renamed getPixmap/writeImage to get_pixmap/save;
                # prefer the new names and fall back for older releases.
                render = getattr(page, 'get_pixmap', None) or page.getPixmap
                pix = render()
                write = getattr(pix, 'save', None) or pix.writeImage
                write(lastImgPath)
        finally:
            # Release the file handle even if rendering a page fails.
            pdf.close()

        # A PDF without pages writes no image; an empty path makes
        # checkFile report "not existed" instead of raising NameError.
        self.checkFile(lastImgPath or '', 'last img')

    def doc2image(self):
        """Run the full pipeline: Word -> PDF -> page images."""
        self.doc2pdf()
        self.pdf2image()


def _is_word_file(name):
    """Return True for .doc/.docx names that are not Word lock files."""
    # "~$..." files are the temporary owner files Word creates while a
    # document is open; they cannot be converted.
    return name.lower().endswith(WORD_EXTENSIONS) and not name.startswith('~$')


def run(dataFold=DEFAULT_DATA_FOLD):
    """Convert every Word document directly inside *dataFold*.

    Only ``.doc``/``.docx`` entries are processed, so the PDFs and image
    folders produced by earlier runs in the same folder are skipped.
    A failure on one document is printed and counted, then the batch
    continues with the next one.

    Args:
        dataFold: Folder containing the Word documents.
    """
    docPaths = [
        os.path.join(dataFold, name)
        for name in sorted(os.listdir(dataFold))
        if _is_word_file(name)
    ]
    docCnt = len(docPaths)
    errorCnt = 0
    for i, docPath in enumerate(docPaths):
        try:
            cvter = Word2Pdf2Img(docPath)
            print(docCnt, i, errorCnt, cvter.fileName)
            cvter.doc2image()
        except Exception:
            # Keep the batch going, but let Ctrl+C still interrupt it.
            errorCnt += 1
            traceback.print_exc()


if __name__ == "__main__":
    run()
    # To convert a single document instead:
    #   Word2Pdf2Img(r'C:\path\to\file.docx').doc2pdf()