非结构化解析【包含PDF、Word、PPT】-PPT操作

时间:2024-04-14 07:27:14
import os
from zipfile import ZipFile
from pptx import Presentation
from docx import Document

class PPT:
    """Helpers for pulling embedded images and text out of a .pptx file.

    The methods keep no instance state; the class only groups them.
    """

    def PPT_get_pictrue(self, infile):
        """Extract the embedded media files of a presentation.

        A .pptx file is a zip archive. Every archive member stored under
        ``ppt/media/`` is unpacked into ``./resluts/<name>``, where
        ``<name>`` is the file name of ``infile`` without its extension.
        The archive's internal layout is preserved, so the files end up in
        ``./resluts/<name>/ppt/media/``.

        Args:
            infile: Path to the .pptx file (any directory depth).

        Returns:
            The output directory ``./resluts/<name>``; it is created when
            missing, even if the archive holds no media.
        """
        # Take the name from the last path component rather than assuming
        # exactly two leading directories and a five-character extension.
        name = os.path.splitext(os.path.basename(infile))[0]
        # NOTE: 'resluts' (sic) is kept so existing output locations stay put.
        new_filepath = os.path.join('./resluts', name)
        os.makedirs(new_filepath, exist_ok=True)
        with ZipFile(infile) as archive:
            media = [
                member for member in archive.namelist()
                if member.startswith("ppt/media/")
            ]
            archive.extractall(path=new_filepath, members=media)

        return new_filepath

    def PPT_get_words_to_txt(self, inpath, outpath):
        """Dump the text of a presentation into ``<outpath>/resluts.txt``.

        Slides are visited in order; shapes without a text frame are
        skipped. Every run of every paragraph is written on its own line,
        and the file is encoded as UTF-8.

        Args:
            inpath: Path to the .pptx file.
            outpath: Directory that receives ``resluts.txt``; created when
                it does not exist yet.
        """
        m_ppt = Presentation(inpath)
        if outpath:
            os.makedirs(outpath, exist_ok=True)
        target = os.path.join(outpath, 'resluts.txt')
        with open(target, 'w', encoding='utf-8') as f:
            for slide in m_ppt.slides:
                for shape in slide.shapes:
                    if not shape.has_text_frame:
                        continue
                    for paragraph in shape.text_frame.paragraphs:
                        f.writelines(run.text + '\n' for run in paragraph.runs)

    def PPT_get_words_to_docx(self, filepath, save_path):
        """Copy the text of a presentation into a new Word document.

        Each paragraph of each text-bearing shape becomes one paragraph in
        the document, in slide order.

        Args:
            filepath: Path to the .pptx file.
            save_path: Path the resulting .docx file is saved to.
        """
        wordfile = Document()
        pptx = Presentation(filepath)
        for slide in pptx.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    wordfile.add_paragraph(paragraph.text)

        wordfile.save(save_path)


if __name__ == "__main__":
    # Demo run: extract the images first, then write the slide text into the
    # directory that the image extraction returned.
    infile = "./test_data/OpenCV算法解析.pptx"
    ppt_tool = PPT()
    new_infile = ppt_tool.PPT_get_pictrue(infile)
    ppt_tool.PPT_get_words_to_txt(infile, new_infile)

结果如下:
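（原文此处为运行结果截图，图片未随文本保留。按上面的代码，提取出的图片位于 ./resluts/OpenCV算法解析/ppt/media/ 目录，文本写入 ./resluts/OpenCV算法解析/resluts.txt。）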