PDF匹配文本精准标记红框算法

时间:2024-10-12 08:25:59
## pip install pdfminer.six
## pip install PyMuPDF

import fitz
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar


## pdf匹配文本标红框
def pdfMarkedBox(string, name, address="", mode=0, order=0, skew=0):
    """Find a text element in a PDF and save a copy with a box drawn around it.

    The PDF at ``address`` is parsed with pdfminer. For every top-level
    LTTextBox / LTTextLine layout element the text, the bounding box and the
    zero-based page number are collected in iteration order. Elements whose
    text (with newlines removed) matches ``string`` are the candidates; the
    candidate selected by ``order``, shifted by ``skew`` elements, is outlined
    with a PyMuPDF rectangle annotation and the document is saved to ``name``.

    :param string:  text to look for
    :param name:    path of the annotated PDF to write
    :param address: path of the PDF to read
    :param mode:    0 = element text must equal ``string``;
                    1 = element text must contain ``string``
    :param order:   index into the list of matches (default: first match)
    :param skew:    offset, counted in layout elements, from the matched
                    element to the element that is actually boxed; useful
                    when the wanted text is unstable but sits at a fixed
                    distance from a stable marker
    :return:        None
    :raises ValueError: if ``mode`` is neither 0 nor 1
    :raises IndexError: if there is no match at ``order`` or the skewed
                        position falls outside the extracted elements
    """
    # Reject an unknown mode up front; otherwise no matching branch runs and
    # the failure only surfaces later as an unexplained IndexError.
    if mode not in (0, 1):
        raise ValueError(
            f"mode must be 0 (exact match) or 1 (substring match), got {mode!r}")

    # Parallel lists: element i has text list_text[i], bounding box
    # list_box[i] and lives on page list_number[i].
    list_text = []
    list_box = []
    list_number = []

    def parse_pdf(path):
        """Fill the three parallel lists from the PDF at ``path``."""
        with open(path, 'rb') as fp:
            doc = PDFDocument(PDFParser(fp))
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page_number, page in enumerate(PDFPage.create_pages(doc)):
                interpreter.process_page(page)
                for element in device.get_result():
                    if isinstance(element, (LTTextBox, LTTextLine)):
                        list_text.append(element.get_text())
                        list_box.append(element.bbox)
                        list_number.append(page_number)

    parse_pdf(address)
    print(list_text)
    print(list_box)
    print(list_number)

    # Collect the indices of all elements that satisfy the chosen match mode.
    if mode == 0:
        print("进行完全匹配")
        list_index = [index for index, value in enumerate(list_text)
                      if string == value.replace("\n", "")]
    else:
        print("进行包含匹配")
        list_index = [index for index, value in enumerate(list_text)
                      if string in value.replace("\n", "")]
    print("匹配数量为:", len(list_index))

    try:
        matched = list_index[order]
    except IndexError:
        raise IndexError(
            f"no match at order={order!r} for {string!r}: "
            f"{len(list_index)} match(es) found") from None

    # A negative target would silently wrap around to the end of the list and
    # box an unrelated element, so bounds are checked explicitly.
    target = matched + skew
    if not 0 <= target < len(list_text):
        raise IndexError(
            f"skew={skew!r} moves match {matched} to position {target}, "
            f"outside the {len(list_text)} extracted text elements")

    print("读取位置:", target)
    print("读取文本:", list_text[target])
    print("读取坐标:", list_box[target])
    print("读取页码:", list_number[target])

    def redBox(address, page, box, name):
        """Draw a rectangle annotation around ``box`` on ``page`` and save to ``name``."""
        # The context manager closes the document even if annotating or
        # saving fails.
        with fitz.open(address) as doc:
            pdf_page = doc.load_page(page)  # zero-based page index
            page_bottom = pdf_page.rect.y1
            left, bottom, right, top = box
            n = 5  # padding added on every side of the text box
            # The box's vertical coordinates are flipped against the page's
            # bottom edge before being handed to PyMuPDF.
            new_rect = (left - n, page_bottom - top - n,
                        right + n, page_bottom - bottom + n)
            pdf_page.add_rect_annot(new_rect)
            doc.save(name)
        print("文件完成标红框:", name)

    redBox(address, list_number[target], list_box[target], name)

 

## pip install pdfminer.six
## pip install PyMuPDF

import fitz
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar


## pdf匹配文本标红框
def pdfMarkedBox(string, name, address="", mode=0, order=0, skew=0):
    """Find a text element in a PDF and save a copy with a box drawn around it.

    The PDF at ``address`` is parsed with pdfminer. For every top-level
    LTTextBox / LTTextLine layout element the text, the bounding box and the
    zero-based page number are collected in iteration order. Elements whose
    text (with newlines removed) matches ``string`` are the candidates; the
    candidate selected by ``order``, shifted by ``skew`` elements, is outlined
    with a PyMuPDF rectangle annotation and the document is saved to ``name``.

    :param string:  text to look for
    :param name:    path of the annotated PDF to write
    :param address: path of the PDF to read
    :param mode:    0 = element text must equal ``string``;
                    1 = element text must contain ``string``
    :param order:   index into the list of matches (default: first match)
    :param skew:    offset, counted in layout elements, from the matched
                    element to the element that is actually boxed; useful
                    when the wanted text is unstable but sits at a fixed
                    distance from a stable marker
    :return:        None
    :raises ValueError: if ``mode`` is neither 0 nor 1
    :raises IndexError: if there is no match at ``order`` or the skewed
                        position falls outside the extracted elements
    """
    # Reject an unknown mode up front; otherwise no matching branch runs and
    # the failure only surfaces later as an unexplained IndexError.
    if mode not in (0, 1):
        raise ValueError(
            f"mode must be 0 (exact match) or 1 (substring match), got {mode!r}")

    # Parallel lists: element i has text list_text[i], bounding box
    # list_box[i] and lives on page list_number[i].
    list_text = []
    list_box = []
    list_number = []

    def parse_pdf(path):
        """Fill the three parallel lists from the PDF at ``path``."""
        with open(path, 'rb') as fp:
            doc = PDFDocument(PDFParser(fp))
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page_number, page in enumerate(PDFPage.create_pages(doc)):
                interpreter.process_page(page)
                for element in device.get_result():
                    if isinstance(element, (LTTextBox, LTTextLine)):
                        list_text.append(element.get_text())
                        list_box.append(element.bbox)
                        list_number.append(page_number)

    parse_pdf(address)
    print(list_text)
    print(list_box)
    print(list_number)

    # Collect the indices of all elements that satisfy the chosen match mode.
    if mode == 0:
        print("进行完全匹配")
        list_index = [index for index, value in enumerate(list_text)
                      if string == value.replace("\n", "")]
    else:
        print("进行包含匹配")
        list_index = [index for index, value in enumerate(list_text)
                      if string in value.replace("\n", "")]
    print("匹配数量为:", len(list_index))

    try:
        matched = list_index[order]
    except IndexError:
        raise IndexError(
            f"no match at order={order!r} for {string!r}: "
            f"{len(list_index)} match(es) found") from None

    # A negative target would silently wrap around to the end of the list and
    # box an unrelated element, so bounds are checked explicitly.
    target = matched + skew
    if not 0 <= target < len(list_text):
        raise IndexError(
            f"skew={skew!r} moves match {matched} to position {target}, "
            f"outside the {len(list_text)} extracted text elements")

    print("读取位置:", target)
    print("读取文本:", list_text[target])
    print("读取坐标:", list_box[target])
    print("读取页码:", list_number[target])

    def redBox(address, page, box, name):
        """Draw a rectangle annotation around ``box`` on ``page`` and save to ``name``."""
        # The context manager closes the document even if annotating or
        # saving fails.
        with fitz.open(address) as doc:
            pdf_page = doc.load_page(page)  # zero-based page index
            page_bottom = pdf_page.rect.y1
            left, bottom, right, top = box
            n = 5  # padding added on every side of the text box
            # The box's vertical coordinates are flipped against the page's
            # bottom edge before being handed to PyMuPDF.
            new_rect = (left - n, page_bottom - top - n,
                        right + n, page_bottom - bottom + n)
            pdf_page.add_rect_annot(new_rect)
            doc.save(name)
        print("文件完成标红框:", name)

    redBox(address, list_number[target], list_box[target], name)