docx文档表格抽取(openoffice)

时间:2021-11-01 21:07:16

引子

之前是借助 Microsoft Office(Word)来抽取 Word 文档中的 table 并转成 png 的,过程和思路这里就不再赘述了。由于 Microsoft Office 的局限性(只能在安装了 Office 的 Windows 环境下使用),这里改用 OpenOffice 来实现从 Word 文档中抽取 table 的功能。

1.因为通过setTable生成新的文档会出现样式丢失的情况,所以这里先记录下每个table在文档body中的index,然后针对每个table重新加载原文档,移除该index以外的所有Element,从而得到只包含这一个table(且样式完整)的新文档。

/**
 * Splits every table of a .docx file into its own single-table document and
 * passes each of those documents to {@code word2img} for rendering.
 *
 * <p>For each table the source file is re-read from {@code filePath}, every body
 * element except that table is removed, and the result is written to
 * {@code picPath/table<j>.docx}. Re-reading and removing elements (instead of
 * copying the table into a new document) keeps the table's styling.
 *
 * <p>NOTE(review): the returned paths are {@code picPath/pic<j>.png}, while the
 * conversion chain visible in this file derives the image name from the
 * {@code table<j>.docx} name (i.e. {@code table<j>.png}). Confirm which name the
 * callers actually expect before relying on the returned list.
 *
 * @param filePath path of the source .docx file
 * @param picPath  directory that receives the intermediate documents and images;
 *                 created if it does not exist
 * @param doc      already-opened source document, used only to locate the tables
 * @return one image path per table, in document order
 * @throws Exception if reading, writing or converting a document fails
 */
public static List<String> getWordExcel2007(String filePath,String picPath,XWPFDocument doc) throws Exception {
    if (picPath != null && picPath.trim().length() > 0) {
        // Create the output directory, including any missing parent directories.
        File imgFile = new File(picPath);
        if (!imgFile.exists()) {
            imgFile.mkdirs();
        }
    }
    List<String> list = new ArrayList<String>();

    // Record the body-element index of every table in the document.
    List<IBodyElement> bodyElements = doc.getBodyElements();
    List<Integer> tableIndex = new ArrayList<>();
    for (int i = 0; i < bodyElements.size(); i++) {
        BodyElementType elementType = bodyElements.get(i).getElementType();
        if (BodyElementType.TABLE.equals(elementType)) {
            tableIndex.add(i);
        }
    }

    for (int j = 0; j < tableIndex.size(); j++) {
        String fileOutPath = picPath + File.separator + "table" + j + ".docx";
        // Image path reported to the caller.
        String picoutpath = picPath + File.separator + "pic" + j + ".png";
        int keepIndex = tableIndex.get(j);

        // Both streams are closed before the conversion starts, so the written
        // document is complete on disk and can later be deleted by the converter.
        try (InputStream is = new FileInputStream(filePath)) {
            XWPFDocument tableDoc = new XWPFDocument(is);
            // Set the page margins.
            setDocumentMargin(tableDoc, "1797", "1440", "1797", "1440");
            // Remove from the end so the remaining indexes stay valid.
            for (int i = bodyElements.size() - 1; i >= 0; i--) {
                if (i != keepIndex) {
                    tableDoc.removeBodyElement(i);
                }
            }
            try (OutputStream os = new FileOutputStream(fileOutPath)) {
                tableDoc.write(os);
            }
        }
        list.add(picoutpath);

        // Convert the single-table document to an image. The separator
        // normalisation is kept at this position on purpose: it only affects the
        // paths built in later iterations, exactly as before.
        picPath = picPath.replace("\\", "/");
        word2img(fileOutPath, picPath);
    }
    return list;
}
/**
 * Converts a single-table document to an image by delegating to
 * {@code TableForWord.openOffice2Pdf}.
 *
 * <p>The PDF target name is built from the document's base name. The base name is
 * extracted independently of the platform separator (both {@code /} and
 * {@code \} are recognised) and the extension is cut at the last dot, so the
 * result is correct on Linux as well as on Windows and for extensions of any
 * length.
 *
 * @param fileOutPath path of the single-table document
 * @param picPath     directory in which the converted output is stored
 * @throws Exception if the conversion fails
 */
private static void word2img(String fileOutPath, String picPath) throws Exception {
    int lastSeparator = Math.max(fileOutPath.lastIndexOf('/'), fileOutPath.lastIndexOf('\\'));
    String baseName = fileOutPath.substring(lastSeparator + 1);
    int dot = baseName.lastIndexOf('.');
    if (dot > 0) {
        baseName = baseName.substring(0, dot);
    }
    String toFileName = picPath + File.separator + baseName + ".pdf";
    // Conversion through OpenOffice. The Microsoft Office based alternative
    // mentioned by the original author is TableUtils.wordToPDF(fileOutPath, toFileName).
    TableForWord.openOffice2Pdf(fileOutPath, toFileName);
}

2.下面是整个word的操作工具类,直接上代码

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.artofsolving.jodconverter.OfficeDocumentConverter;
import org.artofsolving.jodconverter.office.DefaultOfficeManagerConfiguration;
import org.artofsolving.jodconverter.office.OfficeManager;

import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageWriter;
import javax.imageio.stream.ImageOutputStream;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Converts office documents to PDF through OpenOffice (JODConverter) and renders
 * the resulting PDF pages into a single PNG image (PDFBox). Used to turn tables
 * extracted from Word documents into pictures.
 *
 * @author Pein
 */
public class TableForWord {

    /** Office file extensions accepted for conversion. */
    private static final String[] OFFICE_POSTFIXS = { "doc", "docx", "xls",
            "xlsx", "ppt", "pptx" };

    /**
     * Lookup set built once from {@link #OFFICE_POSTFIXS}; it is never modified
     * afterwards, so repeated conversions no longer grow it.
     */
    private static final Set<String> OFFICE_FORMATS =
            Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(OFFICE_POSTFIXS)));

    /** PDF file extension (without the dot). */
    private static final String PDF_POSTFIX = "pdf";

    /**
     * Returns the OpenOffice.org 3 installation directory for the current
     * operating system.
     *
     * @return the installation directory for Linux or Windows, or {@code null}
     *         for any other operating system
     */
    public static String getOfficeHome() {
        String osName = System.getProperty("os.name");
        if (Pattern.matches("Linux.*", osName)) {
            return "/opt/openoffice.org3";
        } else if (Pattern.matches("Windows.*", osName)) {
            return "C:/Program Files (x86)/OpenOffice.org 3";
        }
        return null;
    }

    /**
     * Converts one file with the given converter.
     *
     * @param inputFilePath  path of the file to convert
     * @param outputFilePath path of the converted file; when {@code null} or blank
     *                       the target is derived from {@code inputFilePath} via
     *                       {@link #generateDefaultOutputFilePath(String)}
     * @param converter      converter connected to a running office instance
     */
    public static void converterFile(String inputFilePath, String outputFilePath,
                              OfficeDocumentConverter converter) {
        File inputFile = new File(inputFilePath);
        String targetPath = outputFilePath;
        if (targetPath == null || targetPath.trim().isEmpty()) {
            targetPath = generateDefaultOutputFilePath(inputFilePath);
        }
        File outputFile = new File(targetPath);
        // Create the target directory when it does not exist yet. A bare file
        // name has no parent, in which case there is nothing to create.
        File parentDir = outputFile.getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            parentDir.mkdirs();
        }
        converter.convert(inputFile, outputFile);
        System.out.println("文件:" + inputFilePath + "\n转换为\n目标文件:" + outputFile
                + "\n成功!");
    }

    /**
     * Converts an office document (.doc, .docx, .xls, .xlsx, .ppt, .pptx) to PDF
     * through OpenOffice, renders the PDF into a PNG next to it and removes the
     * intermediate files.
     *
     * <p>NOTE(review): the PDF is always created beside the input file, whatever
     * {@code outputFilePath} is. This keeps the effective behaviour this method
     * has always had; the PNG name and the temp-file cleanup are both derived
     * from that location.
     *
     * @param inputFilePath  path of the source file, e.g. {@code "e:/test.docx"}
     * @param outputFilePath requested PDF path; currently has no influence on
     *                       where the PDF is written (see note above)
     * @return {@code true} when the document was converted to PDF, {@code false}
     *         when the input is missing or has an unsupported extension
     */
    public static boolean openOffice2Pdf(String inputFilePath, String outputFilePath) {
        boolean flag = false;
        long beginTime = System.currentTimeMillis();
        // Validate before touching the file system or starting OpenOffice.
        if (inputFilePath == null || !new File(inputFilePath).exists()) {
            System.out.println("con't find the resource");
        } else if (OFFICE_FORMATS.contains(getPostfix(inputFilePath))) {
            String pdfFilePath = generateDefaultOutputFilePath(inputFilePath);
            String imageFilePath = pdfFilePath.substring(0, pdfFilePath.length() - 4) + ".png";

            // Start an OpenOffice instance and connect to it.
            DefaultOfficeManagerConfiguration config = new DefaultOfficeManagerConfiguration();
            config.setOfficeHome(getOfficeHome());
            OfficeManager officeManager = config.buildOfficeManager();
            officeManager.start();
            // Measure the conversion only, not the office start-up.
            beginTime = System.currentTimeMillis();
            try {
                OfficeDocumentConverter converter = new OfficeDocumentConverter(officeManager);
                converterFile(inputFilePath, pdfFilePath, converter);
                flag = true;
                pdf2multiImage(pdfFilePath, imageFilePath, 50);
            } finally {
                // Always shut the office process down, even when conversion fails.
                officeManager.stop();
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("文件转换耗时:[" + (endTime - beginTime) + "]ms");
        return flag;
    }

    /**
     * Builds the default PDF path for an input file by replacing its extension,
     * e.g. {@code D:/test.xlsx} becomes {@code D:/test.pdf}. Only the trailing
     * extension is replaced; directory names are left untouched. A path without
     * an extension gets {@code .pdf} appended.
     *
     * @param inputFilePath path of the source file
     * @return path of the PDF file next to the source file
     */
    public static String generateDefaultOutputFilePath(String inputFilePath) {
        String postfix = getPostfix(inputFilePath);
        if (postfix == null) {
            return inputFilePath + "." + PDF_POSTFIX;
        }
        String withoutPostfix = inputFilePath.substring(0, inputFilePath.length() - postfix.length());
        return withoutPostfix + PDF_POSTFIX;
    }

    /**
     * Returns the extension of a file path, e.g. {@code "pptx"} for
     * {@code "e:/test.pptx"}.
     *
     * @param inputFilePath path of the file; may be {@code null}
     * @return the text after the last dot of the file name, or {@code null} when
     *         the path is {@code null} or the file name has no extension
     */
    public static String getPostfix(String inputFilePath) {
        if (inputFilePath == null) {
            return null;
        }
        int lastSeparator = Math.max(inputFilePath.lastIndexOf('/'), inputFilePath.lastIndexOf('\\'));
        int dot = inputFilePath.lastIndexOf('.');
        // The dot must belong to the file name and be followed by something.
        if (dot <= lastSeparator || dot == inputFilePath.length() - 1) {
            return null;
        }
        return inputFilePath.substring(dot + 1);
    }

    /**
     * Renders the first {@code maxPage} pages of a PDF into one image and then
     * removes the intermediate Word and PDF files.
     *
     * @param pdfFile path of the PDF file
     * @param outpath path (including file name) of the image to write
     * @param maxPage maximum number of pages to render; when the PDF has fewer
     *                pages, all of them are rendered
     */
    private static void pdf2multiImage(String pdfFile, String outpath, int maxPage) {
        try {
            List<BufferedImage> piclist = new ArrayList<BufferedImage>();
            InputStream is = new FileInputStream(pdfFile);
            try {
                PDDocument pdf = PDDocument.load(is, true);
                try {
                    List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
                    int pageCount = Math.min(pages.size(), maxPage);
                    for (int i = 0; i < pageCount; i++) {
                        piclist.add(pages.get(i).convertToImage());
                    }
                } finally {
                    pdf.close();
                }
            } finally {
                is.close();
            }
            yPic(piclist, outpath);
            // The PDF is closed at this point, so it can actually be deleted
            // (an open file cannot be deleted on Windows).
            deleteTempleteDoc(outpath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Deletes the intermediate .docx and .pdf files that share the image's base
     * name.
     *
     * @param outpath path of the generated image (ending in a 3-letter extension)
     */
    private static void deleteTempleteDoc(String outpath) {
        String basePath = outpath.substring(0, outpath.length() - 4);
        File wordFile = new File(basePath + ".docx");
        if (wordFile.exists()) {
            wordFile.delete();
        }
        File pdfFile = new File(basePath + ".pdf");
        if (pdfFile.exists()) {
            pdfFile.delete();
        }
    }

    /**
     * Stacks images vertically into one PNG file. All images must have the same
     * width as the first one; their heights may differ.
     *
     * <p>Failures are printed and swallowed, so the method never throws.
     *
     * @param piclist images to stack, top to bottom
     * @param outPath path of the PNG file to write (replaced if it exists)
     */
    public static void yPic(List<BufferedImage> piclist, String outPath) {
        if (piclist == null || piclist.isEmpty()) {
            System.out.println("图片数组为空!");
            return;
        }
        try {
            int width = piclist.get(0).getWidth();
            int totalHeight = 0;
            for (BufferedImage pic : piclist) {
                totalHeight += pic.getHeight();
            }

            BufferedImage imageResult = new BufferedImage(width, totalHeight, BufferedImage.TYPE_INT_RGB);
            int offsetY = 0;
            for (BufferedImage pic : piclist) {
                int picHeight = pic.getHeight();
                int[] rgb = pic.getRGB(0, 0, width, picHeight, null, 0, width);
                imageResult.setRGB(0, offsetY, width, picHeight, rgb, 0, width);
                // The next image starts right below the one just written.
                offsetY += picHeight;
            }

            File outFile = new File(outPath);
            if (outFile.exists()) {
                outFile.delete();
            }
            ImageIO.write(imageResult, "png", outFile);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
openoffice3下载地址 点击打开链接