Word 转 HTML,PDF 转图片

时间:2021-05-01 06:42:32

maven配置:

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-examples</artifactId>
    <version>1.8.9</version>
</dependency>
<dependency>
    <groupId>org.docx4j</groupId>
    <artifactId>docx4j-ImportXHTML</artifactId>
    <version>3.2.2</version>
    <exclusions>
        <exclusion>
            <artifactId>slf4j-log4j12</artifactId>
            <groupId>org.slf4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j</artifactId>
            <groupId>log4j</groupId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.1</version>
</dependency>


Java 代码:

/**
 * Picks a converter for the given file based on its extension and runs it.
 *
 * <ul>
 * <li>{@code doc}  - converted to HTML via {@code FileConvertUtil.docCovertToHtml}</li>
 * <li>{@code docx} - converted to HTML via {@code FileConvertUtil.docxConvertToHtml}</li>
 * <li>{@code pdf}  - converted to JPG images via {@code FileConvertUtil.pdfConvertToJpg}</li>
 * </ul>
 *
 * @param inputFileName path of the file to convert
 * @return the converter's result; an empty list when the name is empty, the file
 *         does not exist or the name has no usable extension; {@code null} when
 *         the extension is not one of the supported types
 */
private List<AttachmentConvertResultBean> convertToPicture(String inputFileName) {
    List<AttachmentConvertResultBean> nothingConverted = new ArrayList<AttachmentConvertResultBean>();

    if (StringUtils.isEmpty(inputFileName)) {
        LOGGER.error("输入的文件名称为空");
        return nothingConverted;
    }
    if (!new File(inputFileName).exists()) {
        LOGGER.error("要转换的文件不存在, " + inputFileName);
        return nothingConverted;
    }

    // The dot must be neither the first nor the last character of the name.
    int dotIndex = inputFileName.lastIndexOf(".");
    boolean hasExtension = dotIndex > 0 && dotIndex < inputFileName.length() - 1;
    if (!hasExtension) {
        return nothingConverted;
    }

    String fileType = inputFileName.substring(dotIndex + 1);
    // null never matches any of the literals below, so an "empty" type falls through
    // to the unsupported branch.
    String upperType = StringUtils.isEmpty(fileType) ? null : fileType.toUpperCase();

    if ("DOC".equals(upperType)) {
        // doc -> html
        return FileConvertUtil.docCovertToHtml(inputFileName);
    }
    if ("DOCX".equals(upperType)) {
        // docx -> html
        return FileConvertUtil.docxConvertToHtml(inputFileName);
    }
    if ("PDF".equals(upperType)) {
        // pdf -> jpg images
        return FileConvertUtil.pdfConvertToJpg(inputFileName);
    }

    LOGGER.error("要转换的文件既不是 Word,也不是PDF,Excel或者其他类型的文件不支持转换 " + inputFileName);
    return null;
}

/**
 * Converts a binary Word document (.doc) to an HTML file.
 *
 * <p>The HTML is written next to the source file as {@code <baseName>.html}, where
 * {@code baseName} is the source file name with its last extension removed. Pictures
 * embedded in the document are written to a sibling directory named {@code <baseName>},
 * and the image references placed in the HTML point at that directory.
 *
 * <p>This method does not throw on conversion failure: any {@link Exception} is logged
 * and an empty list is returned.
 *
 * @param fileName path of the .doc file
 * @return a list holding one result bean that describes the generated HTML file, or an
 *         empty list when the file does not exist or the conversion failed
 */
@SuppressWarnings("rawtypes")
public static List<AttachmentConvertResultBean> docCovertToHtml(String fileName) {
    List<AttachmentConvertResultBean> resultList = new ArrayList<AttachmentConvertResultBean>();
    FileInputStream docInput = null;
    try {
        // Check existence before opening anything, so a missing file is reported as such.
        File file = new File(fileName);
        if (!file.exists()) {
            LOGGER.error("要转换的文件不存在 -》 " + fileName);
            return resultList;
        }

        // Resolve against the absolute file so a bare relative name still has a parent.
        final String parentPath = file.getAbsoluteFile().getParentFile().getAbsolutePath();

        // Strip the extension exactly once; doing it per picture would keep truncating
        // names that contain several dots and would skip it when there are no pictures.
        String shutFileName = file.getName();
        int dotIndex = shutFileName.lastIndexOf(".");
        if (dotIndex > -1) {
            shutFileName = shutFileName.substring(0, dotIndex);
        }
        final String pictureDirPath = parentPath + File.separatorChar + shutFileName;
        String htmlPath = parentPath + File.separatorChar + shutFileName + ".html";

        docInput = new FileInputStream(file);
        HWPFDocument wordDocument = new HWPFDocument(docInput);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
                    float widthInches, float heightInches) {
                // Must match the directory the pictures are written to below.
                return pictureDirPath + File.separatorChar + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        // Save the embedded pictures.
        List pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null && !pics.isEmpty()) {
            File pictureDir = new File(pictureDirPath);
            if (!pictureDir.exists()) {
                pictureDir.mkdirs();
            }
            for (int i = 0; i < pics.size(); i++) {
                Picture pic = (Picture) pics.get(i);
                FileOutputStream pictureOut = null;
                try {
                    pictureOut = new FileOutputStream(
                            pictureDirPath + File.separatorChar + pic.suggestFullFileName());
                    pic.writeImageContent(pictureOut);
                } catch (FileNotFoundException e) {
                    // A picture that cannot be created is skipped; the rest still convert.
                    LOGGER.error(e.getMessage(), e);
                } finally {
                    if (null != pictureOut) {
                        pictureOut.close();
                    }
                }
            }
        }

        // Serialize the generated DOM as UTF-8 HTML.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
        serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(out));

        // Decode with the same charset the serializer was told to use.
        writeFile(out.toString("UTF-8"), htmlPath);

        AttachmentConvertResultBean resultBean = new AttachmentConvertResultBean();
        resultBean.setTargetFileIndex(1);
        resultBean.setTargetFilePath(htmlPath);
        resultBean.setTargetFileType("html");
        resultList.add(resultBean);
    } catch (Exception e) {
        // Boundary catch: callers get an empty list instead of an exception.
        LOGGER.error(e.getMessage(), e);
    } finally {
        if (docInput != null) {
            try {
                docInput.close();
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
    }
    return resultList;
}

/**
 * Converts an OOXML Word document (.docx) to an HTML file.
 *
 * <p>The HTML is written next to the source file as {@code <baseName>.html}, where
 * {@code baseName} is the source file name with its last extension removed. The source
 * file's directory is handed to the image extractor and URI resolver as the image folder.
 *
 * <p>This method does not throw on conversion failure: any {@link Exception} is logged
 * and an empty list is returned.
 *
 * @param inputFileFullPath path of the .docx file
 * @return a list holding one result bean that describes the generated HTML file, or an
 *         empty list when the path is empty, is not an existing regular file, or the
 *         conversion failed
 */
public static List<AttachmentConvertResultBean> docxConvertToHtml(String inputFileFullPath) {
    List<AttachmentConvertResultBean> picList = new ArrayList<AttachmentConvertResultBean>();
    if (StringUtils.isEmpty(inputFileFullPath)) {
        LOGGER.error("要转换的文件不存在,文件路径 -> " + inputFileFullPath);
        return picList;
    }

    // The input must exist and be a regular file.
    File inputFile = new File(inputFileFullPath);
    if (!inputFile.exists() || !inputFile.isFile()) {
        LOGGER.error("要转换的文件不存在,文件路径 -> " + inputFileFullPath);
        return picList;
    }

    FileInputStream in = null;
    OutputStream out = null;
    try {
        // Resolve against the absolute file so a bare relative name still has a parent.
        String parentFilePath = inputFile.getAbsoluteFile().getParentFile().getAbsolutePath();

        String inputFileName = inputFile.getName();
        int dotIndex = inputFileName.lastIndexOf(".");
        if (dotIndex > 0) {
            inputFileName = inputFileName.substring(0, dotIndex);
        }
        String htmlPath = parentFilePath + File.separatorChar + inputFileName + ".html";

        in = new FileInputStream(inputFile);
        XWPFDocument document = new XWPFDocument(in);
        XHTMLOptions options = XHTMLOptions.create();

        // Images embedded in the docx are extracted relative to the parent directory.
        File imageFolder = new File(parentFilePath);
        options.setExtractor(new FileImageExtractor(imageFolder));
        options.URIResolver(new FileURIResolver(imageFolder));

        out = new FileOutputStream(new File(htmlPath));
        XHTMLConverter.getInstance().convert(document, out, options);
        out.flush();

        AttachmentConvertResultBean resultBean = new AttachmentConvertResultBean();
        resultBean.setTargetFileIndex(1);
        resultBean.setTargetFilePath(htmlPath);
        resultBean.setTargetFileType("html");
        picList.add(resultBean);
    } catch (Exception e) {
        // Boundary catch: callers get an empty list instead of an exception.
        LOGGER.error("要转换的文件不存在,文件路径 -> " + inputFileFullPath, e);
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
    }
    return picList;
}

 /**
* PDF转换jpg 图片
*
* @param inputFilePath
* @return
* @see [相关类/方法](可选)
* @since [产品/模块版本](可选)
*/
@SuppressWarnings({ "rawtypes", "finally" })
public static List<AttachmentConvertResultBean> pdfConvertToJpg(String inputFilePath) {
List<AttachmentConvertResultBean> picList = new ArrayList<AttachmentConvertResultBean>();
PDDocument doc;
try {
File pdfFile = new File(inputFilePath);
String fileName;
if (!pdfFile.exists()) {
LOGGER.error("需要转换的文件不存在 -> " + inputFilePath);
}
fileName = pdfFile.getName();
if (!StringUtils.isEmpty(fileName) && fileName.lastIndexOf(".") > 0) {
fileName = fileName.substring(0, fileName.lastIndexOf("."));
}
File parentFile = pdfFile.getParentFile();
doc = PDDocument.load(inputFilePath);
List pages = doc.getDocumentCatalog().getAllPages();
File outFile;
// 文件转换结果
AttachmentConvertResultBean resultBean;
for (int i = 0; i < pages.size(); i++) {
PDPage page = (PDPage) pages.get(i);
BufferedImage image = page.convertToImage();
Iterator iter = ImageIO.getImageWritersBySuffix("jpg");
ImageWriter writer = (ImageWriter) iter.next();
outFile = new File(parentFile.getAbsolutePath() + File.separatorChar + fileName + File.separatorChar + fileName + "_" + i + ".jpg");
resultBean = new AttachmentConvertResultBean();
resultBean.setTargetFileIndex(i + 1);
resultBean.setTargetFilePath(parentFile.getAbsolutePath() + File.separatorChar + fileName + File.separatorChar + fileName + "_" + i + ".jpg");
resultBean.setTargetFileType("jpg");
picList.add(resultBean);
if (!outFile.getParentFile().exists()) {
outFile.getParentFile().mkdirs();
}
FileOutputStream out = new FileOutputStream(outFile);
ImageOutputStream outImage = ImageIO.createImageOutputStream(out);
writer.setOutput(outImage);
writer.write(new IIOImage(image, null, null));
}
doc.close();
} catch (IOException e) {
e.printStackTrace();
LOGGER.error("文件转换失败 -> " + inputFilePath);
} finally {
return picList;
}
}
// 输出html文件    private static void writeFile(String content, String path) {        FileOutputStream fos = null;        BufferedWriter bw = null;        org.jsoup.nodes.Document doc = Jsoup.parse(content);        content = doc.html();        try {            File file = new File(path);            fos = new FileOutputStream(file);            bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"));            bw.write(content);        } catch (Exception e) {            e.printStackTrace();            LOGGER.error(e.getMessage());        } finally {            try {                if (bw != null)                    bw.close();                if (fos != null)                    fos.close();            } catch (Exception e) {                LOGGER.error(e.getMessage());            }        }    }