Java实现Word/Pdf/TXT转html

引言:

最近公司在做一个教育培训学习及在线考试的项目，本人主要负责网络课程模块，包括课程分类、课程、课件的创建，以及在线学习和统计功能。课件涉及多种类型，如视频、音频、图文、外部链接及文档等，其中就涉及到一个问题：文档型课件如何在网页上展示和学习。因为需要在线统计学习的课程、学习的人员和学习的时长，所以不能像传统做法那样将文档下载到本地学习，否则就不受系统控制了。最终的方案是：在上传文档型课件时，将文件转换成对应的HTML文件，以便在网页上浏览学习。

下面主要针对Word、PDF和TXT文本文件进行转换。

一:Java实现将word转换为html

1:引入依赖

 <dependency>

   <groupId>fr.opensagres.xdocreport</groupId>

   <artifactId>fr.opensagres.xdocreport.document</artifactId>

   <version>1.0.5</version>

 </dependency>

 <dependency>

   <groupId>fr.opensagres.xdocreport</groupId>

   <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>

   <version>1.0.5</version>

 </dependency>

   <dependency>

   <groupId>org.apache.poi</groupId>

   <artifactId>poi</artifactId>

   <version>3.12</version>

 </dependency>

 <dependency>

   <groupId>org.apache.poi</groupId>

   <artifactId>poi-scratchpad</artifactId>

   <version>3.12</version>

 </dependency>

2:代码demo

 package com.svse.controller;

 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Locale;

 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;

 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.converter.PicturesManager;
 import org.apache.poi.hwpf.converter.WordToHtmlConverter;
 import org.apache.poi.hwpf.usermodel.PictureType;
 import org.apache.poi.xwpf.converter.core.BasicURIResolver;
 import org.apache.poi.xwpf.converter.core.FileImageExtractor;
 import org.apache.poi.xwpf.converter.core.FileURIResolver;
 import org.apache.poi.xwpf.converter.core.IURIResolver;
 import org.apache.poi.xwpf.converter.core.IXWPFConverter;
 import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
 import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.w3c.dom.Document;

 /**

  * word 转换成html

  */

 public class TestWordToHtml {

     public static  final String STORAGEPATH="C://works//files//";

     public static  final String IP="192.168.30.222";

     public static  final String PORT="8010";

     public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {

         TestWordToHtml wt=new TestWordToHtml();

         //wt.Word2003ToHtml("甲骨文考证.doc");

         wt.Word2007ToHtml("甲骨文考证.docx");

     }

      /**

      * 2003版本word转换成html

      * @throws IOException

      * @throws TransformerException

      * @throws ParserConfigurationException

      */

     public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {

         final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片  图片会保存在此路径

         final String strRanString=getRandomNum();

         String filepath =STORAGEPATH;

         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";

         final String file = filepath + fileName;

         InputStream input = new FileInputStream(new File(file));

         HWPFDocument wordDocument = new HWPFDocument(input);

         WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

         //设置图片存放的位置

         wordToHtmlConverter.setPicturesManager(new PicturesManager() {

             public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {

                 File imgPath = new File(imagepath);

                 if(!imgPath.exists()){//图片目录不存在则创建

                     imgPath.mkdirs();

                 }

                 File file = new File(imagepath +strRanString+suggestedName);

                 try {

                     OutputStream os = new FileOutputStream(file);

                     os.write(content);

                     os.close();

                 } catch (FileNotFoundException e) {

                     e.printStackTrace();

                 } catch (IOException e) {

                     e.printStackTrace();

                 }

                 return  "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;

                // return imagepath +strRanString+suggestedName;

             }

         });

         //解析word文档

         wordToHtmlConverter.processDocument(wordDocument);

         Document htmlDocument = wordToHtmlConverter.getDocument();

         File htmlFile = new File(filepath +strRanString+htmlName);

         OutputStream outStream = new FileOutputStream(htmlFile);

         DOMSource domSource = new DOMSource(htmlDocument);

         StreamResult streamResult = new StreamResult(outStream);

         TransformerFactory factory = TransformerFactory.newInstance();

         Transformer serializer = factory.newTransformer();

         serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");

         serializer.setOutputProperty(OutputKeys.INDENT, "yes");

         serializer.setOutputProperty(OutputKeys.METHOD, "html");

         serializer.transform(domSource, streamResult);

         outStream.close();

         System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);

     }

     /**

      * 2007版本word转换成html

      * @throws IOException

      */

     public void Word2007ToHtml(String fileName) throws IOException {

        final String strRanString=getRandomNum();

         String filepath = STORAGEPATH+strRanString;

         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";

         File f = new File(STORAGEPATH+fileName);

         if (!f.exists()) {

             System.out.println("Sorry File does not Exists!");

         } else {

             if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

                 try {

                     // 1) 加载word文档生成 XWPFDocument对象

                     InputStream in = new FileInputStream(f);

                     XWPFDocument document = new XWPFDocument(in);  

                     // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)

                     File imageFolderFile = new File(filepath);

                     XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));

                     options.setExtractor(new FileImageExtractor(imageFolderFile));

                     options.URIResolver(new IURIResolver() {

                         public String resolve(String uri) {

                             //http://192.168.30.222:8010//uploadFile/....

                             return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;

                         }

                     });

                     options.setIgnoreStylesIfUnused(false);

                     options.setFragment(true);  

                     // 3) 将 XWPFDocument转换成XHTML

                     OutputStream out = new FileOutputStream(new File(filepath + htmlName));

                     IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();

                     converter.convert(document,out, options);

                     //XHTMLConverter.getInstance().convert(document, out, options);

                     System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);

                 } catch (Exception e) {

                     e.printStackTrace();

                 }

             } else {

                 System.out.println("Enter only MS Office 2007+ files");

             }

         }

     }  

      /**

      *功能说明:生成时间戳

      *创建人:zsq

      *创建时间:2019年12月7日 下午2:37:09

      *

      */

      public static String getRandomNum(){

          Date dt = new Date();

          SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

          String str=sdf.format(dt);

          return str;

      }

    }

二:Java实现将Pdf转换为html

1: 引入依赖

 <dependency>

             <groupId>net.sf.cssbox</groupId>

             <artifactId>pdf2dom</artifactId>

             <version>1.7</version>

         </dependency>

         <dependency>

             <groupId>org.apache.pdfbox</groupId>

             <artifactId>pdfbox</artifactId>

             <version>2.0.12</version>

         </dependency>

         <dependency>

             <groupId>org.apache.pdfbox</groupId>

             <artifactId>pdfbox-tools</artifactId>

             <version>2.0.12</version>

  </dependency>

2:代码Demo

 /**
  * Demo that converts a PDF file into an HTML file using PDFBox to load the
  * document and pdf2dom's {@code PDFDomTree} to render it.
  */
 public class PdfToHtml {

     /**
      * Converts one PDF file to a UTF-8 encoded HTML file.
      *
      * <p>Failures are printed to standard error and not rethrown.
      *
      * @param inPdfPath path of the PDF file to read
      * @param outputHtmlPath path of the HTML file to create
      */
     public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath)  {
         // try-with-resources closes both the document and the writer. Closing the
         // writer also flushes its buffer, so the HTML file is written completely.
         // The PDF is loaded first so that a missing input leaves no empty output file.
         try (PDDocument document = PDDocument.load(new File(inPdfPath));
              BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                      new FileOutputStream(new File(outputHtmlPath)),"utf-8"))) {
             PDFDomTree pdfDomTree = new PDFDomTree();
             pdfDomTree.writeText(document,out);
         } catch (Exception e) {
             e.printStackTrace();
         }
     }

     /**
      * Converts a sample PDF using hard-coded input and output paths.
      *
      * @param args unused
      */
     public static void main(String[] args) throws IOException {
         PdfToHtml ph=new PdfToHtml();
         String pdfPath="C:\\works\\files\\武研中心行政考勤制度.pdf";
         String outputPath="C:\\works\\files\\武研中心行政考勤制度.html";
         ph.pdfToHtmlTest(pdfPath,outputPath);
     }

 }

三:Java实现将TXT转换为html

  /**
      * Converts a plain-text file into an HTML fragment.
      *
      * <p>Both the input and the output are treated as GBK encoded. Every input line is
      * HTML-escaped, prefixed with three non-breaking spaces and terminated with a
      * line-break tag. If {@code filePath} is not an existing regular file a message is
      * printed and nothing is written. Errors are printed and not rethrown.
      *
      * @param filePath path of the source txt file
      * @param htmlPosition path of the HTML file to create
      */
     public static void txtToHtml(String filePath, String htmlPosition) {
         try {
             File file = new File(filePath);
             // isFile() is false for a missing path, so it also covers the exists() check.
             if (!file.isFile()) {
                 System.out.println("找不到指定的文件");
                 return;
             }
             // try-with-resources closes reader and writer even when a read or write fails.
             try (BufferedReader bufferedReader = new BufferedReader(
                          new InputStreamReader(new FileInputStream(file), "GBK"));
                  BufferedWriter bw = new BufferedWriter(
                          new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)), "GBK"))) {
                 String lineTxt;
                 while ((lineTxt = bufferedReader.readLine()) != null) {
                     // Escape the text so characters such as '<' or '&' in the file are
                     // shown literally instead of being interpreted as markup.
                     bw.write("&nbsp;&nbsp;&nbsp;" + escapeHtml(lineTxt) + "<br/>");
                 }
             }
         } catch (Exception e) {
             System.out.println("读取文件内容出错");
             e.printStackTrace();
         }
     }

     /** Replaces the characters that are significant in HTML with their entities. */
     private static String escapeHtml(String text) {
         StringBuilder escaped = new StringBuilder(text.length() + 16);
         for (int i = 0; i < text.length(); i++) {
             char c = text.charAt(i);
             switch (c) {
                 case '&':
                     escaped.append("&amp;");
                     break;
                 case '<':
                     escaped.append("&lt;");
                     break;
                 case '>':
                     escaped.append("&gt;");
                     break;
                 case '"':
                     escaped.append("&quot;");
                     break;
                 default:
                     escaped.append(c);
             }
         }
         return escaped.toString();
     }

秒客网

Java实现Word/Pdf/TXT转html

相关文章