Apache POI 解析 microsoft word 图片文字都不放过

项目需要，写了个 ms word 解析器，贴出来分享！

Apache POI 组件主要用来解析 Microsoft Word、PowerPoint、Excel、Visio 文档，具体介绍看下面吧！

Overview

The following are components of the entire POI project and a brief summary of their purpose.

POIFS for OLE 2 Documents

POIFS is the oldest and most stable part of the project. It is our port of the OLE 2 Compound Document Format to pure Java. It supports both read and write functionality. All of our components ultimately rely on it by definition. Please see the POIFS project page for more information.

HSSF for Excel Documents

HSSF is our port of the Microsoft Excel 97(-2003) file format (BIFF8) to pure Java. It supports read and write capability. (Support for Excel 2007 .xlsx files is in progress). Please see the HSSF project page for more information.

HWPF for Word Documents

HWPF is our port of the Microsoft Word 97 file format to pure Java. It supports read, and limited write capabilities. Please see the HWPF project page for more information. This component is in the early stages of development. It can already read and write simple files.

Presently we are looking for a contributor to foster the HWPF development. Jump in!

HSLF for PowerPoint Documents

HSLF is our port of the Microsoft PowerPoint 97(-2003) file format to pure Java. It supports read and write capabilities. Please see the HSLF project page for more information.

HDGF for Visio Documents

HDGF is our port of the Microsoft Visio 97(-2003) file format to pure Java. It currently only supports reading at a very low level, and simple text extraction. Please see the HDGF project page for more information.

HPSF for Document Properties

HPSF is our port of the OLE 2 property set format to pure Java. Property sets are mostly used to store a document's properties (title, author, date of last modification etc.), but they can be used for application-specific purposes as well.

HPSF supports reading and writing of properties. However, you will need to be using version 3.0 of POI to utilise the write support.

Please see the HPSF project page for more information.

package org.osforce.document.extractor;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;

/**
 * Microsoft Word document extractor: pulls paragraph text, the full document
 * text, and embedded pictures out of a Word document loaded through Apache
 * POI's HWPF model.
 *
 * @author huhaozhong
 * @version 1.0 date 2008.7.27
 */
public class MSWordExtractor {

    /** The parsed Word document that every extraction method reads from. */
    private final HWPFDocument msWord;

    /**
     * Loads a Word document from the given stream.
     *
     * @param input
     *            InputStream from file system which has word document stream
     * @throws IOException
     *             if the document cannot be read from the stream
     */
    public MSWordExtractor(InputStream input) throws IOException {
        msWord = new HWPFDocument(input);
    }

    /**
     * Returns the text of every paragraph in the document, in document order.
     *
     * @return all paragraphs of text, one array element per paragraph
     */
    public String[] extractParagraphTexts() {
        Range range = msWord.getRange();
        int numParagraph = range.numParagraphs();
        String[] paragraphs = new String[numParagraph];

        for (int i = 0; i < numParagraph; i++) {
            Paragraph p = range.getParagraph(i);
            // Store into slot i; the original assigned the String to the array
            // variable itself, which does not compile.
            paragraphs[i] = p.text();
        }

        return paragraphs;
    }

    /**
     * Returns the text of the document's overall range as a single string.
     *
     * @return all text of a word document
     */
    public String extractMSWordText() {
        return msWord.getRange().text();
    }

    /**
     * Writes every picture found in the document's character runs into the
     * given directory, using the file name suggested by POI for each picture.
     *
     * @param directory
     *            local file directory that stores the images
     * @throws IOException
     *             if an image file cannot be created or written
     */
    public void extractImagesIntoDirectory(String directory) throws IOException {
        PicturesTable pTable = msWord.getPicturesTable();

        // The range does not change during extraction, so fetch it once
        // instead of on every loop iteration.
        Range range = msWord.getRange();
        int numCharacterRuns = range.numCharacterRuns();

        for (int i = 0; i < numCharacterRuns; i++) {
            CharacterRun characterRun = range.getCharacterRun(i);

            if (!pTable.hasPicture(characterRun)) {
                continue;
            }

            System.out.println("have picture!");

            Picture pic = pTable.extractPicture(characterRun, false);
            String fileName = pic.suggestFullFileName();

            OutputStream out = new FileOutputStream(new File(directory
                    + File.separator + fileName));
            try {
                pic.writeImageContent(out);
            } finally {
                // Always release the file handle, even when writing fails.
                out.close();
            }
        }
    }
}

代码比较简单，而且在代码中也做了简单的注释，详细就不介绍了！

import java.io.FileInputStream; 
import java.io.FileOutputStream;
import java.io.IOException; 
import java.io.InputStream; 

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;



public class WordDemoextends HttpServlet { 

private staticfinal long serialVersionUID = 1L;

public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
this.doPost(request, response);
} 

public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {

//从硬盘读取一个doc文档 
InputStream in = new FileInputStream("F:\\test.doc");
//类从word文档中提取文本,非特殊情况下,都将使用getParagraphText()与getText()
WordExtractor word = new WordExtractor(in);

//获取段文本 
String [] strArray = word.getParagraphText(); 
String str = word.getText(); 

for(int i=0 ; i<strArray.length ; i++){
System.out.println(strArray[i]+"\ti循环:"+i);
} 
System.out.println(str +"\t --");

//这个构造函数从InputStream中加载Word文档。 
HWPFDocument doc = new HWPFDocument((InputStream)new FileInputStream("F:\\test.doc"));

//这个类为HWPF对象模型,对文档范围段操作 
Range range = doc.getRange(); // 

//看看此文档有多少个段落 
int num = range.numParagraphs();
System.out.println(num+"段"); 

//得到word数据流 
byte [] dataStream = doc.getDataStream();
System.out.println("数据流长度:"+dataStream.length);

//用于在一段范围内获得段落数 
int numChar = range.numCharacterRuns();
System.out.println("CharacterRuns 数:"+numChar);

//负责图像提取 和 确定一些文件某块是否包含嵌入的图像。 
PicturesTable table = new PicturesTable(doc, dataStream,null); 

for(int j=0 ; j<numChar ; j++){
//这个类表示一个文本运行，有着共同的属性。 
CharacterRun run = range.getCharacterRun(j); 
//是否存在图片 
boolean bool = table.hasPicture(run);
System.out.println("是否存在图片:"+bool);
if(bool){ 
//返回图片对象绑定到指定的CharacterRun 
Picture pic = table.extractPicture(run, true);
//图片的内容字节写入到指定的输出流。 
pic.writeImageContent(new FileOutputStream("F:\\"+j+".bmp"));
System.out.println("成功提取图片"+j+":");
} 
} 
request.getRequestDispatcher("ok.jsp").forward(request, response);
} 


}

秒客网

Apache POI 解析 microsoft word 图片文字都不放过

Apache POI 解析 microsoft word 图片文字都不放过

相关文章