首先说明:Lucene 如果想对 Office 文档进行操作(例如建立索引),需要借助额外的包先把文档内容提取为文本。
这里有几种方法
一是使用POI
二是使用jacob
这里的代码使用 jacob 对 Word 文档进行处理:通过 Word 应用程序打开文档,再将其另存为文本文件。
代码:
package jacob;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;
/**
 * Converts a Word document to a text file by automating Microsoft Word
 * through the JACOB COM bridge.
 *
 * <p>The conversion starts a {@code Word.Application} COM instance, opens the
 * input document, saves it in a text format and then quits Word.
 */
public class WordReader {

    /**
     * File-format code passed as the second argument of the document's
     * {@code SaveAs} call. In Word's {@code WdSaveFormat} enumeration the
     * value 7 is {@code wdFormatUnicodeText} / {@code wdFormatEncodedText}.
     */
    private static final int SAVE_FORMAT_TEXT = 7;

    public WordReader() {
    }

    /**
     * Opens {@code inputFile} in Word and saves it as text to {@code outputFile}.
     *
     * <p>Any exception raised while talking to Word is caught and its stack
     * trace is printed; the method then reports the outcome on standard output
     * ("Transformed Successfully" or "Transform Failed"). Word is asked to quit
     * whether or not the conversion succeeded.
     *
     * @param inputFile  path of the Word document to read
     * @param outputFile path of the text file to write
     */
    public static void extractDoc(String inputFile, String outputFile) {
        boolean succeeded = false;
        // Start the Word application.
        ActiveXComponent app = new ActiveXComponent("Word.Application");
        try {
            // Keep the Word window hidden.
            app.setProperty("Visible", new Variant(false));

            // The Application object exposes the open-documents collection as
            // "Documents" (plural); asking for "Document" fails in COM, which
            // made every conversion end in "Transform Failed".
            Dispatch documents = app.getProperty("Documents").toDispatch();

            // Open(FileName, ConfirmConversions=false, ReadOnly=true).
            Dispatch document = Dispatch.invoke(
                    documents,
                    "Open",
                    Dispatch.Method,
                    new Object[] {inputFile, new Variant(false), new Variant(true)},
                    new int[1]).toDispatch();

            // Save the document as text to the output file.
            Dispatch.invoke(
                    document,
                    "SaveAs",
                    Dispatch.Method,
                    new Object[] {outputFile, new Variant(SAVE_FORMAT_TEXT)},
                    new int[1]);

            // Close the document; the argument false is passed through to Close.
            Dispatch.call(document, "Close", new Variant(false));
            succeeded = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always shut Word down so a hidden instance is not left running.
            app.invoke("Quit", new Variant[] {});
        }

        if (succeeded) {
            System.out.println("Transformed Successfully");
        } else {
            System.out.println("Transform Failed");
        }
    }

    /**
     * Demo entry point: converts a hard-coded sample document.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WordReader.extractDoc("E:\\Hadoop.doc", "E:\\jacob.txt");
    }
}