初识全文检索Lucene

时间:2022-04-10 03:10:15

1.全文检索存在的意义:应对信息爆炸,满足人们在浩如烟海的数据中快速查找所需内容的需求。

2.实现快速查找的大体思路:先对原始数据进行分析并生成索引,之后在索引中检索信息,从而提高查找效率;

3.全文检索工具一般都由三部分组成:索引部分、分词部分、搜索部分。其中分词部分服务于索引的建立。

4.建立索引的步骤:创建Directory→创建IndexWriter→创建Document对象→为Document添加Field→通过IndexWriter添加文档到索引中;

5.搜索的步骤:

a.创建Directory

b.创建IndexReader

c.根据IndexReader创建IndexSearcher

d.创建搜索的Query

e.根据searcher搜索并且返回TopDocs

f.根据TopDocs获取ScoreDoc对象

g.根据searcher和ScoreDoc对象获取具体的Document对象

h.根据Document对象获取需要的值

初识全文检索Lucene

package com.jetsen.learn;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Minimal Lucene 3.5 walkthrough: {@link #index()} builds an on-disk index from
 * the files of a source directory, and {@link #searcher()} queries that index.
 *
 * <p>Both methods report failures by printing the stack trace instead of
 * propagating checked exceptions, so callers need no exception handling.
 */
public class LearnLucene {

    /** Filesystem location of the index; written by index() and read by searcher(). */
    private static final String INDEX_PATH = "d:/lucene/index01";

    /** Directory whose files are added to the index. */
    private static final String SOURCE_PATH = "d://lucene/example";

    /**
     * Builds the index.
     *
     * <p>Every regular file directly inside the source directory becomes one
     * document with three fields: {@code content} (the file body, supplied as a
     * reader), {@code filename} and {@code path} (both stored, not analyzed).
     * Sub-directories and other non-regular entries are skipped. If the source
     * directory is missing or unreadable, a message is printed and nothing is
     * indexed.
     */
    public void index() {
        IndexWriterConfig iwc =
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));

        // 1. Create the Directory (on disk; a RAMDirectory would keep it in memory).
        // 2. Create the IndexWriter.
        // try-with-resources closes the writer first, then the directory, even on failure.
        try (Directory directory = FSDirectory.open(new File(INDEX_PATH));
                IndexWriter writer = new IndexWriter(directory, iwc)) {

            // listFiles() returns null (not an empty array) when the path is not a
            // readable directory; guard so we do not fail with a NullPointerException.
            File[] files = new File(SOURCE_PATH).listFiles();
            if (files == null) {
                System.err.println("Source directory is missing or unreadable: " + SOURCE_PATH);
                return;
            }

            for (File file : files) {
                // Opening a directory with FileReader throws FileNotFoundException,
                // which would abort indexing of the remaining files.
                if (!file.isFile()) {
                    continue;
                }
                // NOTE(review): FileReader decodes with the platform default charset;
                // confirm the source files match it.
                try (FileReader content = new FileReader(file)) {
                    // 3. Create the Document.
                    Document doc = new Document();
                    // 4. Add the Fields.
                    doc.add(new Field("content", content));
                    doc.add(new Field("filename", file.getName(),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.add(new Field("path", file.getAbsolutePath(),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // 5. Add the document to the index through the IndexWriter.
                    writer.addDocument(doc);
                }
            }
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException are IOException
            // subclasses, so this single handler covers them too.
            e.printStackTrace();
        }
    }

    /**
     * Searches the index.
     *
     * <p>Looks for documents whose {@code content} field matches the query
     * string {@code "java"}, takes at most the top 10 hits, and prints each hit
     * as {@code filename[path]} to standard output.
     */
    public void searcher() {
        // a. Create the Directory.
        // b. Create the IndexReader.
        // c. Create the IndexSearcher from the IndexReader.
        // All three are closed in reverse order on every exit path.
        try (Directory directory = FSDirectory.open(new File(INDEX_PATH));
                IndexReader reader = IndexReader.open(directory);
                IndexSearcher searcher = new IndexSearcher(reader)) {

            // d. Create the Query. The parser's second argument is the default
            //    field to search; the query below matches "java" in "content".
            QueryParser parser = new QueryParser(
                    Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35));
            Query query = parser.parse("java");

            // e. Run the search and get the TopDocs (at most 10 hits).
            TopDocs tds = searcher.search(query, 10);

            // f. Get the ScoreDoc entries from the TopDocs.
            for (ScoreDoc sd : tds.scoreDocs) {
                // g. Resolve the ScoreDoc to the actual Document.
                Document d = searcher.doc(sd.doc);
                // h. Read the stored values from the Document.
                System.out.println(d.get("filename") + "[" + d.get("path") + "]");
            }
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
    }
}

package com.jetsen.learn;

import org.junit.Test;

/**
 * JUnit driver that exercises the two entry points of {@link LearnLucene}.
 * Neither test asserts anything; each simply invokes the method.
 */
public class LuceneTest {

    /** Invokes {@link LearnLucene#index()} to build the index. */
    @Test
    public void testIndex() {
        new LearnLucene().index();
    }

    /** Invokes {@link LearnLucene#searcher()} to run the sample search. */
    @Test
    public void testSearch() {
        new LearnLucene().searcher();
    }
}


注:代码参照网上视频