介绍两种全文检索的技术。
1、 lucene+ 中文分词(IK)
关于lucene的原理,在这里可以得到很好的学习。
http://www.blogjava.net/zhyiwww/archive/2006/07/07/57122.html
本帖主要贴几个关于lucene的工具类。
- 索引建立
package com.lpm.fanger.search.base;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.beanutils.PropertyUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKSimilarity;

/**
 * Lucene index utility class: CRUD on one filesystem index per entity class.
 *
 * <p>Each {@link LuceneEnable} entity class gets its own index directory under
 * {@code Globle_Lucene_Path}, named after the class's simple name. The entity
 * primary key is stored (zero-padded by {@code FMT_ID}) in the keyword field
 * {@code "id"} and is used both to return search hits and to address documents
 * for delete/update.</p>
 *
 * @author Lee
 * @since 2013-8-22
 */
public class IndexUtils {

    private static final Logger LOG = Logger.getLogger(IndexUtils.class.getName());

    /** Root directory; one sub-index per entity class lives below it. */
    private final static String Globle_Lucene_Path = "D:/lucene_index";
    /** Keyword field that stores the entity primary key. */
    private final static String KeyWord_Field_Name = "id";
    private final static IKAnalyzer Globle_Analyzer = new IKAnalyzer();
    private final static String FMT_DATE = "yyyyMMddHHmmssSSS";
    /**
     * Primary keys are written as zero-padded, fixed-width 12-digit strings so
     * that keyword matching (and any range use) is stable. Every lookup by key
     * MUST use this same formatter — see {@link #keyTerm(LuceneEnable)}.
     */
    private final static NumberFormat FMT_ID = NumberFormat.getInstance();
    static {
        FMT_ID.setGroupingUsed(false);
        FMT_ID.setMaximumFractionDigits(0);
        FMT_ID.setMaximumIntegerDigits(12);
        FMT_ID.setMinimumIntegerDigits(12);
    }

    private IndexUtils() {}

    /**
     * The shared analyzer used for indexing.
     * @return the global IK analyzer instance
     */
    public final static Analyzer getAnalyzer() {
        return Globle_Analyzer;
    }

    /*********************CRUD************************/

    /**
     * Batch-add documents to the index of {@code clazz}, then optimize.
     *
     * @param clazz target entity class (selects the index directory)
     * @param objs  entities to index; null/empty is a no-op
     * @return number of documents added
     * @throws Exception on index or reflection failure
     */
    public static int add(
            Class<? extends LuceneEnable> clazz,
            List<? extends LuceneEnable> objs) throws Exception {
        if (objs == null || objs.size() == 0) {
            return 0;
        }
        IndexWriter writer = getWriter(clazz);
        try {
            int count = add(writer, objs);
            writer.optimize();
            return count;
        } finally {
            // close() also commits in Lucene 3.x
            writer.close();
        }
    }

    /**
     * Add a single document to the index of its own class.
     *
     * <p>NOTE(review): unlike the batch {@link #add(IndexWriter, List)} helper,
     * this path does not apply {@code GetBoost()} — confirm whether that
     * asymmetry is intentional.</p>
     *
     * @param doc entity to index; null is a no-op
     * @throws Exception on index or reflection failure
     */
    public static void add(LuceneEnable doc) throws Exception {
        if (doc == null) {
            return;
        }
        IndexWriter writer = getWriter(doc.getClass());
        try {
            writer.addDocument(objectToDocment(doc));
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Delete the document whose key field matches {@code doc}'s primary key.
     *
     * @param doc entity whose index entry is removed; null is a no-op
     * @throws Exception on index failure
     */
    public static void delete(LuceneEnable doc) throws Exception {
        if (doc == null) {
            return;
        }
        IndexWriter writer = getWriter(doc.getClass());
        try {
            // FIX: was String.valueOf(pk), which never matches the indexed
            // zero-padded FMT_ID value, so deletes silently removed nothing.
            writer.deleteDocuments(keyTerm(doc));
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Update = delete by primary key, then re-add the fresh document.
     *
     * @param doc entity to re-index; null is a no-op
     * @throws Exception on index or reflection failure
     */
    public static void update(LuceneEnable doc) throws Exception {
        if (doc == null) {
            return;
        }
        IndexWriter writer = getWriter(doc.getClass());
        try {
            // FIX: same key-format bug as delete() — must use FMT_ID.
            writer.deleteDocuments(keyTerm(doc));
            writer.addDocument(objectToDocment(doc));
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**********查找**********/

    /**
     * Search the index of {@code clazz} and return matching primary keys.
     *
     * @param clazz    entity class (selects the index directory)
     * @param query    Lucene query
     * @param maxCount maximum number of hits to examine
     * @return distinct primary keys of matches (null when search yields null)
     * @throws Exception on index failure
     */
    public static List<Long> find(
            Class<? extends LuceneEnable> clazz,
            Query query, int maxCount) throws Exception {
        IndexSearcher reader = getReader(clazz);
        try {
            TopDocs hits = reader.search(query, null, maxCount);
            if (hits == null) {
                return null;
            }
            List<Long> results = new ArrayList<Long>();
            int num = Math.min(hits.totalHits, maxCount);
            for (int i = 0; i < num; i++) {
                ScoreDoc scoreDoc = hits.scoreDocs[i];
                Document doc = reader.doc(scoreDoc.doc);
                Long primaryKey = NumberUtils.toLong(doc.get(KeyWord_Field_Name));
                // keep only positive, not-yet-seen keys
                if (primaryKey > 0 && !results.contains(primaryKey)) {
                    results.add(primaryKey);
                }
            }
            return results;
        } finally {
            reader.close();
        }
    }

    /**
     * Search the index of {@code clazz} and return re-hydrated entity objects.
     *
     * @param clazz    entity class (selects the index and the bean type)
     * @param query    Lucene query
     * @param maxCount maximum number of hits to examine
     * @return hydrated entities (null when search yields null)
     * @throws Exception on index or reflection failure
     */
    public static List<? extends LuceneEnable> findList(
            Class<? extends LuceneEnable> clazz,
            Query query, int maxCount) throws Exception {
        IndexSearcher reader = getReader(clazz);
        List<LuceneEnable> results = new ArrayList<LuceneEnable>();
        try {
            TopDocs hits = reader.search(query, null, maxCount);
            if (hits == null) {
                return null;
            }
            int num = Math.min(hits.totalHits, maxCount);
            for (int i = 0; i < num; i++) {
                ScoreDoc scoreDoc = hits.scoreDocs[i];
                Document doc = reader.doc(scoreDoc.doc);
                Object obj = documentToObject(clazz, doc);
                if (obj != null) {
                    results.add((LuceneEnable) obj);
                }
            }
            return results;
        } finally {
            reader.close();
        }
    }

    /**
     * Combine arbitrary clauses into one boolean query.
     *
     * @param booleanClauses clauses to combine
     * @return the combined query
     */
    public static BooleanQuery getFullTextQuery(BooleanClause... booleanClauses) {
        BooleanQuery booleanQuery = new BooleanQuery();
        for (BooleanClause booleanClause : booleanClauses) {
            booleanQuery.add(booleanClause);
        }
        return booleanQuery;
    }

    /**
     * Build a SHOULD-combined full-text query of {@code q} over each field.
     *
     * @param q      query keywords; blank yields an empty query
     * @param fields fields to search
     * @return the combined query (never null)
     */
    public static BooleanQuery getFullTextQuery(String q, String... fields) {
        Analyzer analyzer = new IKAnalyzer();
        BooleanQuery query = new BooleanQuery();
        try {
            if (q != null && !q.equals("")) {
                for (String field : fields) {
                    QueryParser parser = new QueryParser(Version.LUCENE_36, field, analyzer);
                    query.add(parser.parse(q), Occur.SHOULD);
                }
            }
        } catch (ParseException e) {
            // FIX: was e.printStackTrace(); keep the swallow-and-return
            // contract but record the failure properly.
            LOG.log(Level.WARNING, "failed to parse query: " + q, e);
        }
        return query;
    }

    /************助手方法 (helpers) **************/

    /** Key term addressing a document by its FMT_ID-formatted primary key. */
    private static Term keyTerm(LuceneEnable doc) {
        return new Term(KeyWord_Field_Name, FMT_ID.format(doc.getPrimeryKey()));
    }

    /**
     * Add each entity to {@code writer}, applying its document boost.
     *
     * @param writer open index writer
     * @param objs   entities to index
     * @return number of documents added
     */
    protected static int add(IndexWriter writer, List<? extends LuceneEnable> objs)
            throws Exception {
        if (objs == null || objs.size() == 0) {
            return 0;
        }
        int count = 0;
        for (LuceneEnable obj : objs) {
            Document doc = objectToDocment(obj);
            doc.setBoost(obj.GetBoost());
            writer.addDocument(doc);
            count++;
        }
        return count;
    }

    /**
     * Open an IndexWriter on the per-class index directory.
     *
     * @param clazz entity class; its simple name is the directory name
     * @throws IOException if the directory cannot be opened
     */
    protected static IndexWriter getWriter(Class<?> clazz) throws IOException {
        String path = Globle_Lucene_Path + File.separator + clazz.getSimpleName();
        Directory indexDir = FSDirectory.open(new File(path));
        return new IndexWriter(
                indexDir,
                Globle_Analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Open an IndexSearcher on the per-class index directory, using the IK
     * similarity implementation for scoring.
     *
     * @param clazz entity class; its simple name is the directory name
     * @throws IOException if the directory cannot be opened
     */
    protected static IndexSearcher getReader(Class<?> clazz) throws IOException {
        String path = Globle_Lucene_Path + File.separator + clazz.getSimpleName();
        Directory indexDir = FSDirectory.open(new File(path));
        IndexSearcher reader = new IndexSearcher(indexDir);
        Similarity similarity = new IKSimilarity();
        reader.setSimilarity(similarity);
        return reader;
    }

    /**
     * Hydrate an entity of {@code clazz} from a stored Document: every declared
     * field whose name has a stored value gets that value set reflectively.
     *
     * @return the populated instance
     * @throws Exception on instantiation or property-setting failure
     */
    private static Object documentToObject(Class<? extends LuceneEnable> clazz,
            Document doc) throws Exception {
        Object obj = clazz.newInstance();
        java.lang.reflect.Field[] fields = clazz.getDeclaredFields();
        for (java.lang.reflect.Field field : fields) {
            String name = field.getName();
            String value = doc.get(name);
            if (name == null || name.equals("")
                    || value == null || value.equals("")) {
                continue; // nothing stored for this field
            }
            setFieldValue(obj, name, value);
        }
        // FIX: was "return null;", which discarded the populated instance and
        // made findList() always come back empty.
        return obj;
    }

    /**
     * Convert an entity into a Lucene Document: key field + analyzed index
     * fields + stored-only fields + the two extension maps.
     *
     * @throws Exception on reflective property access failure
     */
    private static Document objectToDocment(LuceneEnable obj) throws Exception {
        Document doc = new Document();
        // keyword field holding the (padded) primary key
        doc.add(keyWord(KeyWord_Field_Name, FMT_ID.format(obj.getPrimeryKey())));
        // analyzed (searchable) fields
        String[] indexFields = obj.GetIndexFields();
        if (indexFields != null && indexFields.length > 0) {
            for (String indexField : indexFields) {
                String value = getFieldValue(obj, indexField);
                if (value != null && !value.equals("")) {
                    doc.add(index(indexField, value));
                }
            }
        }
        // stored-only (not analyzed) fields
        String[] storeFields = obj.GetStoreFields();
        if (storeFields != null && storeFields.length > 0) {
            for (String storeField : storeFields) {
                String value = getFieldValue(obj, storeField);
                if (value != null && !value.equals("")) {
                    doc.add(keyWord(storeField, value));
                }
            }
        }
        // extension values that should be analyzed
        HashMap<String, String> extendIndex = obj.GetExtendIndexValues();
        if (extendIndex != null) {
            for (String key : extendIndex.keySet()) {
                String value = extendIndex.get(key);
                doc.add(index(key, value));
            }
        }
        // extension values stored as-is
        HashMap<String, String> extend = obj.GetExtendValues();
        if (extend != null) {
            for (String key : extend.keySet()) {
                String value = extend.get(key);
                doc.add(keyWord(key, value));
            }
        }
        return doc;
    }

    /**
     * Build a stored, NOT_ANALYZED (keyword) field.
     */
    private static final Field keyWord(String name, String value) {
        return new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
    }

    /**
     * Build a stored, ANALYZED (full-text) field.
     */
    private static final Field index(String name, String value) {
        return new Field(name, value, Field.Store.YES, Field.Index.ANALYZED);
    }

    /**
     * Read a bean property as a String (dates via FMT_DATE).
     * Supports String / Date / primitive-wrapper typed properties.
     *
     * @return the string form, or null when the property itself is null
     * @throws Exception on reflective access failure
     */
    private static String getFieldValue(Object obj, String fieldName) throws Exception {
        Object fieldValue = PropertyUtils.getProperty(obj, fieldName);
        // FIX: previously a null property fell through to String.valueOf and
        // the literal text "null" got indexed/stored.
        if (fieldValue == null) {
            return null;
        }
        if (fieldValue instanceof String) {
            return (String) fieldValue;
        }
        if (fieldValue instanceof Date) {
            return DateFormatUtils.format((Date) fieldValue, FMT_DATE);
        }
        return String.valueOf(fieldValue);
    }

    /**
     * Set a bean property from its stored String form.
     * NOTE(review): BeanUtils conversion from String to non-String property
     * types is not guaranteed here ("需要调试" in the original) — verify for
     * Integer/Date fields.
     *
     * @throws Exception on reflective access failure
     */
    private static void setFieldValue(Object obj, String fieldName, String fieldValue)
            throws Exception {
        PropertyUtils.setProperty(obj, fieldName, fieldValue);
    }
}
- 查询
package com.lpm.fanger.search.base; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; /** * @Intro Lucene搜索工具类 * @author Lee * @Date 2013-8-24 */ public class LuceneSearchUtils { /** * 获取全文查询对象 */ public static BooleanQuery getFullTextQuery(BooleanClause... booleanClauses){ BooleanQuery booleanQuery = new BooleanQuery(); for (BooleanClause booleanClause : booleanClauses){ booleanQuery.add(booleanClause); } return booleanQuery; } /** * 获取全文查询对象 * @param q 查询关键字 * @param fields 查询字段 * @return 全文查询对象 */ public static BooleanQuery getFullTextQuery(String q, String... fields){ Analyzer analyzer = new IKAnalyzer(); BooleanQuery query = new BooleanQuery(); try { if (q != null && !q.equals("")){ for (String field : fields){ QueryParser parser = new QueryParser(Version.LUCENE_36, field, analyzer); query.add(parser.parse(q), Occur.SHOULD); } } } catch (ParseException e) { e.printStackTrace(); } return query; } }
- 使用
一般在项目中单独开一个端口,不断的更新索引。
/** * 构建索引 * @param objClass * @return */ private static int _BuildIndexOfObject(Class<? extends LuceneEnable> objClass) throws Exception { int ic = 0; long last_id = 0L; do { List<? extends LuceneEnable> objs = dao.listAfter(last_id,BATCH_COUNT); if(objs != null && objs.size()>0){ ic += IndexUtils.add(objClass, objs); last_id = objs.get(objs.size()-1).getPrimeryKey(); } if(objs == null || objs.size() < BATCH_COUNT) break; }while(true); return ic; }
- 测试
private final static Log log = LogFactory.getLog(RebuildLuceneIndex.class); private final static int BATCH_COUNT = 500; // static BookDao dao; // static AticleDao dao; static ExampleDao dao; static{ ApplicationContext app = new ClassPathXmlApplicationContext("spring.xml"); // dao = app.getBean("bookDao", BookDao.class); // dao = app.getBean("aticleDao", AticleDao.class); dao = app.getBean("exampleDao", ExampleDao.class); } @SuppressWarnings({ "rawtypes", "unchecked" }) public static void main(String[] args) throws Exception { String beanName = Example.class.getName();//Book.class.getName();//Aticle.class.getName();// Class beanClass = Class.forName(beanName); // Long t1 = System.currentTimeMillis(); // int ic = _BuildIndexOfObject(beanClass); // log.info(ic + " documents of " + beanName + " created."); // System.out.println("TIME:"+(System.currentTimeMillis() - t1)+"ms"); Long t2 = System.currentTimeMillis(); Query query =// LuceneSearchUtils.getFullTextQuery("神奇校车", new String[]{"bookName"});//,"outline"} IKQueryParser.parseMultiField(new String[]{"title"}, "选择");//经过测试,这个方法比较好一点 //LuceneSearchUtils.getFullTextQuery("java", new String[]{"book_name","out_line"});//IKQueryParser.parseMultiField(new String[]{"title","content"}, "c++"); List<Long> list = IndexUtils.find(beanClass, query, 100);//LuceneIndexUtils.find(beanClass, query, 100); //List<Aticle> list = (List<Aticle>) LuceneIndexUtils.find(beanClass, query, 100, false); //List<Book> list = (List<Book>) LuceneIndexUtils.find(beanClass, query, 100, false); System.out.println(list.size()); System.out.println("TIME:"+(System.currentTimeMillis() - t2)+"ms"); System.exit(0); }
- 相关的bean
package com.lpm.fanger.search.base; import java.util.HashMap; import javax.persistence.GeneratedValue; import javax.persistence.GenerationType; import javax.persistence.Id; import javax.persistence.Table; /** * @Intro descrption here * @author Lee * @Date 2013-8-24 */ @Table(name="t_article") public class Example implements LuceneEnable{ private Integer id; private String title; private String content; private String tag; /************getter and setter**************/ @Id @GeneratedValue(strategy=GenerationType.IDENTITY) public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getTag() { return tag; } public void setTag(String tag) { this.tag = tag; } /************override method**************/ @Override public Long getPrimeryKey() { return Long.valueOf(this.getId()); } @Override public String[] GetStoreFields() { return new String[]{"tag"}; } @Override public String[] GetIndexFields() { return new String[]{"title","content"}; } @Override public HashMap<String, String> GetExtendValues() { return null; } @Override public HashMap<String, String> GetExtendIndexValues() { return null; } @Override public float GetBoost() { return 0; } }
- 相关的接口(重要)
package com.lpm.fanger.search.base; import java.util.HashMap; import java.util.List; /** * @Intro 支持搜索lucene全文检索 * 功能的Bean类需要实现该接口 * @author Lee * @Date 2013-8-24 */ public interface LuceneEnable { /** * 获取搜索对象的关键字, * 便于搜索得到分析后,得到记录的主键值, * 这样就可以通过查数据库表的方式,来得 * 到记录的完整情况 * @return */ public Long getPrimeryKey(); /** * 返回搜索对象需要存储的字段名,例如createTime, author等 * @return */ public String[] GetStoreFields(); /** * 返回搜索对象的索引字段,例如title,content * @return */ public String[] GetIndexFields(); /** * 返回对象的扩展信息 * @return */ public HashMap<String, String> GetExtendValues(); /** * 返回对象的扩展索引信息 * @return */ public HashMap<String, String> GetExtendIndexValues(); /** * 返回文档的权重 * @return */ public float GetBoost(); }
- 相关的dao
package com.lpm.fanger.jdbc.dao; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import org.springframework.jdbc.core.RowMapper; import org.springframework.stereotype.Repository; import com.lpm.fanger.jdbc.mysql.BaseDaoMysqlImpl; import com.lpm.fanger.search.base.Example; /** * @Intro db interface * @author Lee * @Date 2013-8-26 */ @Repository("exampleDao") public class ExampleDao extends BaseDaoMysqlImpl<Example, Integer>{ public ExampleDao(){ super(Example.class); } public List<Example> listAfter(Long begain,Integer count){ List<Object> values = new ArrayList<Object>(); values.add(begain); values.add(count); String sql = "select * from "+getTableName()+" limit ?,?"; List<Example> list = search(sql, values,new ExampleRowMappere()); return list; } } class ExampleRowMappere implements RowMapper<Example>{ @Override public Example mapRow(ResultSet rs, int value) throws SQLException { Example ex = new Example(); ex.setContent(rs.getString("content")); ex.setTitle(rs.getString("title")); ex.setTag(rs.getString("tag")); ex.setId(rs.getInt("id")); return ex; } }
2、 mysql + sphinx
这种技术架构有很好的性能,主要的工作交给了插件 Sphinx 来完成。
相关资料:包括原理,实例以及安装,查询语句的书写等等。
http://pan.baidu.com/share/link?shareid=152940799&uk=572544164
感谢书写这些文档的前辈以及大牛们。如有侵权,请您给我留言,我会把这个链接拿掉。