本文简单介绍用 Lucene.Net 实现 GroupBy 效果的方法,与《Lucene.Net 按类别统计搜索结果数》一文类似。注意,这种用法很影响效率,特别是在命中结果较多的情况下。这段代码基于 2.3.1 版本修改,其它版本可能会与此有差别。
改造方法仍然是修改IndexSearcher,这里不再修改类库,而是通过自己的代码来实现。
扩充IndexSearcher类
HitQueueExtension 的实现与 HitQueue 类完全一致,只是因为这里无法使用类库提供的构造函数,所以才单独实现一份。
/// <summary>
/// A stand-in for TopDocCollector (which, per the original author's note, cannot be
/// inherited directly) that additionally drops hits whose group-by field value has
/// already been collected, giving a "GROUP BY"-like result set.
/// </summary>
public class TopDocCollectorExtension : HitCollector
{
    private ScoreDoc reusableSD;
    internal int totalHits;
    internal PriorityQueue hq;

    /// <summary>
    /// The injected searcher that supplies the group-by field name and the index reader.
    /// It is null when the collector was built without one, in which case no
    /// de-duplication is performed.
    /// </summary>
    private IndexSearcherExtension searcher;

    /// <summary>
    /// Trimmed field values that have already produced a collected hit.
    /// Whole values are stored (rather than a pair of half-string hash codes) so that
    /// two different values can never be mistaken for one another.
    /// </summary>
    private readonly HashSet<string> seenValues = new HashSet<string>();

    /// <summary> Construct to collect a given number of hits. </summary>
    /// <param name="numHits"> the maximum number of hits to collect
    /// </param>
    public TopDocCollectorExtension(int numHits)
        : this(numHits, new HitQueueExtension(numHits))
    {
    }

    /// <summary>
    /// Construct a collector that de-duplicates hits using the given searcher.
    /// </summary>
    /// <param name="numHits">the maximum number of hits to collect</param>
    /// <param name="searcher">searcher whose FieldName and index reader drive the de-duplication</param>
    public TopDocCollectorExtension(int numHits, IndexSearcherExtension searcher)
        : this(numHits)
    {
        this.searcher = searcher;
    }

    /// <summary>
    /// Construct a collector on top of a caller-supplied priority queue.
    /// </summary>
    /// <param name="numHits">not used by this constructor; the queue decides its own capacity</param>
    /// <param name="hq">the queue that keeps the best hits</param>
    internal TopDocCollectorExtension(int numHits, PriorityQueue hq)
    {
        this.hq = hq;
    }

    /// <summary>
    /// Collects one matching document. Hits with a non-positive score are ignored, and
    /// so are hits whose group-by field value was already collected.
    /// </summary>
    /// <param name="doc">the document number</param>
    /// <param name="score">the score of the document for the current query</param>
    public override void Collect(int doc, float score)
    {
        // Written as a negated "greater than" so that a NaN score is skipped,
        // exactly as the original "if (score > 0.0f)" did.
        if (!(score > 0.0f))
        {
            return;
        }

        if (IsAlreadyCollectedGroup(doc))
        {
            return;
        }

        totalHits++;
        if (reusableSD == null)
        {
            reusableSD = new ScoreDoc(doc, score);
        }
        else if (score >= reusableSD.score)
        {
            // reusableSD holds the last "rejected" entry, so, if
            // this new score is not better than that, there's no
            // need to try inserting it
            reusableSD.doc = doc;
            reusableSD.score = score;
        }
        else
        {
            return;
        }
        reusableSD = (ScoreDoc)hq.InsertWithOverflow(reusableSD);
    }

    /// <summary>
    /// Returns true when the document's group-by field value has been seen before;
    /// otherwise records the value and returns false.
    /// </summary>
    /// <param name="doc">the document number to inspect</param>
    private bool IsAlreadyCollectedGroup(int doc)
    {
        // No searcher or no group-by field configured: behave like a plain collector.
        if (searcher == null || string.IsNullOrEmpty(searcher.FieldName))
        {
            return false;
        }

        IndexReader reader = searcher.GetIndexReader();
        Document document = reader.Document(doc);
        string value = document.Get(searcher.FieldName);
        if (value == null)
        {
            // NOTE(review): documents without the field are kept individually rather
            // than being grouped together - confirm this is the desired policy.
            return false;
        }

        // HashSet.Add returns false when the value is already present.
        return !seenValues.Add(value.Trim());
    }

    /// <summary> The total number of documents that matched this query. </summary>
    public virtual int GetTotalHits()
    {
        return totalHits;
    }

    /// <summary> The top-scoring hits. </summary>
    public virtual TopDocs TopDocs()
    {
        ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()];
        // Fill the array from the back; scoreDocs[0] is then used as the best hit.
        // Presumably Pop() yields the lowest-ranked entry first - verify against PriorityQueue.
        for (int i = scoreDocs.Length - 1; i >= 0; i--)
        {
            scoreDocs[i] = (ScoreDoc)hq.Pop();
        }

        // Guard on the array length instead of totalHits so that an empty queue
        // never leads to indexing scoreDocs[0].
        float maxScore = (scoreDocs.Length == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score;
        return new TopDocs(totalHits, scoreDocs, maxScore);
    }
}
/// <summary>
/// A stand-in for TopDocCollector (which, per the original author's note, cannot be
/// inherited directly) that additionally drops hits whose group-by field value has
/// already been collected, giving a "GROUP BY"-like result set.
/// </summary>
public class TopDocCollectorExtension : HitCollector
{
    private ScoreDoc reusableSD;
    internal int totalHits;
    internal PriorityQueue hq;

    /// <summary>
    /// The injected searcher that supplies the group-by field name and the index reader.
    /// It is null when the collector was built without one, in which case no
    /// de-duplication is performed.
    /// </summary>
    private IndexSearcherExtension searcher;

    /// <summary>
    /// Trimmed field values that have already produced a collected hit.
    /// Whole values are stored (rather than a pair of half-string hash codes) so that
    /// two different values can never be mistaken for one another.
    /// </summary>
    private readonly HashSet<string> seenValues = new HashSet<string>();

    /// <summary> Construct to collect a given number of hits. </summary>
    /// <param name="numHits"> the maximum number of hits to collect
    /// </param>
    public TopDocCollectorExtension(int numHits)
        : this(numHits, new HitQueueExtension(numHits))
    {
    }

    /// <summary>
    /// Construct a collector that de-duplicates hits using the given searcher.
    /// </summary>
    /// <param name="numHits">the maximum number of hits to collect</param>
    /// <param name="searcher">searcher whose FieldName and index reader drive the de-duplication</param>
    public TopDocCollectorExtension(int numHits, IndexSearcherExtension searcher)
        : this(numHits)
    {
        this.searcher = searcher;
    }

    /// <summary>
    /// Construct a collector on top of a caller-supplied priority queue.
    /// </summary>
    /// <param name="numHits">not used by this constructor; the queue decides its own capacity</param>
    /// <param name="hq">the queue that keeps the best hits</param>
    internal TopDocCollectorExtension(int numHits, PriorityQueue hq)
    {
        this.hq = hq;
    }

    /// <summary>
    /// Collects one matching document. Hits with a non-positive score are ignored, and
    /// so are hits whose group-by field value was already collected.
    /// </summary>
    /// <param name="doc">the document number</param>
    /// <param name="score">the score of the document for the current query</param>
    public override void Collect(int doc, float score)
    {
        // Written as a negated "greater than" so that a NaN score is skipped,
        // exactly as the original "if (score > 0.0f)" did.
        if (!(score > 0.0f))
        {
            return;
        }

        if (IsAlreadyCollectedGroup(doc))
        {
            return;
        }

        totalHits++;
        if (reusableSD == null)
        {
            reusableSD = new ScoreDoc(doc, score);
        }
        else if (score >= reusableSD.score)
        {
            // reusableSD holds the last "rejected" entry, so, if
            // this new score is not better than that, there's no
            // need to try inserting it
            reusableSD.doc = doc;
            reusableSD.score = score;
        }
        else
        {
            return;
        }
        reusableSD = (ScoreDoc)hq.InsertWithOverflow(reusableSD);
    }

    /// <summary>
    /// Returns true when the document's group-by field value has been seen before;
    /// otherwise records the value and returns false.
    /// </summary>
    /// <param name="doc">the document number to inspect</param>
    private bool IsAlreadyCollectedGroup(int doc)
    {
        // No searcher or no group-by field configured: behave like a plain collector.
        if (searcher == null || string.IsNullOrEmpty(searcher.FieldName))
        {
            return false;
        }

        IndexReader reader = searcher.GetIndexReader();
        Document document = reader.Document(doc);
        string value = document.Get(searcher.FieldName);
        if (value == null)
        {
            // NOTE(review): documents without the field are kept individually rather
            // than being grouped together - confirm this is the desired policy.
            return false;
        }

        // HashSet.Add returns false when the value is already present.
        return !seenValues.Add(value.Trim());
    }

    /// <summary> The total number of documents that matched this query. </summary>
    public virtual int GetTotalHits()
    {
        return totalHits;
    }

    /// <summary> The top-scoring hits. </summary>
    public virtual TopDocs TopDocs()
    {
        ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()];
        // Fill the array from the back; scoreDocs[0] is then used as the best hit.
        // Presumably Pop() yields the lowest-ranked entry first - verify against PriorityQueue.
        for (int i = scoreDocs.Length - 1; i >= 0; i--)
        {
            scoreDocs[i] = (ScoreDoc)hq.Pop();
        }

        // Guard on the array length instead of totalHits so that an empty queue
        // never leads to indexing scoreDocs[0].
        float maxScore = (scoreDocs.Length == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score;
        return new TopDocs(totalHits, scoreDocs, maxScore);
    }
}
OK生产者完成了,下面看看消费者怎么搞。
/// <summary>
/// Demo entry point: writes the same document to the index three times, searches it
/// with grouping enabled on the field, and prints the stored field of every hit.
/// </summary>
/// <param name="args">command-line arguments (unused)</param>
static void Main(string[] args)
{
    // NOTE(review): the padding spaces inside the string literals look like a
    // formatting artifact of the original listing; they are kept unchanged here
    // because they are used consistently - confirm before cleaning them up.
    IndexWriter writer = new IndexWriter(" e:\\index ", new StandardAnalyzer(), true);
    try
    {
        Document doc = new Document();
        doc.Add(new Field(" field ", " query value! ", Field.Store.YES, Field.Index.TOKENIZED));
        // Three identical documents, so that grouping has something to collapse.
        writer.AddDocument(doc);
        writer.AddDocument(doc);
        writer.AddDocument(doc);
    }
    finally
    {
        // Always close the writer, even if indexing fails.
        writer.Close();
    }

    IndexSearcherExtension searcher = new IndexSearcherExtension(" e:\\index ");
    try
    {
        searcher.GroupBy(" field ");
        Query q = new QueryParser(" field ", new StandardAnalyzer())
            .Parse(" query ");
        Hits docs = searcher.Search(q);
        for (int i = 0; i < docs.Length(); i++)
        {
            Console.WriteLine(docs.Doc(i).Get(" field "));
        }
    }
    finally
    {
        // Always close the searcher, even if parsing or searching fails.
        searcher.Close();
    }

    Console.ReadKey();
}
添加了三个相同的文档,结果只查询到一个结果,从而达到了目的。这段修改比较简单,应该还可以设计出更加高效的算法。好长时间没写博客有些生疏了~~!
{
    // Demo body: writes the same document to the index three times, searches it with
    // grouping enabled on the field, and prints the stored field of every hit.
    // NOTE(review): the padding spaces inside the string literals look like a
    // formatting artifact of the original listing; they are kept unchanged here
    // because they are used consistently - confirm before cleaning them up.
    IndexWriter writer = new IndexWriter(" e:\\index ", new StandardAnalyzer(), true);
    try
    {
        Document doc = new Document();
        doc.Add(new Field(" field ", " query value! ", Field.Store.YES, Field.Index.TOKENIZED));
        // Three identical documents, so that grouping has something to collapse.
        writer.AddDocument(doc);
        writer.AddDocument(doc);
        writer.AddDocument(doc);
    }
    finally
    {
        // Always close the writer, even if indexing fails.
        writer.Close();
    }

    IndexSearcherExtension searcher = new IndexSearcherExtension(" e:\\index ");
    try
    {
        searcher.GroupBy(" field ");
        Query q = new QueryParser(" field ", new StandardAnalyzer())
            .Parse(" query ");
        Hits docs = searcher.Search(q);
        for (int i = 0; i < docs.Length(); i++)
        {
            Console.WriteLine(docs.Doc(i).Get(" field "));
        }
    }
    finally
    {
        // Always close the searcher, even if parsing or searching fails.
        searcher.Close();
    }

    Console.ReadKey();
}