Lucene.Net实现GroupBy的效果(2.3.1版)

本文简单介绍用Lucene.Net实现GroupBy效果的方法,思路与《Lucene.Net 按类别统计搜索结果数》一文类似。注意,这种做法对效率影响很大(每条命中结果都要读取一次存储的文档来取字段值),命中结果越多越明显。这段代码基于2.3.1版本修改而来,其它版本可能与此有差别。

改造思路仍然是扩展IndexSearcher的行为,不过这里不再直接修改类库源码,而是通过自己编写的扩展类来实现。

扩充IndexSearcher类

 

HitQueueExtension类的实现与类库中的HitQueue类完全一致,之所以另写一份,只是因为在这里无法使用类库提供的构造函数。

 

    /// <summary>
    
/// 增加新的TopDocCollector类,无法直接继承TopDocCollector
    
/// </summary>
    public class TopDocCollectorExtension : HitCollector
    {
        
private ScoreDoc reusableSD;

        
internal int totalHits;
        
internal PriorityQueue hq;

        
/// <summary>Construct to collect a given number of hits.</summary>
        
/// <param name="numHits">the maximum number of hits to collect
        
/// </param>
        public TopDocCollectorExtension(int numHits)
            : 
this(numHits, new HitQueueExtension(numHits))
        {
        }
        
/// <summary>
        
/// 注入IndexSearcherExtension对象
        
/// </summary>
        private IndexSearcherExtension searcher;
        
/// <summary>
        
/// 构造函数注入对象
        
/// </summary>
        
/// <param name="numHits"></param>
        
/// <param name="searcher"></param>
        public TopDocCollectorExtension(int numHits, IndexSearcherExtension searcher)
            : 
this(numHits)
        {
            
this.searcher = searcher;
        }

        
internal TopDocCollectorExtension(int numHits, PriorityQueue hq)
        {
            
this.hq = hq;
        }

        
/// <summary>
        
/// 临时数据,用于排重
        
/// </summary>
        private Dictionary<intint> dict = new Dictionary<intint>();
        
// javadoc inherited
        public override void Collect(int doc, float score)
        {
            
if (score > 0.0f)
            {
                
//排重算法
                if (!string.IsNullOrEmpty(searcher.FieldName))
                {
                    IndexReader reader 
= searcher.GetIndexReader();
                    Document docment 
= reader.Document(doc);
                    
string value = docment.Get(searcher.FieldName).Trim();
                    
string value1 = string.Empty;
                    
string value2 = string.Empty;
                    
int len = value.Length;
                    
int len1 = (int)Math.Ceiling(len / 2.0f);
                    
int len2 = len - len1;
                    
int hash1 = value.Substring(0, len1).GetHashCode();
                    
int hash2 = value.Substring(len1, len2).GetHashCode();
                    
if (!(dict.ContainsKey(hash1) && dict.ContainsValue(hash2)))
                        dict.Add(hash1, hash2);
                    
else
                        
return;
                }

                totalHits
++;
                
if (reusableSD == null)
                {
                    reusableSD 
= new ScoreDoc(doc, score);
                }
                
else if (score >= reusableSD.score)
                {
                    
// reusableSD holds the last "rejected" entry, so, if
                    
// this new score is not better than that, there's no
                    
// need to try inserting it
                    reusableSD.doc = doc;
                    reusableSD.score 
= score;
                }
                
else
                {
                    
return;
                }
                reusableSD 
= (ScoreDoc)hq.InsertWithOverflow(reusableSD);
            }
        }

        
/// <summary>The total number of documents that matched this query. </summary>
        public virtual int GetTotalHits()
        {
            
return totalHits;
        }

        
/// <summary>The top-scoring hits. </summary>
        public virtual TopDocs TopDocs()
        {
            ScoreDoc[] scoreDocs 
= new ScoreDoc[hq.Size()];
            
for (int i = hq.Size() - 1; i >= 0; i--)
                
// put docs in array
                scoreDocs[i] = (ScoreDoc)hq.Pop();

            
float maxScore = (totalHits == 0? System.Single.NegativeInfinity : scoreDocs[0].score;

            
return new TopDocs(totalHits, scoreDocs, maxScore);
        }
    }
OK生产者完成了,下面看看消费者怎么搞。
        static void Main(string[] args)
        {
            IndexWriter writer 
= new IndexWriter("e:\\index"new StandardAnalyzer(), true);
            Document doc 
= new Document();
            doc.Add(
new Field("field""query value!", Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
            writer.AddDocument(doc);
            writer.AddDocument(doc);
            writer.Close();

            IndexSearcherExtension searcher 
= new IndexSearcherExtension("e:\\index");
            searcher.GroupBy(
"field");
            Query q 
= new QueryParser("field"new StandardAnalyzer())
                .Parse(
"query");
            Hits docs 
= searcher.Search(q);
            
for (int i = 0; i < docs.Length(); i++)
            {
                Console.WriteLine(docs.Doc(i).Get(
"field"));
            }
            searcher.Close();

            Console.ReadKey();
        }
添加了三个相同的文档,结果只查询到一个结果,从而达到了目的。这段修改比较简单,应该还可以设计出更加高效的算法。好长时间没写博客有些生疏了~~!

http://www.cnblogs.com/birdshover/archive/2009/07/28/1533368.html

加支付宝好友偷能量挖...


评论(0)网络
阅读(107)喜欢(0)lucene.net/分词技术