lucene.net实现similarity自定义排序

  最近碰到公司要求修改搜索排序,要求和这篇文章说的差不多,Lucene关于实现Similarity自定义排序,非常感谢原作者
 

原创文章,欢迎转载,请注明 Author:kernaling.wong#gmail.com
http://kernaling-wong.iteye.com/blog/586043

  职位搜索的结果排序应该是:相关度优先,其次才是职位的发布时间倒序。也就是说,全部关键字都命中的记录排在最前面,然后才是只命中一部分关键字的记录。具体如下图(搜索"php 开发"时,只有php、开发这两个关键字全部匹配的记录才会排在前面,而全部命中关键字的记录之间再按职位的发布时间递减排列)。
lucene.net实现similarity自定义排序


  只是这篇文章是基于java的Similarity,并非C#实现,而且使用的版本是3.0。对照了一下现在公司使用的lucene.net2.4版本的similarity源代码,发现这个版本缺少computeNorm方法,也不知道直接修改lucene.net2.4会不会出问题。没办法,只好下载最新版本lucene.net3.0.3,自己研究了一下。

  SimilarityMy.cs源代码,修改过的

using System;
using FieldInvertState = Lucene.Net.Index.FieldInvertState;
namespace Lucene.Net.Search
{
    /// <summary>
    /// Similarity implementation that neutralizes every standard scoring factor
    /// (length norm, query norm, term frequency, sloppy frequency, inverse document
    /// frequency and coordination) by returning a constant 1.0 for each of them.
    /// With these factors flattened, the only inputs left in this class that vary
    /// are the boost and the (optionally overlap-discounted) term count fed to
    /// <see cref="LengthNorm"/> inside <see cref="ComputeNorm"/>.
    /// </summary>
    [Serializable]
    public class SimilarityMy : Similarity
    {
        // Value returned by every factor that this class deliberately ignores.
        private const float NeutralFactor = 1.0f;

        // When true, overlap tokens are subtracted from the term count used for
        // the norm. Never assigned here except through SetDiscountOverlaps, so it
        // starts out false.
        protected bool discountOverlaps;

        /// <summary>
        /// Computes the field norm as the state's boost multiplied by
        /// <see cref="LengthNorm"/> of the counted terms.
        /// </summary>
        /// <param name="field">Field name, forwarded to <see cref="LengthNorm"/>.</param>
        /// <param name="state">Inversion state supplying Length, NumOverlap and Boost.</param>
        /// <returns>The boost scaled by the length norm.</returns>
        public override float ComputeNorm(string field, FieldInvertState state)
        {
            // Overlap tokens are excluded from the count only when discounting is on.
            int countedTerms = discountOverlaps
                ? state.Length - state.NumOverlap
                : state.Length;
            return state.Boost * LengthNorm(field, countedTerms);
        }

        /// <summary>
        /// Always 1.0: the number of terms in a field does not influence the score
        /// (the usual <c>1/sqrt(numTerms)</c> formula is intentionally not applied).
        /// </summary>
        public override float LengthNorm(string fieldName, int numTerms)
        {
            return NeutralFactor;
        }

        /// <summary>
        /// Always 1.0: no query normalization is applied
        /// (the usual <c>1/sqrt(sumOfSquaredWeights)</c> formula is intentionally not applied).
        /// </summary>
        public override float QueryNorm(float sumOfSquaredWeights)
        {
            return NeutralFactor;
        }

        /// <summary>
        /// Always 1.0: how many times a term occurs in a document is not taken
        /// into account.
        /// </summary>
        public override float Tf(float freq)
        {
            return NeutralFactor;
        }

        /// <summary>
        /// Always 1.0: the distance between matched terms is not taken into account.
        /// </summary>
        public override float SloppyFreq(int distance)
        {
            return NeutralFactor;
        }

        /// <summary>
        /// Always 1.0: how many documents contain the term, relative to the total
        /// number of documents, is not taken into account.
        /// </summary>
        public override float Idf(int docFreq, int numDocs)
        {
            return NeutralFactor;
        }

        /// <summary>
        /// Always 1.0: the ratio of matched query terms to all query terms is not
        /// taken into account.
        /// </summary>
        public override float Coord(int overlap, int maxOverlap)
        {
            return NeutralFactor;
        }

        /// <summary>
        /// Chooses whether overlap tokens are subtracted from the term count in
        /// <see cref="ComputeNorm"/>.
        /// </summary>
        /// <param name="v">True to discount overlap tokens; false to count them.</param>
        public void SetDiscountOverlaps(bool v)
        {
            discountOverlaps = v;
        }

        /// <summary>
        /// Returns the current overlap-discount setting.
        /// </summary>
        public bool GetDiscountOverlaps()
        {
            return discountOverlaps;
        }
    }
}

  C#控制台测试

using System;
using System.IO;
using Lucene.Net.Store;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;
namespace TestLucene.Net
{
    /// <summary>
    /// Console demo: builds a small in-memory job index, then repeatedly asks for a
    /// keyword and prints the hits ordered by score first and by the "Time" field
    /// (descending) second.
    /// </summary>
    class Program
    {
        // Fields that every analyzed keyword token is searched in.
        static readonly string[] fields = new string[] { "Name", "Info" };
        static RAMDirectory ram;
        static IndexSearcher searcher;
        static SimilarityMy sm;
        static StandardAnalyzer analyzer;

        /// <summary>
        /// Creates the RAM directory, analyzer and similarity, and indexes the six
        /// sample job postings.
        /// </summary>
        private static void InitRAM()
        {
            ram = new RAMDirectory();
            sm = new SimilarityMy();
            analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            // "using" guarantees the writer is released even if adding a document throws.
            using (IndexWriter writer = new IndexWriter(ram, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                writer.SetSimilarity(sm);
                AddJob(writer, "java开发人员", "招聘网站开发人员,要求一年或以上工作经验", "20100210");
                AddJob(writer, "高级开发人员(java方向)", "需要有四年或者以上的工作经验,有大型项目实践,java基本扎实", "20100202");
                AddJob(writer, "php开发工程师", "主要是维护公司的网站php开发,能独立完成网站的功能", "20100203");
                AddJob(writer, "linux管理员", "管理及维护公司的linux服务器,职责包括完成mysql数据备份及日常管理,apache的性能调优等", "20100204");
                AddJob(writer, "lucene开发工作师", "需要两年或者以上的从事lucene java 开发工作的经验,需要对算法,排序规则等有相关经验,java水平及基础要扎实", "20100131");
                AddJob(writer, "php软件工程师", "具有大量的php开发经验,如熟悉 java 开发,数据库管理则更佳", "20100130");
            }
            Console.WriteLine("建立内存索引完毕!");
        }

        /// <summary>
        /// Adds one job posting: "Name" and "Info" are analyzed, "Time" is stored
        /// as a single un-analyzed term so it can be used as a sort key.
        /// </summary>
        private static void AddJob(IndexWriter writer, string name, string info, string time)
        {
            Document doc = new Document();
            doc.Add(new Field("Name", name, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Info", info, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Time", time, Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.AddDocument(doc);
        }

        /// <summary>
        /// Analyzes the keyword and builds a BooleanQuery with one optional
        /// (SHOULD) TermQuery per token and per searched field.
        /// </summary>
        private static BooleanQuery BuildQuery(string keyword)
        {
            BooleanQuery query = new BooleanQuery();
            using (StringReader reader = new StringReader(keyword))
            using (TokenStream stream = analyzer.TokenStream(null, reader))
            {
                // The attribute instance is shared across tokens, so fetch it once.
                ITermAttribute termAttr = stream.AddAttribute<ITermAttribute>();
                while (stream.IncrementToken())
                {
                    string term = termAttr.Term;
                    foreach (string field in fields)
                    {
                        TermQuery tq = new TermQuery(new Term(field, term));
                        // Only matches in "Name" carry weight; other fields get a zero boost.
                        tq.Boost = field == "Name" ? 100.0f : 0.0f;
                        query.Add(tq, Occur.SHOULD);
                    }
                }
            }
            return query;
        }

        /// <summary>
        /// Runs the search for the given keyword and prints up to 10 hits.
        /// </summary>
        /// <param name="keyword">Raw, non-empty keyword text typed by the user.</param>
        private static void DoSearch(string keyword)
        {
            if (searcher == null)
            {
                searcher = new IndexSearcher(ram);
                searcher.Similarity = sm;
            }

            BooleanQuery bq = BuildQuery(keyword);
            Console.WriteLine(bq + "\n\n");

            // Primary key: score; secondary key: "Time" parsed as int, reversed (newest first).
            Sort sort = new Sort(new SortField[] { new SortField(null, SortField.SCORE, false), new SortField("Time", SortField.INT, true) });
            TopFieldCollector collector = TopFieldCollector.Create(sort, 10, false, true, false, false);
            searcher.Search(bq, collector);
            ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

            Console.WriteLine("一共有" + hits.Length.ToString() + "条记录!\n");
            foreach (ScoreDoc hit in hits)
            {
                int docId = hit.Doc;
                // The printed score comes from Explain, as in the original listing.
                // NOTE(review): hit.Score should also be populated by the collector; confirm before switching.
                float score = searcher.Explain(bq, docId).Value;
                Document doc = searcher.Doc(docId);
                Console.WriteLine("DocId:" + docId + "\tScore:" + score + "\tName:" + doc.Get("Name") + "\tTime:" + doc.Get("Time") + "\tInfo:" + doc.Get("Info") + "\n");
            }
        }

        /// <summary>
        /// Entry point: builds the index, then loops on the menu until the user
        /// chooses to quit or standard input ends.
        /// </summary>
        static void Main(string[] args)
        {
            try
            {
                InitRAM();
                while (true)
                {
                    Console.WriteLine("请选择操作!\n1:查询\n2:退出");
                    string cmd = Console.ReadLine();
                    // ReadLine returns null once input is exhausted (closed or
                    // redirected stdin); without this check the menu would loop forever.
                    if (cmd == null)
                    {
                        return;
                    }

                    cmd = cmd.Trim();
                    if (cmd == "2")
                    {
                        return;
                    }

                    if (cmd == "1")
                    {
                        Console.Write("请输入关键字:");
                        string keyword = Console.ReadLine();
                        if (keyword == null)
                        {
                            return;
                        }

                        if (keyword.Trim() != "")
                        {
                            DoSearch(keyword);
                        }
                    }
                }
            }
            finally
            {
                // Release resources on every exit path, searcher first.
                if (searcher != null)
                {
                    searcher.Dispose();
                }

                if (analyzer != null)
                {
                    analyzer.Dispose();
                }

                if (ram != null)
                {
                    ram.Dispose();
                }
            }
        }
    }
}

  效果

lucene.net实现similarity自定义排序

 

  注意:从上边的测试结果可以看到一个疑问:搜索关键字"java开发"时,无论记录是命中全部关键字还是只命中一个,得到的score都是一样的,但排序结果却符合我们之前设定的规则。理论上来说,只匹配一半关键字的记录,score应该是全部匹配时的一半。这是否是一个bug,还有待继续研究。

加支付宝好友偷能量挖...


原创文章,转载请注明出处:lucene.net实现similarity自定义排序

评论(0)Web开发网
阅读(170)喜欢(0)lucene.net/分词技术