lucene.net实现similarity自定义排序
最近碰到公司要求修改搜索排序,要求和这篇文章说的差不多,Lucene关于实现Similarity自定义排序,非常感谢原作者
原创文章,欢迎转载,请注明 Author:kernaling.wong#gmail.com
http://kernaling-wong.iteye.com/blog/586043
职位搜索的结果排序应该是,相关度优先,然后才是职位的发布时间倒序.即如果关键字匹配是一定要全部命中了才会排在第一位,然后再是只命中一部分关键字记录.具体如下图,(搜索"php 开发",这样的话,只有php,开发这两个关键字都全部匹配了才会排前.然后全部命中关键字的记录按职位的发布时间来递减.)
只是这篇文章是基于java的Similarity,非C#实现,而且使用的版本为3.0的,对照了一下现在公司使用的lucene.net2.4版本的similarity源代码,发现lucene.net这个版本的缺少了computeNorm方法,不知道直接修改lucene.net2.4不知道会出问题没。没办法只好下载最新版本lucene.net3.0.3的,自己研究下了。
SimilarityMy.cs源代码,修改过的
using System; using FieldInvertState = Lucene.Net.Index.FieldInvertState; namespace Lucene.Net.Search { [Serializable] public class SimilarityMy : Similarity { public override float ComputeNorm(String field, FieldInvertState state) { //return (float) (state.Boost * LengthNorm(field, state.Length)); int numTerms; if (discountOverlaps) numTerms = state.Length - state.NumOverlap; else numTerms = state.Length; return (state.Boost * LengthNorm(field, numTerms)); } /** Implemented as <code>1/sqrt(numTerms)</code>. */ public override float LengthNorm(String fieldName, int numTerms) { //return (float)(1.0 / Math.sqrt(numTerms)); return 1.0f; } /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */ public override float QueryNorm(float sumOfSquaredWeights) { // return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));\ return 1.0f; } /** Implemented as <code>sqrt(freq)</code>. */ //term freq 表示 term 在一个document的出现次数,这里设置为1.0f表示不考滤这个因素影响 // public override float Tf(float freq) { return 1.0f; } /** Implemented as <code>1 / (distance + 1)</code>. */ //这里表示匹配的 term 与 term之间的距离因素,同样也不应该受影响 public override float SloppyFreq(int distance) { return 1.0f; } /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ //这里表示匹配的docuemnt在全部document的影响因素,同理也不考滤 public override float Idf(int docFreq, int numDocs) { return 1.0f; } /** Implemented as <code>overlap / maxOverlap</code>. */ //这里表示每一个Document中所有匹配的关键字与当前关键字的匹配比例因素影响,同理也不考滤. public override float Coord(int overlap, int maxOverlap) { return 1.0f; } // Default false protected bool discountOverlaps; /** Determines whether overlap tokens (Tokens with * 0 position increment) are ignored when computing * norm. By default this is false, meaning overlap * tokens are counted just like non-overlap tokens. * * <p><b>WARNING</b>: This API is new and experimental, and may suddenly * change.</p> * * @see #computeNorm */ public void SetDiscountOverlaps(bool v) { discountOverlaps = v; } /** @see #setDiscountOverlaps */ public bool GetDiscountOverlaps() { return discountOverlaps; } } }
C#控制台测试
using System; using System.IO; using Lucene.Net.Store; using Lucene.Net.Documents; using Lucene.Net.Index; using Lucene.Net.Search; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Standard; using Lucene.Net.Analysis.Tokenattributes; namespace TestLucene.Net { class Program { static string cmd = string.Empty, keyword = string.Empty, analyzeKeywod = string.Empty; static RAMDirectory ram; static IndexSearcher searcher; static SimilarityMy sm; static StandardAnalyzer analyzer; static string[] fields = new string[] { "Name", "Info" }; static TokenStream ts; static ITermAttribute ita; static StringReader sr; private static void InitRAM() { ram = new RAMDirectory(); sm = new SimilarityMy(); analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30); IndexWriter writer = new IndexWriter(ram, analyzer, IndexWriter.MaxFieldLength.UNLIMITED); writer.SetSimilarity(sm); Document doc_0 = new Document(); doc_0.Add(new Field("Name", "java开发人员", Field.Store.YES, Field.Index.ANALYZED)); doc_0.Add(new Field("Info", "招聘网站开发人员,要求一年或以上工作经验", Field.Store.YES, Field.Index.ANALYZED)); doc_0.Add(new Field("Time", "20100210", Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.AddDocument(doc_0); Document doc_1 = new Document(); doc_1.Add(new Field("Name", "高级开发人员(java方向)", Field.Store.YES, Field.Index.ANALYZED)); doc_1.Add(new Field("Info", "需要有四年或者以上的工作经验,有大型项目实践,java基本扎实", Field.Store.YES, Field.Index.ANALYZED)); doc_1.Add(new Field("Time", "20100202", Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.AddDocument(doc_1); Document doc_2 = new Document(); doc_2.Add(new Field("Name", "php开发工程师", Field.Store.YES, Field.Index.ANALYZED)); doc_2.Add(new Field("Info", "主要是维护公司的网站php开发,能独立完成网站的功能", Field.Store.YES, Field.Index.ANALYZED)); doc_2.Add(new Field("Time", "20100203", Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.AddDocument(doc_2); Document doc_3 = new Document(); doc_3.Add(new Field("Name", "linux管理员", Field.Store.YES, Field.Index.ANALYZED)); doc_3.Add(new Field("Info", "管理及维护公司的linux服务器,职责包括完成mysql数据备份及日常管理,apache的性能调优等", Field.Store.YES, Field.Index.ANALYZED)); doc_3.Add(new Field("Time", "20100204", Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.AddDocument(doc_3); Document doc_4 = new Document(); doc_4.Add(new Field("Name", "lucene开发工作师", Field.Store.YES, Field.Index.ANALYZED)); doc_4.Add(new Field("Info", "需要两年或者以上的从事lucene java 开发工作的经验,需要对算法,排序规则等有相关经验,java水平及基础要扎实", Field.Store.YES, Field.Index.ANALYZED)); doc_4.Add(new Field("Time", "20100131", Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.AddDocument(doc_4); Document doc_5 = new Document(); doc_5.Add(new Field("Name", "php软件工程师", Field.Store.YES, Field.Index.ANALYZED)); doc_5.Add(new Field("Info", "具有大量的php开发经验,如熟悉 java 开发,数据库管理则更佳", Field.Store.YES, Field.Index.ANALYZED)); doc_5.Add(new Field("Time", "20100130", Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.AddDocument(doc_5); writer.Dispose(); Console.WriteLine("建立内存索引完毕!"); } private static void DoSearch() { if (searcher == null) { searcher = new IndexSearcher(ram); searcher.Similarity = sm; } BooleanQuery bq = new BooleanQuery(); TermQuery tq; sr = new StringReader(keyword); ts = analyzer.TokenStream(null, sr); while (ts.IncrementToken()) { ita = ts.GetAttribute<ITermAttribute>(); analyzeKeywod = ita.Term; for (int i = 0; i < fields.Length; i++) { tq = new TermQuery(new Term(fields[i], analyzeKeywod)); if (fields[i] == "Name") tq.Boost = 100.0f;//在Name这一个Field需要给大的比重 else tq.Boost = 0.0f;//其他的不需要考滤 bq.Add(tq, Occur.SHOULD); } } ts.Dispose(); sr.Close(); Console.WriteLine(bq + "\n\n"); Sort sort = new Sort(new SortField[] { new SortField(null, SortField.SCORE, false), new SortField("Time", SortField.INT, true) }); TopFieldCollector collector = TopFieldCollector.Create(sort, 10, false, true, false, false); searcher.Search(bq, collector); TopDocs tDocs = collector.TopDocs(); ScoreDoc[] sDocs = tDocs.ScoreDocs; ScoreDoc tScore; int len = sDocs.Length, docId; Document doc; string Name, Info, Time; float score; Console.WriteLine("一共有" + len.ToString() + "条记录!\n"); for (int i = 0; i < len; i++) { tScore = sDocs[i]; docId = tScore.Doc; Explanation exp = searcher.Explain(bq, docId); doc = searcher.Doc(docId); Name = doc.Get("Name"); Info = doc.Get("Info"); Time = doc.Get("Time"); score = exp.Value; Console.WriteLine("DocId:" + docId + "\tScore:" + score + "\tName:" + Name + "\tTime:" + Time + "\tInfo:" + Info + "\n"); } } static void Main(string[] args) { InitRAM(); while (true) { Console.WriteLine("请选择操作!\n1:查询\n2:退出"); cmd = Console.ReadLine(); switch (cmd) { case "1": Console.Write("请输入关键字:"); keyword = Console.ReadLine(); if (keyword.Trim() != "") DoSearch(); break; case "2": if (searcher != null) searcher.Dispose(); return; default: break; } } } } }
效果
注意:从上边的测试结果可以看到一个疑问,这些记录匹配的关键字 java开发中, 无论是命中全部关键字还是一个,得到的score都是一样的,但是排序的时候却按我们之前设置的意义去排序,理论上来说,只匹配一半的关键字,score 会是全部匹配的一半的,这里的话,不知道是否是一个bug.有待继续研究。
加支付宝好友偷能量挖...
原创文章,转载请注明出处:lucene.net实现similarity自定义排序