xapian-core  1.4.26
bm25weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "xapian/weight.h"
24 
25 #include "debuglog.h"
26 #include "omassert.h"
27 #include "serialise-double.h"
28 
29 #include "xapian/error.h"
30 
31 #include <algorithm>
32 #include <cmath>
33 
34 using namespace std;
35 
36 namespace Xapian {
37 
38 BM25Weight *
39 BM25Weight::clone() const
40 {
41  return new BM25Weight(param_k1, param_k2, param_k3, param_b,
42  param_min_normlen);
43 }
44 
45 void
46 BM25Weight::init(double factor)
47 {
48  Xapian::doccount tf = get_termfreq();
49 
50  double tw = 0;
51  if (get_rset_size() != 0) {
52  Xapian::doccount reltermfreq = get_reltermfreq();
53 
54  // There can't be more relevant documents indexed by a term than there
55  // are documents indexed by that term.
56  AssertRel(reltermfreq,<=,tf);
57 
58  // There can't be more relevant documents indexed by a term than there
59  // are relevant documents.
60  AssertRel(reltermfreq,<=,get_rset_size());
61 
62  Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
63 
64  // There can't be more relevant documents not indexed by a term than
65  // there are documents not indexed by that term.
66  AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
67 
68  Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
69 
70  Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
71  double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
72  double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
73  tw = numerator / denom;
74  } else {
75  tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
76  }
77 
78  AssertRel(tw,>,0);
79 
80  // The "official" formula can give a negative termweight in unusual cases
81  // (without an RSet, when a term indexes more than half the documents in
82  // the database). These negative weights aren't actually helpful, and it
83  // is common for implementations to replace them with a small positive
84  // weight or similar.
85  //
86  // Truncating to zero doesn't seem a great approach in practice as it
87  // means that some terms in the query can have no effect at all on the
88  // ranking, and that some results can have zero weight, both of which
89  // are seem surprising.
90  //
91  // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
92  // more than a third of documents, which seems rather "intrusive". That's
93  // what the code currently enabled does, but perhaps it would be better to
94  // do something else. (FIXME)
95 #if 0
96  if (rare(tw <= 1.0)) {
97  termweight = 0;
98  } else {
99  termweight = log(tw) * factor;
100  if (param_k3 != 0) {
101  double wqf_double = get_wqf();
102  termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
103  }
104  }
105 #else
106  if (tw < 2) tw = tw * 0.5 + 1;
107  termweight = log(tw) * factor;
108  if (param_k3 != 0) {
109  double wqf_double = get_wqf();
110  termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
111  }
112 #endif
113  termweight *= (param_k1 + 1);
114 
115  LOGVALUE(WTCALC, termweight);
116 
117  if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
118  // If k2 is 0, and either param_b or param_k1 is 0 then the document
119  // length doesn't affect the weight.
120  len_factor = 0;
121  } else {
122  len_factor = get_average_length();
123  // len_factor can be zero if all documents are empty (or the database
124  // is empty!)
125  if (len_factor != 0) len_factor = 1 / len_factor;
126  }
127 
128  LOGVALUE(WTCALC, len_factor);
129 }
130 
131 string
133 {
134  return "Xapian::BM25Weight";
135 }
136 
137 string
138 BM25Weight::serialise() const
139 {
140  string result = serialise_double(param_k1);
141  result += serialise_double(param_k2);
142  result += serialise_double(param_k3);
143  result += serialise_double(param_b);
144  result += serialise_double(param_min_normlen);
145  return result;
146 }
147 
148 BM25Weight *
149 BM25Weight::unserialise(const string & s) const
150 {
151  const char *ptr = s.data();
152  const char *end = ptr + s.size();
153  double k1 = unserialise_double(&ptr, end);
154  double k2 = unserialise_double(&ptr, end);
155  double k3 = unserialise_double(&ptr, end);
156  double b = unserialise_double(&ptr, end);
157  double min_normlen = unserialise_double(&ptr, end);
158  if (rare(ptr != end))
159  throw Xapian::SerialisationError("Extra data in BM25Weight::unserialise()");
160  return new BM25Weight(k1, k2, k3, b, min_normlen);
161 }
162 
163 double
164 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
165  Xapian::termcount) const
166 {
167  LOGCALL(WTCALC, double, "BM25Weight::get_sumpart", wdf | len);
168  Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
169 
170  double wdf_double = wdf;
171  double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
172  AssertRel(denom,>,0);
173  RETURN(termweight * (wdf_double / denom));
174 }
175 
176 double
177 BM25Weight::get_maxpart() const
178 {
179  LOGCALL(WTCALC, double, "BM25Weight::get_maxpart", NO_ARGS);
180  double denom = param_k1;
181  Xapian::termcount wdf_max = get_wdf_upper_bound();
182  if (param_k1 != 0.0) {
183  if (param_b != 0.0) {
184  // "Upper-bound Approximations for Dynamic Pruning" Craig
185  // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
186  // Information Systems. 29(4), 2011 shows that evaluating at
187  // doclen=wdf_max is a good bound.
188  //
189  // However, we can do better if doclen_min > wdf_max since then a
190  // better bound can be found by simply evaluating at
191  // doclen=doclen_min and wdf=wdf_max.
192  Xapian::doclength normlen_lb =
193  max(max(wdf_max, get_doclength_lower_bound()) * len_factor,
194  param_min_normlen);
195  denom *= (normlen_lb * param_b + (1 - param_b));
196  }
197  }
198  denom += wdf_max;
199  AssertRel(denom,>,0);
200  RETURN(termweight * (wdf_max / denom));
201 }
202 
203 /* The BM25 formula gives:
204  *
205  * param_k2 * query_length * (1 - normlen) / (1 + normlen)
206  *
207  * To avoid negative sumextra we add the constant (param_k2 * query_length)
208  * to give:
209  *
210  * 2 * param_k2 * query_length / (1 + normlen)
211  */
212 double
213 BM25Weight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
214 {
215  LOGCALL(WTCALC, double, "BM25Weight::get_sumextra", len);
216  double num = (2.0 * param_k2 * get_query_length());
217  RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
218 }
219 
220 double
221 BM25Weight::get_maxextra() const
222 {
223  LOGCALL(WTCALC, double, "BM25Weight::get_maxextra", NO_ARGS);
224  if (param_k2 == 0.0)
225  RETURN(0.0);
226  double num = (2.0 * param_k2 * get_query_length());
227  RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
228  param_min_normlen)));
229 }
230 
231 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define RETURN(A)
Definition: debuglog.h:493
#define AssertRel(A, REL, B)
Definition: omassert.h:123
STL namespace.
#define rare(COND)
Definition: config.h:575
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
functions to serialise and unserialise a double
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
#define LOGVALUE(a, b)
Definition: debuglog.h:495
double doclength
A normalised document length.
Definition: types.h:59
Weighting scheme API.
std::string serialise_double(double v)
Serialise a double to a string.
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Various assertion macros.
Debug logging macros.
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:546
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487