xapian-core  2.0.0
bm25weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "xapian/weight.h"
24 #include "weightinternal.h"
25 
26 #include "debuglog.h"
27 #include "omassert.h"
28 #include "serialise-double.h"
29 
30 #include "xapian/error.h"
31 
32 #include <algorithm>
33 #include <cmath>
34 
35 using namespace std;
36 
37 namespace Xapian {
38 
39 BM25Weight *
40 BM25Weight::clone() const
41 {
42  return new BM25Weight(param_k1, param_k2, param_k3, param_b,
43  param_min_normlen);
44 }
45 
46 void
47 BM25Weight::init(double factor)
48 {
49  Xapian::doccount tf = get_termfreq();
50 
51  double tw = 0;
52  if (get_rset_size() != 0) {
53  Xapian::doccount reltermfreq = get_reltermfreq();
54 
55  // There can't be more relevant documents indexed by a term than there
56  // are documents indexed by that term.
57  AssertRel(reltermfreq,<=,tf);
58 
59  // There can't be more relevant documents indexed by a term than there
60  // are relevant documents.
61  AssertRel(reltermfreq,<=,get_rset_size());
62 
63  Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
64 
65  // There can't be more relevant documents not indexed by a term than
66  // there are documents not indexed by that term.
67  AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
68 
69  Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
70 
71  Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
72  double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
73  double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
74  tw = numerator / denom;
75  } else {
76  tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
77  }
78 
79  AssertRel(tw,>,0);
80 
81  // The "official" formula can give a negative termweight in unusual cases
82  // (without an RSet, when a term indexes more than half the documents in
83  // the database). These negative weights aren't actually helpful, and it
84  // is common for implementations to replace them with a small positive
85  // weight or similar.
86  //
87  // Truncating to zero doesn't seem a great approach in practice as it
88  // means that some terms in the query can have no effect at all on the
89  // ranking, and that some results can have zero weight, both of which
90  // seem surprising.
91  //
92  // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
93  // more than a third of documents, which seems rather "intrusive". That's
94  // what the code currently enabled does, but perhaps it would be better to
95  // do something else. (FIXME)
96 #if 0
97  if (rare(tw <= 1.0)) {
98  termweight = 0;
99  } else {
100  termweight = log(tw) * factor;
101  if (param_k3 != 0) {
102  double wqf_double = get_wqf();
103  termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
104  }
105  }
106 #else
107  if (tw < 2) tw = tw * 0.5 + 1;
108  termweight = log(tw) * factor;
109  if (param_k3 != 0) {
110  double wqf_double = get_wqf();
111  termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
112  }
113 #endif
114  termweight *= (param_k1 + 1);
115 
116  LOGVALUE(WTCALC, termweight);
117 
118  if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
119  // If k2 is 0, and either param_b or param_k1 is 0 then the document
120  // length doesn't affect the weight.
121  len_factor = 0;
122  } else {
123  len_factor = get_average_length();
124  // len_factor can be zero if all documents are empty (or the database
125  // is empty!)
126  if (len_factor != 0) len_factor = 1 / len_factor;
127  }
128 
129  LOGVALUE(WTCALC, len_factor);
130 }
131 
132 string
134 {
135  return "bm25";
136 }
137 
138 string
139 BM25Weight::serialise() const
140 {
141  string result = serialise_double(param_k1);
142  result += serialise_double(param_k2);
143  result += serialise_double(param_k3);
144  result += serialise_double(param_b);
145  result += serialise_double(param_min_normlen);
146  return result;
147 }
148 
149 BM25Weight *
150 BM25Weight::unserialise(const string & s) const
151 {
152  const char *ptr = s.data();
153  const char *end = ptr + s.size();
154  double k1 = unserialise_double(&ptr, end);
155  double k2 = unserialise_double(&ptr, end);
156  double k3 = unserialise_double(&ptr, end);
157  double b = unserialise_double(&ptr, end);
158  double min_normlen = unserialise_double(&ptr, end);
159  if (rare(ptr != end))
160  throw Xapian::SerialisationError("Extra data in BM25Weight::unserialise()");
161  return new BM25Weight(k1, k2, k3, b, min_normlen);
162 }
163 
164 double
165 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
167 {
168  LOGCALL(WTCALC, double, "BM25Weight::get_sumpart", wdf | len);
169  Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
170 
171  double wdf_double = wdf;
172  double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
173  AssertRel(denom,>,0);
174  RETURN(termweight * (wdf_double / denom));
175 }
176 
177 double
178 BM25Weight::get_maxpart() const
179 {
180  LOGCALL(WTCALC, double, "BM25Weight::get_maxpart", NO_ARGS);
181  double denom = param_k1;
182  Xapian::termcount wdf_max = get_wdf_upper_bound();
183  if (param_k1 != 0.0) {
184  if (param_b != 0.0) {
185  // "Upper-bound Approximations for Dynamic Pruning" Craig
186  // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
187  // Information Systems. 29(4), 2011 shows that evaluating at
188  // doclen=wdf_max is a good bound.
189  //
190  // However, we can do better if doclen_min > wdf_max since then a
191  // better bound can be found by simply evaluating at
192  // doclen=doclen_min and wdf=wdf_max.
193  Xapian::doclength normlen_lb =
194  max(max(wdf_max, get_doclength_lower_bound()) * len_factor,
195  param_min_normlen);
196  denom *= (normlen_lb * param_b + (1 - param_b));
197  }
198  }
199  denom += wdf_max;
200  AssertRel(denom,>,0);
201  RETURN(termweight * (wdf_max / denom));
202 }
203 
204 /* The BM25 formula gives:
205  *
206  * param_k2 * query_length * (1 - normlen) / (1 + normlen)
207  *
208  * To avoid negative sumextra we add the constant (param_k2 * query_length)
209  * to give:
210  *
211  * 2 * param_k2 * query_length / (1 + normlen)
212  */
213 double
214 BM25Weight::get_sumextra(Xapian::termcount len,
216  Xapian::termcount) const
217 {
218  LOGCALL(WTCALC, double, "BM25Weight::get_sumextra", len);
219  double num = (2.0 * param_k2 * get_query_length());
220  RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
221 }
222 
223 double
224 BM25Weight::get_maxextra() const
225 {
226  LOGCALL(WTCALC, double, "BM25Weight::get_maxextra", NO_ARGS);
227  if (param_k2 == 0.0)
228  RETURN(0.0);
229  double num = (2.0 * param_k2 * get_query_length());
230  RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
231  param_min_normlen)));
232 }
233 
234 [[noreturn]]
235 static inline void
236 parameter_error(const char* message, const char* params)
237 {
238  Xapian::Weight::Internal::parameter_error(message, "bm25", params);
239 }
240 
241 BM25Weight*
242 BM25Weight::create_from_parameters(const char* params) const
243 {
244  const char* p = params;
245  if (*p == '\0')
246  return new Xapian::BM25Weight();
247  double k1 = 1;
248  double k2 = 0;
249  double k3 = 1;
250  double b = 0.5;
251  double min_normlen = 0.5;
253  parameter_error("Parameter 1 (k1) is invalid", params);
255  parameter_error("Parameter 2 (k2) is invalid", params);
257  parameter_error("Parameter 3 (k3) is invalid", params);
259  parameter_error("Parameter 4 (b) is invalid", params);
260  if (*p && !Xapian::Weight::Internal::double_param(&p, &min_normlen))
261  parameter_error("Parameter 5 (min_normlen) is invalid", params);
262  if (*p)
263  parameter_error("Extra data after parameter 5", params);
264  return new Xapian::BM25Weight(k1, k2, k3, b, min_normlen);
265 }
266 
267 }
char name[9]
Definition: dbcheck.cc:57
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:1050
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool double_param(const char **p, double *ptr_val)
#define rare(COND)
Definition: config.h:607
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGVALUE(a, b)
Definition: debuglog.h:486
Hierarchy of classes which Xapian can throw as exceptions.
static void parameter_error(const char *message, const std::string &scheme, const char *params)
Definition: lmweight.cc:41
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
double doclength
A normalised document length.
Definition: types.h:58
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.