xapian-core  2.0.0
bm25plusweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2016,2024 Olly Betts
5  * Copyright (C) 2016 Vivek Pal
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 #include "weightinternal.h"
26 
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "serialise-double.h"
30 
31 #include "xapian/error.h"
32 
33 #include <algorithm>
34 #include <cmath>
35 
36 using namespace std;
37 
38 namespace Xapian {
39 
40 BM25PlusWeight *
41 BM25PlusWeight::clone() const
42 {
43  return new BM25PlusWeight(param_k1, param_k2, param_k3, param_b,
44  param_min_normlen, param_delta);
45 }
46 
47 void
48 BM25PlusWeight::init(double factor)
49 {
50  Xapian::doccount tf = get_termfreq();
51 
52  if (rare(tf == 0)) {
53  termweight = 0;
54  } else {
55  // BM25+ formula uses IDF = log((total_no_of_docs + 1) / tf)
56  termweight = log(double(get_collection_size() + 1) / tf);
57  termweight *= factor;
58  if (param_k3 != 0) {
59  double wqf_double = get_wqf();
60  termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
61  }
62  }
63 
64  LOGVALUE(WTCALC, termweight);
65 
66  if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
67  // If k2 is 0, and either param_b or param_k1 is 0 then the document
68  // length doesn't affect the weight.
69  len_factor = 0;
70  } else {
71  len_factor = get_average_length();
72  // len_factor can be zero if all documents are empty (or the database
73  // is empty!)
74  if (len_factor != 0) len_factor = 1 / len_factor;
75  }
76 
77  LOGVALUE(WTCALC, len_factor);
78 }
79 
80 string
82 {
83  return "bm25+";
84 }
85 
86 string
87 BM25PlusWeight::serialise() const
88 {
89  string result = serialise_double(param_k1);
90  result += serialise_double(param_k2);
91  result += serialise_double(param_k3);
92  result += serialise_double(param_b);
93  result += serialise_double(param_min_normlen);
94  result += serialise_double(param_delta);
95  return result;
96 }
97 
99 BM25PlusWeight::unserialise(const string & s) const
100 {
101  const char *ptr = s.data();
102  const char *end = ptr + s.size();
103  double k1 = unserialise_double(&ptr, end);
104  double k2 = unserialise_double(&ptr, end);
105  double k3 = unserialise_double(&ptr, end);
106  double b = unserialise_double(&ptr, end);
107  double min_normlen = unserialise_double(&ptr, end);
108  double delta = unserialise_double(&ptr, end);
109  if (rare(ptr != end))
110  throw Xapian::SerialisationError("Extra data in BM25PlusWeight::unserialise()");
111  return new BM25PlusWeight(k1, k2, k3, b, min_normlen, delta);
112 }
113 
114 double
115 BM25PlusWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
117 {
118  LOGCALL(WTCALC, double, "BM25PlusWeight::get_sumpart", wdf | len);
119  Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
120 
121  double wdf_double = wdf;
122  double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
123  AssertRel(denom,>,0);
124  // Parameter delta (δ) is a pseudo tf value to control the scale of the
125  // tf lower bound. δ can be tuned for e.g from 0.0 to 1.5 but BM25+ can
126  // still work effectively across collections with a fixed δ = 1.0
127  RETURN(termweight * ((param_k1 + 1) * wdf_double / denom + param_delta));
128 }
129 
130 double
131 BM25PlusWeight::get_maxpart() const
132 {
133  LOGCALL(WTCALC, double, "BM25PlusWeight::get_maxpart", NO_ARGS);
134  double denom = param_k1;
135  Xapian::termcount wdf_max = get_wdf_upper_bound();
136  if (param_k1 != 0.0) {
137  if (param_b != 0.0) {
138  // "Upper-bound Approximations for Dynamic Pruning" Craig
139  // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
140  // Information Systems. 29(4), 2011 shows that evaluating at
141  // doclen=wdf_max is a good bound.
142  //
143  // However, we can do better if doclen_min > wdf_max since then a
144  // better bound can be found by simply evaluating at
145  // doclen=doclen_min and wdf=wdf_max.
146  Xapian::doclength normlen_lb =
147  max(max(wdf_max, get_doclength_lower_bound()) * len_factor,
148  param_min_normlen);
149  denom *= (normlen_lb * param_b + (1 - param_b));
150  }
151  }
152  denom += wdf_max;
153  AssertRel(denom,>,0);
154  RETURN(termweight * ((param_k1 + 1) * wdf_max / denom + param_delta));
155 }
156 
157 /* The paper which describes BM25+ ignores BM25's document-independent
158  * component (so implicitly k2=0), but we support non-zero k2 too.
159  *
160  * The BM25 formula gives:
161  *
162  * param_k2 * query_length * (1 - normlen) / (1 + normlen)
163  *
164  * To avoid negative sumextra we add the constant (param_k2 * query_length)
165  * to give:
166  *
167  * 2 * param_k2 * query_length / (1 + normlen)
168  */
169 double
170 BM25PlusWeight::get_sumextra(Xapian::termcount len,
172  Xapian::termcount) const
173 {
174  LOGCALL(WTCALC, double, "BM25PlusWeight::get_sumextra", len);
175  double num = (2.0 * param_k2 * get_query_length());
176  RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
177 }
178 
179 double
180 BM25PlusWeight::get_maxextra() const
181 {
182  LOGCALL(WTCALC, double, "BM25PlusWeight::get_maxextra", NO_ARGS);
183  if (param_k2 == 0.0)
184  RETURN(0.0);
185  double num = (2.0 * param_k2 * get_query_length());
186  RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
187  param_min_normlen)));
188 }
189 
190 [[noreturn]]
191 static inline void
192 parameter_error(const char* message, const char* params)
193 {
194  Xapian::Weight::Internal::parameter_error(message, "bm25+", params);
195 }
196 
197 BM25PlusWeight *
198 BM25PlusWeight::create_from_parameters(const char* params) const
199 {
200  const char* p = params;
201  if (*p == '\0')
202  return new Xapian::BM25PlusWeight();
203  double k1 = 1;
204  double k2 = 0;
205  double k3 = 1;
206  double b = 0.5;
207  double min_normlen = 0.5;
208  double delta = 1.0;
210  parameter_error("Parameter 1 (k1) is invalid", params);
212  parameter_error("Parameter 2 (k2) is invalid", params);
214  parameter_error("Parameter 3 (k3) is invalid", params);
216  parameter_error("Parameter 4 (b) is invalid", params);
217  if (*p && !Xapian::Weight::Internal::double_param(&p, &min_normlen))
218  parameter_error("Parameter 5 (min_normlen) is invalid", params);
219  if (*p && !Xapian::Weight::Internal::double_param(&p, &delta))
220  parameter_error("Parameter 6 (delta) is invalid", params);
221  if (*p)
222  parameter_error("Extra data after parameter 6", params);
223  return new Xapian::BM25PlusWeight(k1, k2, k3, b, min_normlen, delta);
224 }
225 
226 }
char name[9]
Definition: dbcheck.cc:57
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:1161
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool double_param(const char **p, double *ptr_val)
#define rare(COND)
Definition: config.h:607
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGVALUE(a, b)
Definition: debuglog.h:486
Hierarchy of classes which Xapian can throw as exceptions.
static void parameter_error(const char *message, const std::string &scheme, const char *params)
Definition: lmweight.cc:41
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
double doclength
A normalised document length.
Definition: types.h:58
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.