xapian-core  2.0.0
lmweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2012 Gaurav Arora
5  * Copyright (C) 2016,2019,2024,2025 Olly Betts
6  * Copyright (C) 2016 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 #include "weightinternal.h"
27 
28 #include "debuglog.h"
29 #include "omassert.h"
30 #include "serialise-double.h"
31 #include "stringutils.h"
32 
33 #include "xapian/error.h"
34 
35 #include <cmath>
36 
37 using namespace std;
38 
// File-local convenience wrapper: report an invalid user-specified parameter
// for a weighting scheme.  Forwards to the shared implementation in
// Weight::Internal and never returns (hence [[noreturn]]).
[[noreturn]]
static inline void
parameter_error(const char* message, const std::string& scheme,
		const char* params)
{
    Xapian::Weight::Internal::parameter_error(message, scheme, params);
}
46 
47 /* The equations for Jelinek-Mercer, Dirichlet, and Absolute Discount smoothing
48  * are taken from:
49  *
50  * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language
51  * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214.
52  *
53  * The equations for Two-Stage smoothing are also from this paper, but aren't
54  * given explicitly there, so we have derived them using the approach
55  * described in the paper.
56  *
57  * Dir+ comes from:
58  *
59  * Lv, Y., & Zhai, C. (2011). Lower-bounding term frequency normalization.
60  * International Conference on Information and Knowledge Management.
61  */
62 
63 namespace Xapian {
64 
65 void
66 LMJMWeight::init(double factor_)
67 {
68  factor = factor_ * get_wqf();
69 
70  auto collection_freq = get_collection_freq();
71  if (rare(collection_freq == 0)) {
72  // Avoid dividing by zero in the corner case where the term has zero
73  // collection frequency.
74  factor = 0;
75  multiplier = 0;
76  return;
77  }
78 
79  double lambda = param_lambda;
80  if (lambda <= 0.0 || lambda >= 1.0) {
81  auto query_len = get_query_length();
82  if (query_len <= 2) {
83  lambda = 0.1;
84  } else if (query_len < 8) {
85  lambda = (query_len - 1) * 0.1;
86  } else {
87  lambda = 0.7;
88  }
89  }
90 
91  // Pre-calculate multiplier.
92  Xapian::totallength total_length = get_total_length();
93  multiplier = (1.0 - lambda) * total_length / (lambda * collection_freq);
94 }
95 
96 double
97 LMJMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
99 {
100  double w = multiplier * wdf / len;
101  return factor * log(1.0 + w);
102 }
103 
// Upper bound on the value get_sumpart() can return for any matching
// document.
double
LMJMWeight::get_maxpart() const
{
    Xapian::termcount wdf_max = get_wdf_upper_bound();
    Xapian::termcount len_min = get_doclength_lower_bound();
    double w = multiplier;
    if (wdf_max < len_min) {
	// Clearly wdf / len <= wdf_max / len_min, but we also know that
	// wdf <= len so wdf / len <= 1 so we use whichever bound is tighter.
	w *= double(wdf_max) / len_min;
    }
    return factor * log(1.0 + w);
}
117 
118 LMJMWeight*
119 LMJMWeight::clone() const {
120  return new LMJMWeight(param_lambda);
121 }
122 
123 string
125 {
126  return "lmjm";
127 }
128 
// Serialise the scheme's single parameter (lambda) so unserialise() can
// reconstruct an equivalent object.
string
LMJMWeight::serialise() const
{
    return serialise_double(param_lambda);
}
134 
135 LMJMWeight*
136 LMJMWeight::unserialise(const string& s) const
137 {
138  const char *ptr = s.data();
139  const char *end = ptr + s.size();
140  double lambda = unserialise_double(&ptr, end);
141  if (rare(ptr != end))
142  throw Xapian::SerialisationError("Extra data in "
143  "LMJMWeight::unserialise()");
144  return new LMJMWeight(lambda);
145 }
146 
147 
148 LMJMWeight*
149 LMJMWeight::create_from_parameters(const char* params) const
150 {
151  const char* p = params;
152  double lambda = 0.0;
153  if (*p && !Xapian::Weight::Internal::double_param(&p, &lambda))
154  parameter_error("Parameter lambda is invalid", "lmjm", params);
155  if (*p)
156  parameter_error("Extra data after parameter", "lmjm", params);
157  return new Xapian::LMJMWeight(lambda);
158 }
159 
// Precompute per-term constants for Dirichlet (or Dir+ when delta != 0)
// smoothing.
void
LMDirichletWeight::init(double factor_)
{
    factor = factor_ * get_wqf();

    double mu = param_mu;

    // extra_offset is needed by get_sumextra() even for the
    // term-independent contribution, so compute it before the early return
    // below (see get_sumextra() for the derivation).
    auto doclen_max = get_doclength_upper_bound();
    extra_offset = get_query_length() * log(doclen_max + mu);
    if (factor == 0.0) {
	// This object is for the term-independent contribution.
	return;
    }

    auto collection_freq = get_collection_freq();
    if (rare(collection_freq == 0)) {
	// Avoid dividing by zero in the corner case where the term has zero
	// collection frequency.
	factor = 0;
	multiplier = 0;
	return;
    }

    // Pre-calculate the factor to multiply by.
    multiplier = get_total_length() / (collection_freq * mu);

    double delta = param_delta;
    if (delta != 0.0) {
	// Include the query-independent Dir+ extra contribution in factor.
	factor *= log(1.0 + delta * multiplier);
    }
}
192 
193 double
194 LMDirichletWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount,
196 {
197  return factor * log(1.0 + wdf * multiplier);
198 }
199 
// Upper bound on get_sumpart(): the expression is increasing in wdf, so
// evaluate it at the wdf upper bound.
double
LMDirichletWeight::get_maxpart() const
{
    Xapian::termcount wdf_max = get_wdf_upper_bound();
    return factor * log(1.0 + wdf_max * multiplier);
}
206 
207 double
208 LMDirichletWeight::get_sumextra(Xapian::termcount doclen,
210  Xapian::termcount) const
211 {
212  // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent
213  // constant" which is:
214  //
215  // query_len * log(mu / (doclen + mu))
216  //
217  // (The equation for Dir+ is the same.)
218  //
219  // Both mu and doclen are positive, so this value will be negative but
220  // get_sumextra() must return a non-negative value. To solve this we
221  // decompose this into:
222  //
223  // query_len * log(mu) - query_len * log(doclen + mu)
224  //
225  // The first part is constant for a given query so we ignore it. The second
226  // part is still negative, but we can add query_len * log(doclen_max + mu)
227  // (also constant for a given query) to address that, giving:
228  //
229  // query_len * log(doclen_max + mu) - query_len * log(doclen + mu)
230  //
231  // We pre-calculate the first part in init() and save it in member variable
232  // extra_offset.
233  return extra_offset - get_query_length() * log(doclen + param_mu);
234 }
235 
// Upper bound on get_sumextra(): the returned value decreases as doclen
// grows, so evaluate at the document length lower bound.
double
LMDirichletWeight::get_maxextra() const
{
    auto doclen_min = get_doclength_lower_bound();
    return extra_offset - get_query_length() * log(doclen_min + param_mu);
}
242 
244 LMDirichletWeight::clone() const {
245  return new LMDirichletWeight(param_mu, param_delta);
246 }
247 
248 string
250 {
251  return "lmdirichlet";
252 }
253 
// Serialise both parameters (mu, then delta) so unserialise() can
// reconstruct an equivalent object.
string
LMDirichletWeight::serialise() const
{
    string result = serialise_double(param_mu);
    result += serialise_double(param_delta);
    return result;
}
261 
263 LMDirichletWeight::unserialise(const string& s) const
264 {
265  const char *ptr = s.data();
266  const char *end = ptr + s.size();
267  double mu = unserialise_double(&ptr, end);
268  double delta = unserialise_double(&ptr, end);
269  if (rare(ptr != end))
270  throw Xapian::SerialisationError("Extra data in "
271  "LMDirichletWeight::unserialise()");
272  return new LMDirichletWeight(mu, delta);
273 }
274 
275 
277 LMDirichletWeight::create_from_parameters(const char* params) const
278 {
279  const char* p = params;
280  double mu = 2000.0;
281  double delta = 0.05;
283  parameter_error("Parameter mu is invalid", "lmdirichlet", params);
284  if (*p && !Xapian::Weight::Internal::double_param(&p, &delta))
285  parameter_error("Parameter delta is invalid", "lmdirichlet", params);
286  if (*p)
287  parameter_error("Extra data after parameters", "lmdirichlet", params);
288  return new Xapian::LMDirichletWeight(mu, delta);
289 }
290 
// Precompute per-term constants for Absolute Discount smoothing.
void
LMAbsDiscountWeight::init(double factor_)
{
    factor = factor_ * get_wqf();

    // Used by get_sumextra()/get_maxextra(); see get_sumextra() for the
    // derivation.
    auto doclen_max = get_doclength_upper_bound();
    extra_offset = get_query_length() * log(double(doclen_max));

    auto collection_freq = get_collection_freq();
    if (rare(collection_freq == 0)) {
	// Avoid dividing by zero in the corner case where the term has zero
	// collection frequency.
	factor = 0;
	multiplier = 0;
	return;
    }

    // Pre-calculate the factor to multiply by.
    Xapian::totallength total_length = get_total_length();
    multiplier = total_length / (param_delta * collection_freq);
}
312 
313 double
314 LMAbsDiscountWeight::get_sumpart(Xapian::termcount wdf,
316  Xapian::termcount uniqterms,
317  Xapian::termcount) const
318 {
319  return factor * log(1.0 + (wdf - param_delta) / uniqterms * multiplier);
320 }
321 
// Upper bound on get_sumpart() over all matching documents.
double
LMAbsDiscountWeight::get_maxpart() const
{
    Xapian::termcount doclen_min = get_doclength_lower_bound();
    Xapian::termcount wdf_max = get_wdf_upper_bound();
    double x = (wdf_max - param_delta) * multiplier;
    // We need a lower bound on uniqterms.  We have doclen = sum(wdf) so:
    //
    //   uniqterms >= ceil(doclen_min / wdf_max)
    //
    // We also know uniqterms >= 1 (since documents without terms won't match).
    if (doclen_min > wdf_max)
	// Integer ceiling division of doclen_min by wdf_max.
	x *= (doclen_min - 1) / wdf_max + 1;
    return factor * log(1.0 + x);
}
337 
// Document-dependent (term-independent) extra weight contribution.
double
LMAbsDiscountWeight::get_sumextra(Xapian::termcount doclen,
				  Xapian::termcount uniqterms,
				  Xapian::termcount) const
{
    // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent
    // constant" which is:
    //
    //   query_len * log(delta * uniqterms / doclen)
    //
    // We know delta < 1 and uniqterms <= doclen so this value will be negative
    // but get_sumextra() must return a non-negative value.  To solve this we
    // decompose this into:
    //
    //   query_len * log(delta) + query_len * log(uniqterms / doclen)
    //
    // The first part is constant for a given query so we ignore it.  The
    // second part is still negative, but we can add
    // query_len * log(doclen_max) (also constant for a given query) to
    // address that, giving:
    //
    //   -query_len * log(1 / doclen_max) + query_len * log(uniqterms / doclen)
    //   = query_len * log(doclen_max) + query_len * log(uniqterms / doclen)
    //
    // We pre-calculate the first part in init() and save it in member
    // variable extra_offset.
    return extra_offset + get_query_length() * log(double(uniqterms) / doclen);
}
365 
// Upper bound on get_sumextra() over all matching documents.
double
LMAbsDiscountWeight::get_maxextra() const
{
    // Our best bound seems to be based on uniqterms <= doclen, which gives
    // log(uniqterms / doclen) <= 0.
    return extra_offset;
}
373 
375 LMAbsDiscountWeight::clone() const {
376  return new LMAbsDiscountWeight(param_delta);
377 }
378 
379 string
381 {
382  return "lmabsdiscount";
383 }
384 
// Serialise the scheme's single parameter (delta) so unserialise() can
// reconstruct an equivalent object.
string
LMAbsDiscountWeight::serialise() const
{
    return serialise_double(param_delta);
}
390 
392 LMAbsDiscountWeight::unserialise(const string& s) const
393 {
394  const char *ptr = s.data();
395  const char *end = ptr + s.size();
396  double delta = unserialise_double(&ptr, end);
397  if (rare(ptr != end))
398  throw Xapian::SerialisationError("Extra data in "
399  "LMAbsDiscountWeight::unserialise()");
400  return new LMAbsDiscountWeight(delta);
401 }
402 
403 
405 LMAbsDiscountWeight::create_from_parameters(const char* params) const
406 {
407  const char* p = params;
408  double delta = 0.7;
409  if (*p && !Xapian::Weight::Internal::double_param(&p, &delta))
410  parameter_error("Parameter delta is invalid", "lmabsdiscount", params);
411  if (*p)
412  parameter_error("Extra data after parameter", "lmabsdiscount", params);
413  return new Xapian::LMAbsDiscountWeight(delta);
414 }
415 
// Precompute per-term constants for Two-Stage smoothing.
void
LM2StageWeight::init(double factor_)
{
    factor = factor_ * get_wqf();

    double lambda = param_lambda;
    double mu = param_mu;

    // Used by get_sumextra()/get_maxextra(); see get_sumextra() for the
    // derivation.
    auto doclen_max = get_doclength_upper_bound();
    extra_offset = -log((lambda * doclen_max + mu) / (doclen_max + mu));
    extra_offset *= get_query_length();

    auto collection_freq = get_collection_freq();
    if (rare(collection_freq == 0)) {
	// Avoid dividing by zero in the corner case where the term has zero
	// collection frequency.
	factor = 0;
	multiplier = 0;
	return;
    }

    // Pre-calculate the factor to multiply by.
    multiplier = (1 - lambda) * get_total_length() / collection_freq;
}
440 
441 double
442 LM2StageWeight::get_sumpart(Xapian::termcount wdf,
443  Xapian::termcount doclen,
445  Xapian::termcount) const
446 {
447  // Termweight formula is: log{ 1 + (1-λ) c(w;d) / ( (λ|d|+μ) p(w|C) ) }
448  double lambda = param_lambda;
449  double mu = param_mu;
450  return factor * log(1.0 + wdf / (lambda * doclen + mu) * multiplier);
451 }
452 
// Upper bound on get_sumpart() over all matching documents.
double
LM2StageWeight::get_maxpart() const
{
    double lambda = param_lambda;
    double mu = param_mu;
    Xapian::termcount doclen_min = get_doclength_lower_bound();
    Xapian::termcount wdf_max = get_wdf_upper_bound();
    // We know wdf <= doclen so if the bounds don't rule out them being equal
    // we want to find wdf value w to maximise (w / (lambda * w + mu)) which is
    // just a case of maximising w, i.e. wdf_max.  Otherwise we evaluate at
    // wdf = wdf_max, doclen = doclen_min.
    double x = wdf_max / (lambda * max(doclen_min, wdf_max) + mu);
    return factor * log(1.0 + x * multiplier);
}
467 
468 double
469 LM2StageWeight::get_sumextra(Xapian::termcount doclen,
471  Xapian::termcount) const
472 {
473  // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent
474  // constant" which is:
475  //
476  // query_len * log αd
477  //
478  // where αd = ( λ|d| + μ ) / ( |d| + μ ) for two-stage smoothing, so:
479  //
480  // query_len * log{ (lamba * doclen + mu) / (doclen + mu) }
481  //
482  // We know lambda < 1 so this will be negative but get_sumextra() must
483  // return a non-negative value. To achieve this we subtract:
484  //
485  // query_len * log{ (lamba * doclen_max + mu) / (doclen_max + mu) }
486  //
487  // We pre-calculate the negation of this and store it in extra_offset.
488  double lambda = param_lambda;
489  double mu = param_mu;
490  return extra_offset +
491  get_query_length() * log((lambda * doclen + mu) / (doclen + mu));
492 }
493 
// Upper bound on get_sumextra() over all matching documents.
double
LM2StageWeight::get_maxextra() const
{
    // Our best bound is at the doclen value which maximises
    //
    //   log((lambda * doclen + mu) / (doclen + mu))
    //
    // which means maximising
    //
    //   (lambda * doclen + mu) / (doclen + mu)
    //
    // Putting d' = doclen + mu we want to maximise
    //
    //   lambda + (1 - lambda) * mu / d'
    //
    // (1 - lambda) > 0 so this is achieved by minimising d' which means
    // minimising doclen.
    double lambda = param_lambda;
    double mu = param_mu;
    auto doclen = get_doclength_lower_bound();
    return extra_offset +
	   get_query_length() * log((lambda * doclen + mu) / (doclen + mu));
}
517 
519 LM2StageWeight::clone() const {
520  return new LM2StageWeight(param_lambda, param_mu);
521 }
522 
523 string
525 {
526  return "lm2stage";
527 }
528 
// Serialise both parameters (lambda, then mu) so unserialise() can
// reconstruct an equivalent object.
string
LM2StageWeight::serialise() const
{
    string result = serialise_double(param_lambda);
    result += serialise_double(param_mu);
    return result;
}
536 
538 LM2StageWeight::unserialise(const string & s) const
539 {
540  const char *ptr = s.data();
541  const char *end = ptr + s.size();
542  double lambda = unserialise_double(&ptr, end);
543  double mu = unserialise_double(&ptr, end);
544  if (rare(ptr != end))
545  throw Xapian::SerialisationError("Extra data in "
546  "LM2StageWeight::unserialise()");
547  return new LM2StageWeight(lambda, mu);
548 }
549 
551 LM2StageWeight::create_from_parameters(const char* params) const
552 {
553  const char* p = params;
554  double lambda = 0.7;
555  double mu = 2000.0;
556  if (*p && !Xapian::Weight::Internal::double_param(&p, &lambda))
557  parameter_error("Parameter lambda is invalid", "lm2stage", params);
559  parameter_error("Parameter mu is invalid", "lm2stage", params);
560  if (*p)
561  parameter_error("Extra data after parameters", "lm2stage", params);
562  return new Xapian::LM2StageWeight(lambda, mu);
563 }
564 
565 }
char name[9]
Definition: dbcheck.cc:57
Language Model weighting with Two Stage smoothing.
Definition: weight.h:2093
Language Model weighting with Absolute Discount smoothing.
Definition: weight.h:2024
Language Model weighting with Dirichlet or Dir+ smoothing.
Definition: weight.h:1948
Language Model weighting with Jelinek-Mercer smoothing.
Definition: weight.h:1875
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool double_param(const char **p, double *ptr_val)
#define rare(COND)
Definition: config.h:607
PositionList * p
Debug logging macros.
Hierarchy of classes which Xapian can throw as exceptions.
static void parameter_error(const char *message, const std::string &scheme, const char *params)
Definition: lmweight.cc:41
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
Various assertion macros.
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Various handy string-related helpers.
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.