xapian-core  2.0.0
tfidfweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013 Aarsh Shah
5  * Copyright (C) 2016 Vivek Pal
6  * Copyright (C) 2016,2017,2024 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 #include "keyword.h"
29 #include "weightinternal.h"
30 #include <cmath>
31 #include <cstring>
32 
33 #include "debuglog.h"
34 #include "omassert.h"
35 #include "serialise-double.h"
36 
37 #include "xapian/error.h"
38 
39 using namespace std;
40 
41 namespace Xapian {
42 
43 static TfIdfWeight::wdf_norm
44 decode_wdf_norm(const string& normalizations)
45 {
46  if (normalizations.length() != 3)
47  throw Xapian::InvalidArgumentError("Normalization string is invalid");
48  switch (normalizations[0]) {
49  case 'b':
51  case 's':
52  return TfIdfWeight::wdf_norm::SQUARE;
53  case 'l':
54  return TfIdfWeight::wdf_norm::LOG;
55  case 'P':
56  return TfIdfWeight::wdf_norm::PIVOTED;
57  case 'L':
58  return TfIdfWeight::wdf_norm::LOG_AVERAGE;
59  case 'n':
61  case 'm':
62  return TfIdfWeight::wdf_norm::MAX;
63  case 'a':
64  return TfIdfWeight::wdf_norm::AUG;
65  }
66  throw Xapian::InvalidArgumentError("Normalization string is invalid");
67 }
68 
69 static TfIdfWeight::idf_norm
70 decode_idf_norm(const string& normalizations)
71 {
72  if (normalizations.length() != 3)
73  throw Xapian::InvalidArgumentError("Normalization string is invalid");
74  switch (normalizations[1]) {
75  case 'n':
77  case 's':
78  return TfIdfWeight::idf_norm::SQUARE;
79  case 'f':
80  return TfIdfWeight::idf_norm::FREQ;
81  case 'P':
82  return TfIdfWeight::idf_norm::PIVOTED;
83  case 'p':
84  return TfIdfWeight::idf_norm::PROB;
85  case 't':
86  return TfIdfWeight::idf_norm::TFIDF;
87  }
88  throw Xapian::InvalidArgumentError("Normalization string is invalid");
89 }
90 
91 static TfIdfWeight::wt_norm
92 decode_wt_norm(const string& normalizations)
93 {
94  if (normalizations.length() != 3)
95  throw Xapian::InvalidArgumentError("Normalization string is invalid");
96  switch (normalizations[2]) {
97  case 'n':
99  }
100  throw Xapian::InvalidArgumentError("Normalization string is invalid");
101 }
102 
103 TfIdfWeight::TfIdfWeight(const std::string& normals,
104  double slope, double delta)
106  decode_idf_norm(normals),
107  decode_wt_norm(normals),
108  slope, delta) {}
109 
// Constructor taking the three normalisations as enum values plus the slope
// and delta parameters.  It stores all five in members, rejects non-positive
// slope or delta with InvalidArgumentError, and registers the statistics the
// scheme needs via need_stat().
//
// NOTE(review): the embedded listing numbers jump here (109 -> 111,
// 121 -> 124, 125 -> 127, 127 -> 132 -> 139 -> 142 -> 148), so the first
// line of the signature and most of the conditional need_stat() calls appear
// to be missing from this copy - TODO restore them from the upstream file
// before relying on this block.
111  idf_norm idf_normalization,
112  wt_norm wt_normalization,
113  double slope, double delta)
114  : wdf_norm_(wdf_normalization), idf_norm_(idf_normalization),
115  wt_norm_(wt_normalization), param_slope(slope), param_delta(delta)
116 {
// Both parameters must be strictly positive.
117  if (param_slope <= 0)
118  throw Xapian::InvalidArgumentError("Parameter slope is invalid");
119  if (param_delta <= 0)
120  throw Xapian::InvalidArgumentError("Parameter delta is invalid");
// NOTE(review): the body of this branch is not visible; presumably it
// requests the statistics get_idfn() reads - confirm against upstream.
121  if (idf_norm_ != idf_norm::NONE) {
124  }
// WDF and WQF are requested unconditionally.
125  need_stat(WDF);
127  need_stat(WQF);
// NOTE(review): the conditions and bodies belonging to the closing braces
// below are not visible in this copy.
132  }
139  }
142  }
148  }
149 }
150 
151 TfIdfWeight *
153 {
156 }
157 
158 void
159 TfIdfWeight::init(double factor_)
160 {
161  if (factor_ == 0.0) {
162  // This object is for the term-independent contribution, and that's
163  // always zero for this scheme.
164  return;
165  }
166 
167  wqf_factor = get_wqf() * factor_;
169 }
170 
171 string
173 {
174  return "tfidf";
175 }
176 
177 string
179 {
180  string result = serialise_double(param_slope);
181  result += serialise_double(param_delta);
182  result += static_cast<unsigned char>(wdf_norm_);
183  result += static_cast<unsigned char>(idf_norm_);
184  result += static_cast<unsigned char>(wt_norm_);
185  return result;
186 }
187 
188 TfIdfWeight *
189 TfIdfWeight::unserialise(const string & s) const
190 {
191  const char *ptr = s.data();
192  const char *end = ptr + s.size();
193  double slope = unserialise_double(&ptr, end);
194  double delta = unserialise_double(&ptr, end);
195  if (rare(end - ptr != 3))
197  ("Incorrect data in TfIdfWeight::unserialise()");
198  wdf_norm wdf_normalization = static_cast<wdf_norm>(*(ptr)++);
199  idf_norm idf_normalization = static_cast<idf_norm>(*(ptr)++);
200  wt_norm wt_normalization = static_cast<wt_norm>(*(ptr)++);
201  return new TfIdfWeight(wdf_normalization, idf_normalization,
202  wt_normalization, slope, delta);
203 }
204 
205 double
207  Xapian::termcount doclen,
208  Xapian::termcount uniqterms,
209  Xapian::termcount wdfdocmax) const
210 {
211  double wdfn = get_wdfn(wdf, doclen, uniqterms, wdfdocmax, wdf_norm_);
212  return get_wtn(wdfn * idfn, wt_norm_) * wqf_factor;
213 }
214 
215 // An upper bound can be calculated simply on the basis of wdf_max as termfreq
216 // and N are constants.
217 double
219 {
222  double wdfn = get_wdfn(wdf_max, len_min, len_min, wdf_max, wdf_norm_);
223  return get_wtn(wdfn * idfn, wt_norm_) * wqf_factor;
224 }
225 
226 // Return normalized wdf, idf and weight depending on the normalization string.
227 double
229  Xapian::termcount uniqterms,
230  Xapian::termcount wdfdocmax,
231  wdf_norm wdf_normalization) const
232 {
233  switch (wdf_normalization) {
234  case wdf_norm::BOOLEAN:
235  if (wdf == 0) return 0;
236  return 1.0;
237  case wdf_norm::SQUARE:
238  return (wdf * wdf);
239  case wdf_norm::LOG:
240  if (wdf == 0) return 0;
241  return (1 + log(double(wdf)));
242  case wdf_norm::PIVOTED: {
243  if (wdf == 0) return 0;
244  double normlen = doclen / get_average_length();
245  double norm_factor = 1 / (1 - param_slope + (param_slope * normlen));
246  return ((1 + log(1 + log(double(wdf)))) * norm_factor + param_delta);
247  }
248  case wdf_norm::LOG_AVERAGE: {
249  if (wdf == 0) return 0;
250  double uniqterm_double = uniqterms;
251  double doclen_double = doclen;
252  double wdf_avg = 1;
253  if (doclen_double == 0 || uniqterm_double == 0)
254  wdf_avg = 1;
255  else
256  wdf_avg = doclen_double / uniqterm_double;
257  double num = 1 + log(double(wdf));
258  double den = 1 + log(wdf_avg);
259  return num / den;
260  }
261  case wdf_norm::AUG_LOG: {
262  if (wdf == 0) return 0;
263  return (0.2 + 0.8 * log(1.0 + wdf));
264  }
265  case wdf_norm::SQRT: {
266  if (wdf == 0) return 0;
267  return (sqrt(wdf - 0.5) + 1);
268  }
269  case wdf_norm::AUG_AVERAGE: {
270  if (wdf == 0) return 0;
271  return 0.9 + 0.1 * (double(wdf) / (double(doclen) / uniqterms));
272  }
273  case wdf_norm::MAX:
274  if (rare(wdfdocmax == 0)) return 0;
275  return double(wdf) / wdfdocmax;
276  case wdf_norm::AUG: {
277  if (wdf == 0) return 0;
278  return 0.5 + 0.5 * (double(wdf) / wdfdocmax);
279  }
280  case wdf_norm::NONE:
281  break;
282  }
283  return wdf;
284 }
285 
286 double
287 TfIdfWeight::get_idfn(idf_norm idf_normalization) const
288 {
289  Xapian::doccount termfreq = 1;
290  if (idf_normalization != idf_norm::NONE) termfreq = get_termfreq();
291  double N = 1.0;
292  if (idf_normalization == idf_norm::PROB ||
293  idf_normalization == idf_norm::SQUARE ||
294  idf_normalization == idf_norm::PIVOTED ||
295  idf_normalization == idf_norm::TFIDF)
296  N = get_collection_size();
297  Xapian::termcount collfreq = 1;
298  switch (idf_normalization) {
299  case idf_norm::NONE:
300  return 1.0;
301  case idf_norm::PROB:
302  // All documents are indexed by the term
303  if (N == termfreq) return 0;
304  return log((N - termfreq) / termfreq);
305  case idf_norm::FREQ:
306  return (1.0 / termfreq);
307  case idf_norm::SQUARE: {
308  double x = log(N / termfreq);
309  return x * x;
310  }
311  case idf_norm::PIVOTED:
312  return log((N + 1) / termfreq);
313  case idf_norm::GLOBAL_FREQ: {
314  collfreq = get_collection_freq();
315  return (double(collfreq) / termfreq);
316  }
318  collfreq = get_collection_freq();
319  return log(double(collfreq) / termfreq + 1);
320  }
322  collfreq = get_collection_freq();
323  return (double(collfreq) / termfreq + 1);
324  }
326  collfreq = get_collection_freq();
327  return sqrt(double(collfreq) / termfreq - 0.9);
328  }
329  case idf_norm::TFIDF:
330  break;
331  }
332  return (log(N / termfreq));
333 }
334 
335 double
336 TfIdfWeight::get_wtn(double wt, wt_norm wt_normalization) const
337 {
338  (void)wt_normalization;
339  return wt;
340 }
341 
342 [[noreturn]]
343 static inline void
344 parameter_error(const char* message, const char* params)
345 {
346  Xapian::Weight::Internal::parameter_error(message, "tfidf", params);
347 }
348 
349 TfIdfWeight *
350 TfIdfWeight::create_from_parameters(const char* params) const
351 {
352  const char* p = params;
353  if (*p == '\0')
354  return new Xapian::TfIdfWeight();
355 
356  string s;
357  int code = 0;
358 
360  (code = keyword(wdf_norm_tab, s.data(), s.size())) < 0) {
361  if (code < 0 && s.size() == 3 && *p == '\0') {
362  // Support 3 letter SMART codes such as "ntn".
363  return new Xapian::TfIdfWeight(s);
364  }
365  parameter_error("Parameter 1 (wdf_normalisation) is invalid", params);
366  }
367  wdf_norm wdf_normalisation_ = static_cast<wdf_norm>(code);
368 
369  s.resize(0);
371  (code = keyword(idf_norm_tab, s.data(), s.size())) < 0) {
372  parameter_error("Parameter 2 (idf_normalisation) is invalid", params);
373  }
374  idf_norm idf_normalisation_ = static_cast<idf_norm>(code);
375 
376  s.resize(0);
377  if (!Xapian::Weight::Internal::param_name(&p, s) || s != "NONE") {
378  parameter_error("Parameter 3 (wt_normalisation) is invalid", params);
379  }
380  wt_norm wt_normalisation_ = wt_norm::NONE;
381 
382  if (*p)
383  parameter_error("Extra data after parameter 3", params);
384  return new Xapian::TfIdfWeight(wdf_normalisation_, idf_normalisation_,
385  wt_normalisation_);
386 }
387 
388 }
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:704
double wqf_factor
The factor to multiply with the weight.
Definition: weight.h:868
idf_norm
Idf normalizations.
Definition: weight.h:784
@ GLOBAL_FREQ
Global frequency IDF.
@ LOG_GLOBAL_FREQ
Log global frequency IDF.
@ SQRT_GLOBAL_FREQ
Square root global frequency IDF.
@ INCREMENTED_GLOBAL_FREQ
Incremented global frequency IDF.
double param_delta
Definition: weight.h:874
double get_wdfn(Xapian::termcount wdf, Xapian::termcount len, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax, wdf_norm wdf_normalization) const
Definition: tfidfweight.cc:228
TfIdfWeight * create_from_parameters(const char *params) const
Create from a human-readable parameter string.
Definition: tfidfweight.cc:350
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Definition: tfidfweight.cc:218
wt_norm wt_norm_
The parameter for normalization for the document weight.
Definition: weight.h:865
TfIdfWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: tfidfweight.cc:189
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm, Xapian::termcount wdfdocmax) const
Calculate the weight contribution for this object's term to a document.
Definition: tfidfweight.cc:206
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
Definition: tfidfweight.cc:159
TfIdfWeight * clone() const
Clone this object.
Definition: tfidfweight.cc:152
wdf_norm wdf_norm_
The parameter for normalization for the wdf.
Definition: weight.h:861
wt_norm
Weight normalizations.
Definition: weight.h:852
idf_norm idf_norm_
The parameter for normalization for the idf.
Definition: weight.h:863
double idfn
Normalised IDF value (document-independent).
Definition: weight.h:871
std::string name() const
Return the name of this weighting scheme, e.g.
Definition: tfidfweight.cc:172
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
Definition: weight.h:1023
double get_idfn(idf_norm idf_normalization) const
Definition: tfidfweight.cc:287
wdf_norm
Wdf normalizations.
Definition: weight.h:710
@ AUG
Augmented max wdf.
@ AUG_AVERAGE
Augmented average term frequency.
std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: tfidfweight.cc:178
double get_wtn(double wt, wt_norm wt_normalization) const
Definition: tfidfweight.cc:336
double param_slope
Parameters slope and delta in the Piv+ normalization weighting formula.
Definition: weight.h:874
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool param_name(const char **p, std::string &name)
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:586
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:558
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:183
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:570
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:564
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:549
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:555
@ WDF_DOC_MAX
Maximum wdf in the current document.
Definition: weight.h:94
@ UNIQUE_TERMS
Number of unique terms in the current document.
Definition: weight.h:85
@ AVERAGE_LENGTH
Average length of documents in the collection.
Definition: weight.h:47
@ DOC_LENGTH_MAX
Upper bound on document lengths.
Definition: weight.h:73
@ DOC_LENGTH
Length of the current document (sum wdf).
Definition: weight.h:59
@ TERMFREQ
How many documents the current term is in.
Definition: weight.h:49
@ WQF
Within-query-frequency of the current term.
Definition: weight.h:55
@ COLLECTION_SIZE
Number of documents in the collection.
Definition: weight.h:43
@ WDF_MAX
Upper bound on wdf.
Definition: weight.h:81
@ DOC_LENGTH_MIN
Lower bound on (non-zero) document lengths.
Definition: weight.h:65
@ COLLECTION_FREQ
Sum of wdf over the whole collection for the current term.
Definition: weight.h:83
@ WDF
Within-document-frequency of the current term in the current document.
Definition: weight.h:57
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:594
#define rare(COND)
Definition: config.h:607
PositionList * p
Debug logging macros.
Hierarchy of classes which Xapian can throw as exceptions.
Map string to idf normalisation code.
static const unsigned char idf_norm_tab[]
int keyword(const unsigned char *p, const char *s, size_t len)
Definition: keyword.cc:32
Efficient keyword to enum lookup.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
static TfIdfWeight::wdf_norm decode_wdf_norm(const string &normalizations)
Definition: tfidfweight.cc:44
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
static TfIdfWeight::idf_norm decode_idf_norm(const string &normalizations)
Definition: tfidfweight.cc:70
static void parameter_error(const char *message, const char *params)
Definition: bb2weight.cc:185
static TfIdfWeight::wt_norm decode_wt_norm(const string &normalizations)
Definition: tfidfweight.cc:92
Various assertion macros.
@ NONE
Definition: sbl-dispatch.h:26
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Map string to wdf normalisation code.
static const unsigned char wdf_norm_tab[]
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.