xapian-core  1.4.25
tfidfweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013 Aarsh Shah
5  * Copyright (C) 2016 Vivek Pal
6  * Copyright (C) 2016,2017 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 #include <cmath>
27 #include <cstring>
28 
29 #include "debuglog.h"
30 #include "omassert.h"
31 
32 #include "xapian/error.h"
33 
34 using namespace std;
35 
36 namespace Xapian {
37 
 // Construct a TfIdfWeight from a normalization string.
 //
 // `normals` is copied into `normalizations` and then validated: it must be
 // exactly 3 characters long, with character 0 accepted by strchr() against
 // "nbslL", character 1 against "ntpfs" and character 2 against "n".
 // Throws Xapian::InvalidArgumentError if any of those checks fails.
 //
 // NOTE(review): this listing omits source lines 47-48, 51 and 54-57, so the
 // bodies of the two trailing `if` blocks (and anything between the two
 // visible need_stat() calls) cannot be seen here.
 // NOTE(review): strchr() treats the terminating '\0' of its first argument
 // as part of the string, so a NUL character in any of the three positions
 // appears to pass this validation - confirm whether that is intended.
38 TfIdfWeight::TfIdfWeight(const std::string &normals)
39  : normalizations(normals)
40 {
41  if (normalizations.length() != 3 ||
42  !strchr("nbslL", normalizations[0]) ||
43  !strchr("ntpfs", normalizations[1]) ||
44  !strchr("n", normalizations[2]))
45  throw Xapian::InvalidArgumentError("Normalization string is invalid");
 // Extra statistics requested only when the idf normalization is not 'n'
 // (the requests themselves are on lines missing from this listing).
46  if (normalizations[1] != 'n') {
49  }
 // WDF and WQF are requested for every normalization string.
50  need_stat(WDF);
52  need_stat(WQF);
 // Extra statistics for the 'L' wdf normalization (lines missing from this
 // listing).
53  if (normalizations[0] == 'L') {
58  }
59 }
60 
 // Body of TfIdfWeight::clone() (the signature lines are missing from this
 // listing).  Returns a newly allocated TfIdfWeight constructed from this
 // object's normalization string.
 // NOTE(review): the object is created with `new`; ownership presumably
 // passes to the caller - confirm.
63 {
64  return new TfIdfWeight(normalizations);
65 }
66 
 // Initialise the per-term scale factor.
 //
 // When `factor_` is exactly 0.0 nothing is done and `factor` is left
 // untouched; otherwise `factor` is set to get_wqf() multiplied by `factor_`.
67 void
68 TfIdfWeight::init(double factor_)
69 {
70  if (factor_ == 0.0) {
71  // This object is for the term-independent contribution, and that's
72  // always zero for this scheme.
73  return;
74  }
75 
 // Fold the within-query frequency into the factor once here so that the
 // weight functions only need a single multiplication by `factor`.
76  factor = get_wqf() * factor_;
77 }
78 
 // TfIdfWeight::name() (the signature line is missing from this listing).
 // Returns the fixed scheme name "Xapian::TfIdfWeight".
79 string
81 {
82  return "Xapian::TfIdfWeight";
83 }
84 
 // TfIdfWeight::serialise() (the signature line is missing from this
 // listing).  The serialised form is simply the normalization string,
 // returned unchanged.
85 string
87 {
88  return normalizations;
89 }
90 
 // Rebuild a TfIdfWeight from the string `s` (the return-type line is
 // missing from this listing).
 //
 // Throws Xapian::SerialisationError unless `s` is exactly 3 characters
 // long; otherwise returns a newly allocated TfIdfWeight constructed from
 // `s`.  The constructor performs its own validation of `s` and throws
 // Xapian::InvalidArgumentError if the characters are not accepted.
 // NOTE(review): the message says "Extra data" but the check also fires when
 // `s` is shorter than 3 characters.
92 TfIdfWeight::unserialise(const string & s) const
93 {
94  if (s.length() != 3)
95  throw Xapian::SerialisationError("Extra data in TfIdfWeight::unserialise()");
96  return new TfIdfWeight(s);
97 }
98 
 // get_wdfn_for_L(wdf, doclen, uniqterms): wdf normalization used when the
 // first normalization character is 'L' (the first signature line is missing
 // from this listing).
 //
 // Returns 0 when wdf is 0; otherwise
 //     (1 + log(wdf)) / (1 + log(wdf_avg))
 // where wdf_avg is doclen / uniqterms, or 1 if either of those is zero.
99 static double
101  Xapian::termcount uniqterms)
102 {
103  if (wdf == 0) return 0;
 // Convert to double up front so the division below is floating point.
104  double uniqterm_double = uniqterms;
105  double doclen_double = doclen;
106  double wdf_avg = 1;
 // Guard against dividing by zero (and against log(0) further down) by
 // falling back to an average of 1.
107  if (doclen_double == 0 || uniqterm_double == 0)
108  wdf_avg = 1;
109  else
110  wdf_avg = doclen_double / uniqterm_double;
111  double num = 1 + log(double(wdf));
112  double den = 1 + log(wdf_avg);
113  return num / den;
114 }
115 
 // TfIdfWeight::get_sumpart(wdf, doclen, uniqterms) (the first signature line
 // is missing from this listing).
 //
 // Computes wdfn * idfn, passes the product through get_wtn() and scales the
 // result by `factor`.  The three characters of `normalizations` select the
 // wdf, idf and weight normalizations respectively.
116 double
118  Xapian::termcount uniqterms) const
119 {
 // termfreq is only fetched when the idf normalization actually uses it.
120  Xapian::doccount termfreq = 1;
121  if (normalizations[1] != 'n') termfreq = get_termfreq();
122  double wt;
 // 'L' needs the document length and unique-term count, so it has its own
 // helper; every other wdf normalization goes through get_wdfn().
123  if (normalizations[0] != 'L') {
124  wt = get_wdfn(wdf, normalizations[0]);
125  } else {
126  wt = get_wdfn_for_L(wdf, doclen, uniqterms);
127  }
128  wt *= get_idfn(termfreq, normalizations[1]);
129  return get_wtn(wt, normalizations[2]) * factor;
130 }
131 
132 // An upper bound can be calculated simply on the basis of wdf_max as termfreq
133 // and N are constants.
 //
 // TfIdfWeight::get_maxpart() (the signature line is missing from this
 // listing).  Mirrors get_sumpart() but evaluates the wdf normalization at
 // `wdf_max` instead of a per-document wdf.
 // NOTE(review): the definitions of `wdf_max` and `len_min` are on lines
 // missing from this listing; presumably they come from
 // get_wdf_upper_bound() and get_doclength_lower_bound() - confirm.
134 double
136 {
137  Xapian::doccount termfreq = 1;
138  if (normalizations[1] != 'n') termfreq = get_termfreq();
140  double wt;
141  if (normalizations[0] != 'L') {
142  wt = get_wdfn(wdf_max, normalizations[0]);
143  } else {
 // Passing len_min as both doclen and uniqterms makes wdf_avg equal to 1
 // inside get_wdfn_for_L(), so its denominator is 1 + log(1) = 1.
145  wt = get_wdfn_for_L(wdf_max, len_min, len_min);
146  }
147  wt *= get_idfn(termfreq, normalizations[1]);
148  return get_wtn(wt, normalizations[2]) * factor;
149 }
150 
151 // There is no extra per document component in the TfIdfWeighting scheme.
 // TfIdfWeight::get_sumextra() (the signature line is missing from this
 // listing): always returns 0.
152 double
154 {
155  return 0;
156 }
157 
 // TfIdfWeight::get_maxextra() (the signature line is missing from this
 // listing): always returns 0, matching get_sumextra() which never
 // contributes anything.
158 double
160 {
161  return 0;
162 }
163 
164 // Return normalized wdf, idf and weight depending on the normalization string.
 //
 // TfIdfWeight::get_wdfn(wdf, c) (the signature line is missing from this
 // listing).  `c` is the wdf normalization character:
 //   'b' - 1.0 if wdf is non-zero, otherwise 0
 //   's' - wdf squared
 //   'l' - 1 + log(wdf), or 0 if wdf is 0
 //   'n' - wdf unchanged (also the fallback; asserted via AssertEq)
165 double
167 {
168  switch (c) {
169  case 'b':
170  if (wdf == 0) return 0;
171  return 1.0;
172  case 's':
 // Promote to double before multiplying: wdf has an unsigned integer type,
 // so multiplying it by itself as an integer would wrap around for large
 // values before the result was converted to double.
173  return (double(wdf) * wdf);
174  case 'l':
 // log(0) is undefined, so a zero wdf is handled separately.
175  if (wdf == 0) return 0;
176  return (1 + log(double(wdf)));
177  default:
178  AssertEq(c, 'n');
179  return wdf;
180  }
181 }
182 
 // TfIdfWeight::get_idfn(termfreq, c) (the signature line is missing from
 // this listing).  `c` is the idf normalization character:
 //   'n' - 1.0
 //   'p' - log((N - termfreq) / termfreq), or 0 when N == termfreq
 //   'f' - 1 / termfreq
 //   's' - log(N / termfreq) squared
 //   't' - log(N / termfreq) (also the fallback; asserted via AssertEq)
 // where N is get_collection_size().
 // NOTE(review): every case except 'n' divides by termfreq - assumes
 // termfreq > 0; TODO confirm against callers.
183 double
185 {
 // The collection size is only fetched for the normalizations which use it.
186  double N = 1.0;
187  if (c != 'n' && c != 'f') N = get_collection_size();
188  switch (c) {
189  case 'n':
190  return 1.0;
191  case 'p':
192  // All documents are indexed by the term
193  if (N == termfreq) return 0;
194  return log((N - termfreq) / termfreq);
195  case 'f':
196  return (1.0 / termfreq);
197  case 's': {
198  double x = log(N / termfreq);
199  return x * x;
200  }
201  default:
202  AssertEq(c, 't');
203  return (log(N / termfreq));
204  }
205 }
206 
 // Apply the weight normalization selected by `c` to `wt`.
 //
 // Only 'n' is handled here, so `wt` is returned unchanged; the constructor
 // only accepts 'n' in this position.
207 double
208 TfIdfWeight::get_wtn(double wt, char c) const
209 {
 // NOTE(review): the (void) cast presumably silences an unused-parameter
 // warning in builds where AssertEq() expands to nothing - confirm.
210  (void)c;
211  AssertEq(c, 'n');
212  return wt;
213 }
214 
215 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
TfIdfWeight * clone() const
Clone this object.
Definition: tfidfweight.cc:62
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:363
std::string normalizations
Definition: weight.h:450
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
Definition: tfidfweight.cc:68
#define AssertEq(A, B)
Definition: omassert.h:124
double factor
The factor to multiply with the weight.
Definition: weight.h:453
Upper bound on document lengths.
Definition: weight.h:60
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Definition: tfidfweight.cc:135
STL namespace.
Lower bound on (non-zero) document lengths.
Definition: weight.h:58
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
Definition: tfidfweight.cc:159
std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: tfidfweight.cc:86
double get_wtn(double wt, char c) const
Definition: tfidfweight.cc:208
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
Definition: weight.h:509
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Length of the current document (sum wdf).
Definition: weight.h:56
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
Definition: weight.h:400
TfIdfWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: tfidfweight.cc:92
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
Within-query-frequency of the current term.
Definition: weight.h:52
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the term-independent weight component for a document.
Definition: tfidfweight.cc:153
double get_wdfn(Xapian::termcount wdf, char c) const
Definition: tfidfweight.cc:166
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:384
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm) const
Calculate the weight contribution for this object's term to a document.
Definition: tfidfweight.cc:117
Weighting scheme API.
Within-document-frequency of the current term in the current document.
Definition: weight.h:54
static double get_wdfn_for_L(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms)
Definition: tfidfweight.cc:100
Upper bound on wdf.
Definition: weight.h:62
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:372
How many documents the current term is in.
Definition: weight.h:46
double get_idfn(Xapian::doccount termfreq, char c) const
Definition: tfidfweight.cc:184
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Number of documents in the collection.
Definition: weight.h:40
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:83
Number of unique terms in the current document.
Definition: weight.h:66
Various assertion macros.
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term.
Definition: weight.h:408
std::string name() const
Return the name of this weighting scheme.
Definition: tfidfweight.cc:80
Debug logging macros.
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:447