xapian-core  2.0.0
weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2014,2017,2019,2024 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  * Copyright (C) 2017 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 
27 #include "backends/leafpostlist.h"
28 #include "weightinternal.h"
29 
30 #include "omassert.h"
31 #include "debuglog.h"
32 
33 #include "xapian/error.h"
34 
35 using namespace std;
36 
37 namespace Xapian {
38 
39 void
40 Weight::init_(const Internal & stats, Xapian::termcount query_length,
41  const Xapian::Database::Internal* shard)
42 {
43  LOGCALL_VOID(MATCH, "Weight::init_", stats | query_length | shard);
44  collection_size_ = stats.collection_size;
45  rset_size_ = stats.rset_size;
46  if (stats_needed & AVERAGE_LENGTH)
47  average_length_ = stats.get_average_length();
48  if (stats_needed & DOC_LENGTH_MAX)
49  doclength_upper_bound_ = shard->get_doclength_upper_bound();
50  if (stats_needed & DOC_LENGTH_MIN)
51  doclength_lower_bound_ = shard->get_doclength_lower_bound();
52  if (stats_needed & UNIQUE_TERMS_MAX)
53  unique_terms_upper_bound_ = shard->get_unique_terms_upper_bound();
54  if (stats_needed & UNIQUE_TERMS_MIN)
55  unique_terms_lower_bound_ = shard->get_unique_terms_lower_bound();
56  if (stats_needed & TOTAL_LENGTH)
57  total_length_ = stats.total_length;
58  if (stats_needed & DB_DOC_LENGTH_MAX)
59  db_doclength_upper_bound_ = stats.db_doclength_upper_bound;
60  if (stats_needed & DB_DOC_LENGTH_MIN)
61  db_doclength_lower_bound_ = stats.db_doclength_lower_bound;
62  if (stats_needed & DB_UNIQUE_TERMS_MAX)
63  db_unique_terms_upper_bound_ = stats.db_unique_terms_upper_bound;
64  if (stats_needed & DB_UNIQUE_TERMS_MIN)
65  db_unique_terms_lower_bound_ = stats.db_unique_terms_lower_bound;
66  collectionfreq_ = 0;
67  wdf_upper_bound_ = 0;
68  termfreq_ = 0;
69  reltermfreq_ = 0;
70  query_length_ = query_length;
71  wqf_ = 1;
72  init(0.0);
73 }
74 
75 void
76 Weight::init_(const Internal & stats, Xapian::termcount query_length,
77  const string & term, Xapian::termcount wqf, double factor,
78  const Xapian::Database::Internal* shard,
79  void* postlist_void)
80 {
81  LOGCALL_VOID(MATCH, "Weight::init_", stats | query_length | term | wqf | factor | shard | postlist_void);
82  collection_size_ = stats.collection_size;
83  rset_size_ = stats.rset_size;
84  if (stats_needed & AVERAGE_LENGTH)
85  average_length_ = stats.get_average_length();
86  if (stats_needed & DOC_LENGTH_MAX)
87  doclength_upper_bound_ = shard->get_doclength_upper_bound();
88  if (stats_needed & DOC_LENGTH_MIN)
89  doclength_lower_bound_ = shard->get_doclength_lower_bound();
90  if (stats_needed & UNIQUE_TERMS_MAX)
91  unique_terms_upper_bound_ = shard->get_unique_terms_upper_bound();
92  if (stats_needed & UNIQUE_TERMS_MIN)
93  unique_terms_lower_bound_ = shard->get_unique_terms_lower_bound();
94  if (stats_needed & TOTAL_LENGTH)
95  total_length_ = stats.total_length;
96  if (stats_needed & WDF_MAX) {
97  auto postlist = static_cast<LeafPostList*>(postlist_void);
98  wdf_upper_bound_ = postlist->get_wdf_upper_bound();
99  }
100  if (stats_needed & DB_DOC_LENGTH_MAX)
101  db_doclength_upper_bound_ = stats.db_doclength_upper_bound;
102  if (stats_needed & DB_DOC_LENGTH_MIN)
103  db_doclength_lower_bound_ = stats.db_doclength_lower_bound;
104  if (stats_needed & DB_UNIQUE_TERMS_MAX)
105  db_unique_terms_upper_bound_ = stats.db_unique_terms_upper_bound;
106  if (stats_needed & DB_UNIQUE_TERMS_MIN)
107  db_unique_terms_lower_bound_ = stats.db_unique_terms_lower_bound;
108  if (stats_needed & DB_WDF_MAX) {
109  // FIXME: Nothing uses this stat, so for now return a correct but
110  // likely fairly loose upper bound. Once we have something that
111  // wants to use this we can implement tracking a per-term wdf_max
112  // across the whole database.
113  db_wdf_upper_bound_ = stats.db_doclength_upper_bound;
114  }
115  if (stats_needed & (TERMFREQ | RELTERMFREQ | COLLECTION_FREQ)) {
116  bool ok = stats.get_stats(term,
117  termfreq_, reltermfreq_, collectionfreq_);
118  (void)ok;
119  Assert(ok);
120  }
121  query_length_ = query_length;
122  wqf_ = wqf;
123  init(factor);
124 }
125 
126 void
127 Weight::init_(const Internal & stats, Xapian::termcount query_length,
128  double factor, Xapian::doccount termfreq,
129  Xapian::doccount reltermfreq, Xapian::termcount collection_freq,
130  const Xapian::Database::Internal* shard)
131 {
132  LOGCALL_VOID(MATCH, "Weight::init_", stats | query_length | factor | termfreq | reltermfreq | collection_freq | shard);
133  // Synonym case.
134  collection_size_ = stats.collection_size;
135  rset_size_ = stats.rset_size;
136  if (stats_needed & AVERAGE_LENGTH)
137  average_length_ = stats.get_average_length();
138  if (stats_needed & (DOC_LENGTH_MAX | WDF_MAX)) {
139  doclength_upper_bound_ = shard->get_doclength_upper_bound();
140  // The doclength is an upper bound on the wdf. This is obviously true
141  // for normal terms, but SynonymPostList ensures that it is also true
142  // for synonym terms by clamping the wdf values returned to the
143  // doclength.
144  //
145  // (This clamping is only actually necessary in cases where a
146  // constituent term of the synonym is repeated.)
147  wdf_upper_bound_ = doclength_upper_bound_;
148  }
149  if (stats_needed & DOC_LENGTH_MIN)
150  doclength_lower_bound_ = shard->get_doclength_lower_bound();
151  if (stats_needed & UNIQUE_TERMS_MAX)
152  unique_terms_upper_bound_ = shard->get_unique_terms_upper_bound();
153  if (stats_needed & UNIQUE_TERMS_MIN)
154  unique_terms_lower_bound_ = shard->get_unique_terms_lower_bound();
155  if (stats_needed & TOTAL_LENGTH)
156  total_length_ = stats.total_length;
157  if (stats_needed & (DB_DOC_LENGTH_MAX | DB_WDF_MAX)) {
158  db_doclength_upper_bound_ = stats.db_doclength_upper_bound;
159  // The doclength is an upper bound on the wdf. This is obviously true
160  // for normal terms, but SynonymPostList ensures that it is also true
161  // for synonym terms by clamping the wdf values returned to the
162  // doclength.
163  //
164  // (This clamping is only actually necessary in cases where a
165  // constituent term of the synonym is repeated.)
166  db_wdf_upper_bound_ = db_doclength_upper_bound_;
167  }
168  if (stats_needed & DB_DOC_LENGTH_MIN)
169  db_doclength_lower_bound_ = stats.db_doclength_lower_bound;
170  if (stats_needed & DB_UNIQUE_TERMS_MAX)
171  db_unique_terms_upper_bound_ = stats.db_unique_terms_upper_bound;
172  if (stats_needed & DB_UNIQUE_TERMS_MIN)
173  db_unique_terms_lower_bound_ = stats.db_unique_terms_lower_bound;
174 
175  termfreq_ = termfreq;
176  reltermfreq_ = reltermfreq;
177  query_length_ = query_length;
178  collectionfreq_ = collection_freq;
179  wqf_ = 1;
180  init(factor);
181 }
182 
183 Weight::~Weight() { }
184 
185 string
187 {
188  return string();
189 }
190 
191 string
192 Weight::serialise() const
193 {
194  throw Xapian::UnimplementedError("serialise() not supported for this Xapian::Weight subclass");
195 }
196 
197 Weight *
198 Weight::unserialise(const string &) const
199 {
200  throw Xapian::UnimplementedError("unserialise() not supported for this Xapian::Weight subclass");
201 }
202 
203 double
204 Weight::get_sumextra(Xapian::termcount,
206  Xapian::termcount) const
207 {
208  return 0.0;
209 }
210 
211 double
212 Weight::get_maxextra() const
213 {
214  return 0.0;
215 }
216 
217 [[noreturn]]
218 static inline void
219 parameter_error(const char* message, const string& scheme, const char* params)
220 {
221  Xapian::Weight::Internal::parameter_error(message, scheme, params);
222 }
223 
224 const Weight *
225 Weight::create(const string & s, const Registry & reg)
226 {
227  const char *p = s.c_str();
228  std::string scheme;
229 
230  while (*p != ' ') {
231  if (*p == '\0') break;
232  scheme += *p;
233  p++;
234  }
235 
236  if (*p == ' ') p++;
237  auto weight = reg.get_weighting_scheme(scheme);
238  if (!weight) {
239  // Allow "trad" and "trad <k>" to work despite TradWeight now just
240  // being a thin subclass of BM25Weight.
241  if (scheme == "trad") {
242  const char* params = p;
243  double k = 1.0;
244  if (*p != '\0') {
246  parameter_error("Parameter is invalid", scheme, params);
247  if (*p)
248  parameter_error("Extra data after parameter",
249  scheme, params);
250  }
251  return new BM25Weight(k, 0.0, 0.0, 1.0, 0.0);
252  }
253  throw InvalidArgumentError("Unknown weighting scheme: " + scheme);
254  }
255  return weight->create_from_parameters(p);
256 }
257 
258 Weight *
259 Weight::create_from_parameters(const char *) const
260 {
261  throw Xapian::UnimplementedError("create_from_parameters() not supported for this Xapian::Weight subclass");
262 }
263 
264 }
char name[9]
Definition: dbcheck.cc:57
Abstract base class for leaf postlists.
Definition: leafpostlist.h:40
virtual Xapian::termcount get_wdf_upper_bound() const =0
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:1050
Virtual base class for Database internals.
virtual termcount get_unique_terms_lower_bound() const
Get a lower bound on the unique terms size of a document in this DB.
virtual termcount get_doclength_upper_bound() const =0
Get an upper bound on the length of a document in this DB.
virtual termcount get_unique_terms_upper_bound() const
Get an upper bound on the unique terms size of a document in this DB.
virtual termcount get_doclength_lower_bound() const =0
Get a lower bound on the length of a document in this DB.
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Registry for user subclasses.
Definition: registry.h:47
const Xapian::Weight * get_weighting_scheme(std::string_view name) const
Get the weighting scheme given a name.
Definition: registry.cc:317
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Class to hold statistics for a given collection.
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
Xapian::totallength total_length
Total length of all documents in the collection.
bool get_stats(std::string_view term, Xapian::doccount &termfreq, Xapian::doccount &reltermfreq, Xapian::termcount &collfreq) const
Get the frequencies for the given term.
Xapian::termcount db_doclength_upper_bound
An upper bound on the maximum length of any document in the database.
Xapian::termcount db_doclength_lower_bound
A lower bound on the minimum length of any document in the database.
Xapian::termcount db_unique_terms_lower_bound
A lower bound on the number of unique terms in any document.
Xapian::doccount rset_size
Number of relevant documents in the collection.
Xapian::doccount collection_size
Number of documents in the collection.
static bool double_param(const char **p, double *ptr_val)
Xapian::doclength get_average_length() const
Xapian::termcount db_unique_terms_upper_bound
An upper bound on the number of unique terms in any document.
Abstract base class for weighting schemes.
Definition: weight.h:38
string term
PositionList * p
Debug logging macros.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Hierarchy of classes which Xapian can throw as exceptions.
Abstract base class for leaf postlists.
static void parameter_error(const char *message, const std::string &scheme, const char *params)
Definition: lmweight.cc:41
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.