xapian-core  2.0.0
weightinternal.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007 Lemur Consulting Ltd
5  * Copyright (C) 2009,2010,2011,2013,2014,2015,2020,2024 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
24 
25 #include "xapian/weight.h"
26 
27 #include "xapian/database.h"
28 #include "xapian/error.h"
29 #include "xapian/query.h"
30 
32 #include "internaltypes.h"
33 #include "omassert.h"
34 #include "stringutils.h"
35 
36 #include <algorithm>
37 #include <cerrno>
38 #include <cstdlib>
39 #include <functional>
40 #include <map>
41 #include <string>
42 #include <string_view>
43 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
44 # include <cstring>
45 # include <charconv>
46 #endif
47 
48 namespace Xapian {
49 
50 namespace Internal {
51 
53 struct TermFreqs {
57 
58  double max_part = 0.0;
59 
60  TermFreqs() {}
62  Xapian::doccount reltermfreq_,
63  Xapian::termcount collfreq_,
64  double max_part_ = 0.0)
65  : termfreq(termfreq_),
66  reltermfreq(reltermfreq_),
67  collfreq(collfreq_),
68  max_part(max_part_) {}
69 
70  void operator+=(const TermFreqs & other) {
71  termfreq += other.termfreq;
72  reltermfreq += other.reltermfreq;
73  collfreq += other.collfreq;
74  // max_part shouldn't be set yet.
75  Assert(max_part == 0.0);
76  Assert(other.max_part == 0.0);
77  }
78 
79  void operator*=(double factor) {
80  termfreq = Xapian::doccount(termfreq * factor + 0.5);
81  reltermfreq = Xapian::doccount(reltermfreq * factor + 0.5);
82  collfreq = Xapian::termcount(collfreq * factor + 0.5);
83  }
84 
85  void operator/=(unsigned x) {
86  termfreq /= x;
87  reltermfreq /= x;
88  collfreq /= x;
89  }
90 
92  std::string get_description() const;
93 };
94 
95 }
96 
97 }
98 
100 
101 namespace Xapian {
102 
103 class RSet;
104 
107 #ifdef XAPIAN_ASSERTIONS
109  size_t subdbs = 0;
110 
115  mutable bool finalised = false;
116 #endif
117 
118  public:
121 
124 
127 
130 
133 
136 
139 
144  bool have_max_part = false;
145 
148 
151  std::map<std::string, TermFreqs, std::less<>> termfreqs;
152 
153  Internal() { }
154 
160  Internal & operator+=(const Internal & inc);
161 
162  void merge(const Weight::Internal& o);
163 
164  void set_query(const Xapian::Query &query_) {
165  AssertEq(subdbs, 0);
166  query = query_;
167  }
168 
170  void accumulate_stats(const Xapian::Database::Internal &sub_db,
171  const Xapian::RSet &rset);
172 
184  bool get_stats(std::string_view term,
185  Xapian::doccount & termfreq,
186  Xapian::doccount & reltermfreq,
187  Xapian::termcount & collfreq) const {
188 #ifdef XAPIAN_ASSERTIONS
189  finalised = true;
190 #endif
191  // We pass an empty std::string for term when calculating the extra
192  // weight.
193  if (term.empty()) {
194  termfreq = collection_size;
195  collfreq = collection_size;
196  reltermfreq = rset_size;
197  return true;
198  }
199 
200  auto i = termfreqs.find(term);
201  if (i == termfreqs.end()) {
202  termfreq = reltermfreq = collfreq = 0;
203  return false;
204  }
205 
206  termfreq = i->second.termfreq;
207  reltermfreq = i->second.reltermfreq;
208  collfreq = i->second.collfreq;
209  return true;
210  }
211 
213  bool get_stats(std::string_view term,
214  Xapian::doccount & termfreq) const {
215  Xapian::doccount dummy1;
216  Xapian::termcount dummy2;
217  return get_stats(term, termfreq, dummy1, dummy2);
218  }
219 
221  bool get_termweight(std::string_view term, double& termweight) const {
222 #ifdef XAPIAN_ASSERTIONS
223  finalised = true;
224 #endif
225  termweight = 0.0;
226  if (term.empty()) {
227  return false;
228  }
229 
230  auto i = termfreqs.find(term);
231  if (i == termfreqs.end()) {
232  return false;
233  }
234 
235  termweight = i->second.max_part;
236  return true;
237  }
238 
243  void get_max_termweight(double & min_tw, double & max_tw) {
244  auto i = termfreqs.begin();
245  while (i != termfreqs.end() && i->second.max_part == 0.0) ++i;
246  if (rare(i == termfreqs.end())) {
247  min_tw = max_tw = 0.0;
248  return;
249  }
250  min_tw = max_tw = i->second.max_part;
251  while (++i != termfreqs.end()) {
252  double max_part = i->second.max_part;
253  if (max_part > max_tw) {
254  max_tw = max_part;
255  } else if (max_part < min_tw && max_part != 0.0) {
256  min_tw = max_part;
257  }
258  }
259  }
260 
262  void set_max_part(const std::string & term, double max_part) {
263  Assert(!term.empty());
264  auto i = termfreqs.find(term);
265  if (i != termfreqs.end()) {
266  have_max_part = true;
267  double& val = i->second.max_part;
268  val = std::max(val, max_part);
269  }
270  }
271 
273 #ifdef XAPIAN_ASSERTIONS
274  finalised = true;
275 #endif
276  // We shortcut an empty shard and avoid creating a postlist tree for
277  // it, and all shards must be empty for collection_size to be zero.
280  }
281 
283  std::string get_description() const;
284 
/** Parse a double from the start of the string @a *p.
 *
 *  Leading whitespace is skipped (by strtod() itself, or explicitly when
 *  std::from_chars() is used).
 *
 *  @param p        In: points to the text to parse.  Out: on success,
 *                  advanced to just past the parsed number; unchanged on
 *                  failure.
 *  @param ptr_val  On success, receives the parsed value; unchanged on
 *                  failure.
 *
 *  @return true if a number was parsed, false if there was no number or
 *          the conversion reported an error.
 */
static bool double_param(const char ** p, double * ptr_val) {
#ifdef HAVE_STD_FROM_CHARS_DOUBLE
    // Unlike strtod(), std::from_chars() doesn't skip leading whitespace.
    const char* first = *p;
    while (C_isspace(*first)) ++first;
    const char* const last = first + std::strlen(first);
    double parsed;
    const std::from_chars_result res = std::from_chars(first, last, parsed);
    if (res.ec != std::errc()) {
        return false;
    }
    *p = res.ptr;
    *ptr_val = parsed;
    return true;
#else
    char* stop;
    // Clear errno first so an error reported by strtod() can be detected.
    errno = 0;
    const double parsed = std::strtod(*p, &stop);
    if (stop == *p || errno) {
        return false;
    }
    *p = stop;
    *ptr_val = parsed;
    return true;
#endif
}
308 
/** Read a parameter name from the start of the string @a *p.
 *
 *  The name is the run of characters up to (but not including) the first
 *  space or the end of the string.
 *
 *  @param p     In: points to the text to read.  Out: on success,
 *               advanced past the name and past one following space if
 *               there is one; unchanged on failure.
 *  @param name  The name read is appended to this string (any existing
 *               contents are kept).
 *
 *  @return false if the name would be empty (the text is empty or starts
 *          with a space), true otherwise.
 */
static bool param_name(const char** p, std::string& name) {
    const char* const start = *p;
    const char* stop = start;
    while (*stop != ' ' && *stop != '\0') {
        ++stop;
    }
    if (stop == start) {
        return false;
    }
    name.append(start, static_cast<std::string::size_type>(stop - start));
    // Step over the single separating space, if present.
    *p = (*stop == ' ') ? stop + 1 : stop;
    return true;
}
320 
321  [[noreturn]]
322  static void parameter_error(const char* msg,
323  const std::string& scheme,
324  const char* params) {
325  std::string m(msg);
326  m += ": '";
327  m += scheme;
328  if (*params) {
329  m += ' ';
330  m += params;
331  }
332  m += "'";
333  throw InvalidArgumentError(m);
334  }
335 };
336 
337 }
338 
339 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H
Virtual base class for Database internals.
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Class representing a query.
Definition: query.h:45
Class representing a set of documents judged as relevant.
Definition: rset.h:39
Class to hold statistics for a given collection.
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
Xapian::totallength total_length
Total length of all documents in the collection.
Xapian::Query query
The query.
bool get_stats(std::string_view term, Xapian::doccount &termfreq, Xapian::doccount &reltermfreq, Xapian::termcount &collfreq) const
Get the frequencies for the given term.
std::string get_description() const
Return a std::string describing this object.
static bool param_name(const char **p, std::string &name)
Xapian::termcount db_doclength_upper_bound
An upper bound on the maximum length of any document in the database.
bool have_max_part
Has max_part been set for any term?
bool get_termweight(std::string_view term, double &termweight) const
Get the termweight.
Xapian::termcount db_doclength_lower_bound
A lower bound on the minimum length of any document in the database.
void get_max_termweight(double &min_tw, double &max_tw)
Get the minimum and maximum termweights.
void merge(const Weight::Internal &o)
Xapian::termcount db_unique_terms_lower_bound
A lower bound on the number of unique terms in any document.
Xapian::doccount rset_size
Number of relevant documents in the collection.
Xapian::doccount collection_size
Number of documents in the collection.
static bool double_param(const char **p, double *ptr_val)
bool get_stats(std::string_view term, Xapian::doccount &termfreq) const
Get just the termfreq.
Xapian::doclength get_average_length() const
void set_max_part(const std::string &term, double max_part)
Set max_part for a term.
void accumulate_stats(const Xapian::Database::Internal &sub_db, const Xapian::RSet &rset)
Accumulate the rtermfreqs for terms in the query.
std::map< std::string, TermFreqs, std::less<> > termfreqs
Map of term frequencies and relevant term frequencies for the collection.
Internal & operator+=(const Internal &inc)
Add in the supplied statistics from a sub-database.
void set_query(const Xapian::Query &query_)
Xapian::termcount db_unique_terms_upper_bound
An upper bound on the number of unique terms in any document.
#define rare(COND)
Definition: config.h:607
An indexed database of documents.
string term
PositionList * p
Virtual base class for Database internals.
Hierarchy of classes which Xapian can throw as exceptions.
Types used internally.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
double doclength
A normalised document length.
Definition: types.h:58
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
#define Assert(COND)
Definition: omassert.h:122
Xapian::Query API class.
Various handy string-related helpers.
bool C_isspace(char ch)
Definition: stringutils.h:213
The frequencies for a term.
Xapian::doccount reltermfreq
void operator+=(const TermFreqs &other)
std::string get_description() const
Return a std::string describing this object.
TermFreqs(Xapian::doccount termfreq_, Xapian::doccount reltermfreq_, Xapian::termcount collfreq_, double max_part_=0.0)
void operator*=(double factor)
void operator/=(unsigned x)
Xapian::termcount collfreq
Definition: header.h:215
Weighting scheme API.