xapian-core  2.0.0
postingsource.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2008-2024 Olly Betts
5  * Copyright (C) 2008,2009 Lemur Consulting Ltd
6  * Copyright (C) 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/postingsource.h"
26 
29 #include "matcher/postlisttree.h"
30 
31 #include "xapian/document.h"
32 #include "xapian/error.h"
33 #include "xapian/queryparser.h" // For sortable_unserialise().
34 
35 #include "omassert.h"
36 #include "pack.h"
37 #include "serialise-double.h"
38 #include "str.h"
39 
40 #include <cfloat>
41 #include <memory>
42 
43 using namespace std;
44 
45 namespace Xapian {
46 
47 PostingSource::~PostingSource() { }
48 
49 double
50 PostingSource::get_weight() const
51 {
52  return 0;
53 }
54 
55 void
56 PostingSource::skip_to(Xapian::docid did, double min_wt)
57 {
58  while (!at_end() && get_docid() < did) {
59  next(min_wt);
60  }
61 }
62 
63 bool
64 PostingSource::check(Xapian::docid did, double min_wt)
65 {
66  skip_to(did, min_wt);
67  return true;
68 }
69 
71 PostingSource::clone() const
72 {
73  return NULL;
74 }
75 
76 string
78 {
79  return string();
80 }
81 
82 string
83 PostingSource::serialise() const
84 {
85  throw Xapian::UnimplementedError("serialise() not supported for this PostingSource");
86 }
87 
89 PostingSource::unserialise(const string &) const
90 {
91  throw Xapian::UnimplementedError("unserialise() not supported for this PostingSource");
92 }
93 
95 PostingSource::unserialise_with_registry(const std::string &s,
96  const Registry &) const
97 {
98  return unserialise(s);
99 }
100 
101 void
102 PostingSource::reset(const Database& db, Xapian::doccount)
103 {
104  init(db);
105 }
106 
107 void
108 PostingSource::init(const Database&)
109 {
110  const char* msg = "Either PostingSource::reset() or PostingSource::init() "
111  "must be overridden";
113 }
114 
115 string
116 PostingSource::get_description() const
117 {
118  return "Xapian::PostingSource subclass";
119 }
120 
122 ValuePostingSource::get_termfreq_min() const
123 {
124  return termfreq_min;
125 }
126 
128 ValuePostingSource::get_termfreq_est() const
129 {
130  return termfreq_est;
131 }
132 
134 ValuePostingSource::get_termfreq_max() const
135 {
136  return termfreq_max;
137 }
138 
139 void
140 ValuePostingSource::next(double min_wt)
141 {
142  if (!started) {
143  started = true;
144  value_it = db.valuestream_begin(slot);
145  } else {
146  ++value_it;
147  }
148 
149  if (value_it == db.valuestream_end(slot)) return;
150 
151  if (min_wt > get_maxweight()) {
152  value_it = db.valuestream_end(slot);
153  return;
154  }
155 }
156 
157 void
158 ValuePostingSource::skip_to(Xapian::docid min_docid, double min_wt)
159 {
160  if (!started) {
161  started = true;
162  value_it = db.valuestream_begin(slot);
163 
164  if (value_it == db.valuestream_end(slot)) return;
165  }
166 
167  if (min_wt > get_maxweight()) {
168  value_it = db.valuestream_end(slot);
169  return;
170  }
171  value_it.skip_to(min_docid);
172 }
173 
174 bool
175 ValuePostingSource::check(Xapian::docid min_docid, double min_wt)
176 {
177  if (!started) {
178  started = true;
179  value_it = db.valuestream_begin(slot);
180 
181  if (value_it == db.valuestream_end(slot)) return true;
182  }
183 
184  if (min_wt > get_maxweight()) {
185  value_it = db.valuestream_end(slot);
186  return true;
187  }
188  return value_it.check(min_docid);
189 }
190 
191 bool
192 ValuePostingSource::at_end() const
193 {
194  return started && value_it == db.valuestream_end(slot);
195 }
196 
198 ValuePostingSource::get_docid() const
199 {
200  return value_it.get_docid();
201 }
202 
203 void
204 ValuePostingSource::reset(const Database& db_, Xapian::doccount)
205 {
206  db = db_;
207  started = false;
208  set_maxweight(DBL_MAX);
209  termfreq_max = db.get_value_freq(slot);
210  termfreq_est = termfreq_max;
211  termfreq_min = termfreq_max;
212 }
213 
214 string
215 ValuePostingSource::get_description() const
216 {
217  string desc("Xapian::ValuePostingSource(slot=");
218  desc += str(get_slot());
219  desc += ")";
220  return desc;
221 }
222 
223 
224 ValueWeightPostingSource::ValueWeightPostingSource(Xapian::valueno slot_)
225  : ValuePostingSource(slot_)
226 {
227 }
228 
229 double
231 {
232  Assert(!at_end());
233  Assert(get_started());
235 }
236 
239 {
240  return new ValueWeightPostingSource(get_slot());
241 }
242 
243 string
245 {
246  return string("Xapian::ValueWeightPostingSource");
247 }
248 
249 string
251 {
252  string result;
253  pack_uint_last(result, get_slot());
254  return result;
255 }
256 
259 {
260  const char * p = s.data();
261  const char * end = p + s.size();
262 
263  Xapian::valueno new_slot;
264  if (!unpack_uint_last(&p, end, &new_slot)) {
266  }
267 
268  return new ValueWeightPostingSource(new_slot);
269 }
270 
271 void
273  Xapian::doccount shard_index)
274 {
275  ValuePostingSource::reset(db_, shard_index);
276 
277  string upper_bound = get_database().get_value_upper_bound(get_slot());
278  if (upper_bound.empty()) {
279  // This should only happen if there are no entries, in which case the
280  // maxweight is 0.
281  set_maxweight(0.0);
282  } else {
283  set_maxweight(sortable_unserialise(upper_bound));
284  }
285 }
286 
287 string
289 {
290  string desc("Xapian::ValueWeightPostingSource(slot=");
291  desc += str(get_slot());
292  desc += ")";
293  return desc;
294 }
295 
296 
298  : ValuePostingSource(slot_),
299  default_weight(0.0),
300  max_weight_in_map(0.0)
301 {
302 }
303 
304 void
305 ValueMapPostingSource::add_mapping(const string & key, double wt)
306 {
307  weight_map[key] = wt;
309 }
310 
311 void
313 {
314  weight_map.clear();
315  max_weight_in_map = 0.0;
316 }
317 
318 void
320 {
321  default_weight = wt;
322 }
323 
324 double
326 {
327  map<string, double>::const_iterator wit = weight_map.find(get_value());
328  if (wit == weight_map.end()) {
329  return default_weight;
330  }
331  return wit->second;
332 }
333 
336 {
337  unique_ptr<ValueMapPostingSource> res(
339  map<string, double>::const_iterator i;
340  for (i = weight_map.begin(); i != weight_map.end(); ++i) {
341  res->add_mapping(i->first, i->second);
342  }
343  res->set_default_weight(default_weight);
344  return res.release();
345 }
346 
347 string
349 {
350  return string("Xapian::ValueMapPostingSource");
351 }
352 
353 string
355 {
356  string result;
357  pack_uint(result, get_slot());
358  result += serialise_double(default_weight);
359 
360  map<string, double>::const_iterator i;
361  for (i = weight_map.begin(); i != weight_map.end(); ++i) {
362  pack_string(result, i->first);
363  result.append(serialise_double(i->second));
364  }
365 
366  return result;
367 }
368 
371 {
372  const char * p = s.data();
373  const char * end = p + s.size();
374 
375  Xapian::valueno new_slot;
376  if (!unpack_uint(&p, end, &new_slot)) {
378  }
379  unique_ptr<ValueMapPostingSource> res(new ValueMapPostingSource(new_slot));
380  res->set_default_weight(unserialise_double(&p, end));
381  while (p != end) {
382  string key;
383  if (!unpack_string(&p, end, key)) {
385  }
386  res->add_mapping(key, unserialise_double(&p, end));
387  }
388  return res.release();
389 }
390 
391 void
393 {
394  ValuePostingSource::reset(db_, shard_index);
396 }
397 
398 string
400 {
401  string desc("Xapian::ValueMapPostingSource(slot=");
402  desc += str(get_slot());
403  desc += ")";
404  return desc;
405 }
406 
408  : started(false)
409 {
410  // The weight is fixed at wt, so that's the maxweight too. So just store wt
411  // as the maxweight and we can read it from there when we need it.
412  set_maxweight(wt);
413 }
414 
417 {
418  return termfreq;
419 }
420 
423 {
424  return termfreq;
425 }
426 
429 {
430  return termfreq;
431 }
432 
433 double
435 {
436  return get_maxweight();
437 }
438 
439 void
441 {
442  if (!started) {
443  started = true;
444  it = db.postlist_begin(string_view());
445  } else {
446  ++it;
447  }
448 
449  if (it == db.postlist_end(string_view())) return;
450 
451  if (check_docid) {
452  it.skip_to(check_docid + 1);
453  check_docid = 0;
454  }
455 
456  if (min_wt > get_maxweight()) {
457  it = db.postlist_end(string_view());
458  }
459 }
460 
461 void
463 {
464  if (!started) {
465  started = true;
466  it = db.postlist_begin(string_view());
467 
468  if (it == db.postlist_end(string_view())) return;
469  }
470 
471  if (check_docid) {
472  if (min_docid < check_docid)
473  min_docid = check_docid + 1;
474  check_docid = 0;
475  }
476 
477  if (min_wt > get_maxweight()) {
478  it = db.postlist_end(string_view());
479  return;
480  }
481  it.skip_to(min_docid);
482 }
483 
484 bool
486 {
487  // We're guaranteed not to be called if the document doesn't
488  // exist, so just remember the docid passed, and return true.
489  check_docid = min_docid;
490  return true;
491 }
492 
493 bool
495 {
496  if (check_docid != 0) return false;
497  return started && it == db.postlist_end(string_view());
498 }
499 
502 {
503  if (check_docid != 0) return check_docid;
504  return *it;
505 }
506 
509 {
511 }
512 
513 string
515 {
516  return string("Xapian::FixedWeightPostingSource");
517 }
518 
519 string
521 {
523 }
524 
527 {
528  const char * p = s.data();
529  const char * s_end = p + s.size();
530  double new_wt = unserialise_double(&p, s_end);
531  if (p != s_end) {
532  throw Xapian::NetworkError("Bad serialised FixedWeightPostingSource - junk at end");
533  }
534  return new FixedWeightPostingSource(new_wt);
535 }
536 
537 void
539 {
540  db = db_;
541  termfreq = db_.get_doccount();
542  started = false;
543  check_docid = 0;
544 }
545 
546 string
548 {
549  string desc("Xapian::FixedWeightPostingSource(wt=");
550  desc += str(get_maxweight());
551  desc += ")";
552  return desc;
553 }
554 
555 }
char name[9]
Definition: dbcheck.cc:57
An indexed database of documents.
Definition: database.h:75
PostingIterator postlist_begin(std::string_view term) const
Start iterating the postings of a term.
Definition: database.cc:192
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
Definition: database.cc:296
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: database.cc:233
PostingIterator postlist_end(std::string_view) const noexcept
End iterator corresponding to postlist_begin().
Definition: database.h:258
A posting source which returns a fixed weight for all documents.
Xapian::doccount get_termfreq_est() const override
An estimate of the number of documents this object can return.
Xapian::PostingIterator it
Iterator over all documents.
std::string get_description() const override
Return a string describing this object.
Xapian::docid get_docid() const override
Return the current docid.
std::string name() const override
Name of the posting source class.
std::string serialise() const override
Serialise object parameters into a string.
FixedWeightPostingSource * unserialise(const std::string &serialised) const override
Create object given string serialisation returned by serialise().
void next(double min_wt) override
Advance the current position to the next matching document.
double get_weight() const override
Return the weight contribution for the current document.
Xapian::doccount get_termfreq_min() const override
A lower bound on the number of documents this object can return.
FixedWeightPostingSource(double wt)
Construct a FixedWeightPostingSource.
bool started
Flag indicating if we've started (true if we have).
bool at_end() const override
Return true if the current position is past the last entry in this list.
void reset(const Database &db_, Xapian::doccount shard_index) override
Set this PostingSource to the start of the list of postings.
Xapian::doccount get_termfreq_max() const override
An upper bound on the number of documents this object can return.
Xapian::docid check_docid
The docid last passed to check() (0 if check() wasn't the last move).
Xapian::doccount termfreq
Number of documents in the posting source.
void skip_to(Xapian::docid min_docid, double min_wt) override
Advance to the specified docid.
bool check(Xapian::docid min_docid, double min_wt) override
Check if the specified docid occurs.
FixedWeightPostingSource * clone() const override
Clone the posting source.
Xapian::Database db
The database we're reading documents from.
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
Indicates a problem communicating with a remote database.
Definition: error.h:791
void skip_to(Xapian::docid did)
Advance the iterator to document did.
Base class which provides an "external" source of postings.
Definition: postingsource.h:47
void set_maxweight(double max_weight)
Specify an upper bound on what get_weight() will return from now on.
double get_maxweight() const noexcept
Return the currently set upper bound on what get_weight() can return.
Registry for user subclasses.
Definition: registry.h:47
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
A posting source which looks up weights in a map using values as the key.
void add_mapping(const std::string &key, double wt)
Add a mapping.
ValueMapPostingSource * clone() const override
Clone the posting source.
void reset(const Database &db_, Xapian::doccount shard_index) override
Set this PostingSource to the start of the list of postings.
std::map< std::string, double > weight_map
The value -> weight map.
std::string name() const override
Name of the posting source class.
double max_weight_in_map
The maximum weight in weight_map.
void clear_mappings()
Clear all mappings.
std::string serialise() const override
Serialise object parameters into a string.
void set_default_weight(double wt)
Set a default weight for document values not in the map.
std::string get_description() const override
Return a string describing this object.
double get_weight() const override
Return the weight contribution for the current document.
ValueMapPostingSource * unserialise(const std::string &serialised) const override
Create object given string serialisation returned by serialise().
double default_weight
The default weight.
ValueMapPostingSource(Xapian::valueno slot_)
Construct a ValueMapPostingSource.
A posting source which generates weights from a value slot.
bool at_end() const
Return true if the current position is past the last entry in this list.
void reset(const Database &db_, Xapian::doccount shard_index)
Set this PostingSource to the start of the list of postings.
Xapian::Database get_database() const
The database we're reading values from.
bool get_started() const
Flag indicating if we've started (true if we have).
std::string get_value() const
Read current value.
Xapian::valueno get_slot() const
The slot we're reading values from.
A posting source which reads weights from a value slot.
std::string name() const
Name of the posting source class.
void reset(const Database &db_, Xapian::doccount shard_index)
Set this PostingSource to the start of the list of postings.
ValueWeightPostingSource(Xapian::valueno slot_)
Construct a ValueWeightPostingSource.
std::string serialise() const
Serialise object parameters into a string.
double get_weight() const
Return the weight contribution for the current document.
ValueWeightPostingSource * clone() const
Clone the posting source.
std::string get_description() const
Return a string describing this object.
ValueWeightPostingSource * unserialise(const std::string &serialised) const
Create object given string serialisation returned by serialise().
PositionList * p
Virtual base class for Database internals.
Class representing a document.
Abstract base class for a document.
Hierarchy of classes which Xapian can throw as exceptions.
#define false
Definition: header.h:9
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
double sortable_unserialise(std::string_view serialised) noexcept
Convert a string encoded using sortable_serialise back to a floating point number.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
void unpack_throw_serialisation_error(const char *p)
Throw appropriate SerialisationError.
Definition: pack.cc:29
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:100
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
void pack_string(std::string &s, std::string_view value)
Append an encoded std::string to a string.
Definition: pack.h:442
External sources of posting information.
Class for managing a tree of PostList objects.
parsing a user query string to build a Xapian::Query object
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Convert types to std::string.