xapian-core  1.4.19
postingsource.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2008,2009,2010,2011,2012,2015,2016 Olly Betts
5  * Copyright (C) 2008,2009 Lemur Consulting Ltd
6  * Copyright (C) 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 // We need to be able to set deprecated members of ValuePostingSource.
26 #define XAPIAN_DEPRECATED(X) X
27 #include "xapian/postingsource.h"
28 
29 #include "autoptr.h"
30 
31 #include "backends/database.h"
32 #include "backends/document.h"
33 #include "matcher/multimatch.h"
34 
35 #include "xapian/document.h"
36 #include "xapian/error.h"
37 #include "xapian/queryparser.h" // For sortable_unserialise().
38 
39 #include "omassert.h"
40 #include "net/length.h"
41 #include "serialise-double.h"
42 #include "str.h"
43 
44 #include <cfloat>
45 
46 using namespace std;
47 
48 namespace Xapian {
49 
// Out-of-line definition of the PostingSource destructor; the body is empty.
50 PostingSource::~PostingSource() { }
51 
// Specify an upper bound on what get_weight() will return from now on.
//
// If a matcher is attached (matcher_ is non-null), it is asked to recalculate
// its maximum weight; the new bound is then stored in max_weight_.
52 void
53 PostingSource::set_maxweight(double max_weight)
54 {
    // usual() is a macro from config.h - presumably a branch-prediction hint
    // marking the "matcher attached" case as the common one (TODO confirm).
55  if (usual(matcher_)) {
56  MultiMatch * multimatch = static_cast<MultiMatch*>(matcher_);
57  multimatch->recalc_maxweight();
58  }
    // Note: the matcher is notified before the stored bound is updated.
59  max_weight_ = max_weight;
60 }
61 
// Return the weight contribution for the current document.
//
// This base-class implementation always returns 0.
62 double
63 PostingSource::get_weight() const
64 {
65  return 0;
66 }
67 
// Advance to the specified docid.
//
// Default implementation: repeatedly calls next(min_wt) until either
// at_end() is true or get_docid() is no longer less than did.
68 void
69 PostingSource::skip_to(Xapian::docid did, double min_wt)
70 {
71  while (!at_end() && get_docid() < did) {
72  next(min_wt);
73  }
74 }
75 
// Check if the specified docid occurs.
//
// Default implementation: forwards to skip_to(did, min_wt) and then
// unconditionally returns true.
76 bool
77 PostingSource::check(Xapian::docid did, double min_wt)
78 {
79  skip_to(did, min_wt);
80  return true;
81 }
82 
84 PostingSource::clone() const
85 {
86  return NULL;
87 }
88 
89 string
91 {
92  return string();
93 }
94 
// Serialise object parameters into a string.
//
// This base-class implementation never returns: it always throws
// Xapian::UnimplementedError.
95 string
96 PostingSource::serialise() const
97 {
98  throw Xapian::UnimplementedError("serialise() not supported for this PostingSource");
99 }
100 
102 PostingSource::unserialise(const string &) const
103 {
104  throw Xapian::UnimplementedError("unserialise() not supported for this PostingSource");
105 }
106 
108 PostingSource::unserialise_with_registry(const std::string &s,
109  const Registry &) const
110 {
111  return unserialise(s);
112 }
113 
// Return a string describing this object.
//
// This base-class implementation returns a fixed, generic description.
114 string
115 PostingSource::get_description() const
116 {
117  return "Xapian::PostingSource subclass";
118 }
119 
120 
// Construct a ValuePostingSource for value slot slot_.
//
// real_slot is initialised from slot_; the remaining initialisers set the
// legacy-named members (db, slot, value_it, started, termfreq_min,
// termfreq_est, termfreq_max) from their real_* counterparts.
// NOTE(review): presumably the legacy-named members are references aliasing
// the real_* members (kept for the deprecated public API) - confirm against
// xapian/postingsource.h.
121 ValuePostingSource::ValuePostingSource(Xapian::valueno slot_)
122  : real_slot(slot_),
123  db(real_db),
124  slot(real_slot),
125  value_it(real_value_it),
126  started(real_started),
127  termfreq_min(real_termfreq_min),
128  termfreq_est(real_termfreq_est),
129  termfreq_max(real_termfreq_max)
130 {
131 }
132 
135 {
136  return real_termfreq_min;
137 }
138 
141 {
142  return real_termfreq_est;
143 }
144 
147 {
148  return real_termfreq_max;
149 }
150 
151 void
153 {
154  if (!real_started) {
155  real_started = true;
157  } else {
158  ++real_value_it;
159  }
160 
162 
163  if (min_wt > get_maxweight()) {
165  return;
166  }
167 }
168 
169 void
171 {
172  if (!real_started) {
173  real_started = true;
175 
177  }
178 
179  if (min_wt > get_maxweight()) {
181  return;
182  }
183  real_value_it.skip_to(min_docid);
184 }
185 
186 bool
187 ValuePostingSource::check(Xapian::docid min_docid, double min_wt)
188 {
189  if (!real_started) {
190  real_started = true;
192 
193  if (real_value_it == real_db.valuestream_end(real_slot)) return true;
194  }
195 
196  if (min_wt > get_maxweight()) {
198  return true;
199  }
200  return real_value_it.check(min_docid);
201 }
202 
203 bool
205 {
207 }
208 
211 {
212  return real_value_it.get_docid();
213 }
214 
215 void
217 {
218  real_db = db_;
219  real_started = false;
220  set_maxweight(DBL_MAX);
221  try {
225  } catch (const Xapian::UnimplementedError &) {
228  real_termfreq_min = 0;
229  }
230 }
231 
232 
234  : ValuePostingSource(slot_)
235 {
236 }
237 
238 double
240 {
241  Assert(!at_end());
242  Assert(get_started());
244 }
245 
248 {
249  return new ValueWeightPostingSource(get_slot());
250 }
251 
252 string
254 {
255  return string("Xapian::ValueWeightPostingSource");
256 }
257 
258 string
260 {
261  return encode_length(get_slot());
262 }
263 
266 {
267  const char * p = s.data();
268  const char * end = p + s.size();
269 
270  Xapian::valueno new_slot;
271  decode_length(&p, end, new_slot);
272  if (p != end) {
273  throw Xapian::NetworkError("Bad serialised ValueWeightPostingSource - junk at end");
274  }
275 
276  return new ValueWeightPostingSource(new_slot);
277 }
278 
279 void
281 {
283 
284  string upper_bound;
285  try {
286  upper_bound = get_database().get_value_upper_bound(get_slot());
287  } catch (const Xapian::UnimplementedError &) {
288  // ValuePostingSource::init() set the maxweight to DBL_MAX.
289  return;
290  }
291 
292  if (upper_bound.empty()) {
293  // This should only happen if there are no entries, in which case the
294  // maxweight is 0.
295  set_maxweight(0.0);
296  } else {
297  set_maxweight(sortable_unserialise(upper_bound));
298  }
299 }
300 
301 string
303 {
304  string desc("Xapian::ValueWeightPostingSource(slot=");
305  desc += str(get_slot());
306  desc += ")";
307  return desc;
308 }
309 
310 
312  : ValuePostingSource(slot_),
313  default_weight(0.0),
314  max_weight_in_map(0.0)
315 {
316 }
317 
318 void
319 ValueMapPostingSource::add_mapping(const string & key, double wt)
320 {
321  weight_map[key] = wt;
323 }
324 
325 void
327 {
328  weight_map.clear();
329  max_weight_in_map = 0.0;
330 }
331 
332 void
334 {
335  default_weight = wt;
336 }
337 
338 double
340 {
341  map<string, double>::const_iterator wit = weight_map.find(get_value());
342  if (wit == weight_map.end()) {
343  return default_weight;
344  }
345  return wit->second;
346 }
347 
350 {
351  AutoPtr<ValueMapPostingSource> res(new ValueMapPostingSource(get_slot()));
352  map<string, double>::const_iterator i;
353  for (i = weight_map.begin(); i != weight_map.end(); ++i) {
354  res->add_mapping(i->first, i->second);
355  }
356  res->set_default_weight(default_weight);
357  return res.release();
358 }
359 
360 string
362 {
363  return string("Xapian::ValueMapPostingSource");
364 }
365 
366 string
368 {
369  string result = encode_length(get_slot());
370  result += serialise_double(default_weight);
371 
372  map<string, double>::const_iterator i;
373  for (i = weight_map.begin(); i != weight_map.end(); ++i) {
374  result.append(encode_length(i->first.size()));
375  result.append(i->first);
376  result.append(serialise_double(i->second));
377  }
378 
379  return result;
380 }
381 
384 {
385  const char * p = s.data();
386  const char * end = p + s.size();
387 
388  Xapian::valueno new_slot;
389  decode_length(&p, end, new_slot);
390  AutoPtr<ValueMapPostingSource> res(new ValueMapPostingSource(new_slot));
391  res->set_default_weight(unserialise_double(&p, end));
392  while (p != end) {
393  size_t keylen;
394  decode_length_and_check(&p, end, keylen);
395  string key(p, keylen);
396  p += keylen;
397  res->add_mapping(key, unserialise_double(&p, end));
398  }
399  return res.release();
400 }
401 
402 void
404 {
407 }
408 
409 string
411 {
412  string desc("Xapian::ValueMapPostingSource(slot=");
413  desc += str(get_slot());
414  desc += ")";
415  return desc;
416 }
417 
419  : started(false)
420 {
421  // The weight is fixed at wt, so that's the maxweight too. So just store wt
422  // as the maxweight and we can read it from there when we need it.
423  set_maxweight(wt);
424 }
425 
428 {
429  return termfreq;
430 }
431 
434 {
435  return termfreq;
436 }
437 
440 {
441  return termfreq;
442 }
443 
444 double
446 {
447  return get_maxweight();
448 }
449 
450 void
452 {
453  if (!started) {
454  started = true;
455  it = db.postlist_begin(string());
456  } else {
457  ++it;
458  }
459 
460  if (it == db.postlist_end(string())) return;
461 
462  if (check_docid) {
463  it.skip_to(check_docid + 1);
464  check_docid = 0;
465  }
466 
467  if (min_wt > get_maxweight()) {
468  it = db.postlist_end(string());
469  }
470 }
471 
472 void
474 {
475  if (!started) {
476  started = true;
477  it = db.postlist_begin(string());
478 
479  if (it == db.postlist_end(string())) return;
480  }
481 
482  if (check_docid) {
483  if (min_docid < check_docid)
484  min_docid = check_docid + 1;
485  check_docid = 0;
486  }
487 
488  if (min_wt > get_maxweight()) {
489  it = db.postlist_end(string());
490  return;
491  }
492  it.skip_to(min_docid);
493 }
494 
495 bool
497 {
498  // We're guaranteed not to be called if the document doesn't
499  // exist, so just remember the docid passed, and return true.
500  check_docid = min_docid;
501  return true;
502 }
503 
504 bool
506 {
507  if (check_docid != 0) return false;
508  return started && it == db.postlist_end(string());
509 }
510 
513 {
514  if (check_docid != 0) return check_docid;
515  return *it;
516 }
517 
520 {
522 }
523 
524 string
526 {
527  return string("Xapian::FixedWeightPostingSource");
528 }
529 
530 string
532 {
534 }
535 
538 {
539  const char * p = s.data();
540  const char * s_end = p + s.size();
541  double new_wt = unserialise_double(&p, s_end);
542  if (p != s_end) {
543  throw Xapian::NetworkError("Bad serialised FixedWeightPostingSource - junk at end");
544  }
545  return new FixedWeightPostingSource(new_wt);
546 }
547 
548 void
550 {
551  db = db_;
552  termfreq = db_.get_doccount();
553  started = false;
554  check_docid = 0;
555 }
556 
557 string
559 {
560  string desc("Xapian::FixedWeightPostingSource(wt=");
561  desc += str(get_maxweight());
562  desc += ")";
563  return desc;
564 }
565 
566 }
FixedWeightPostingSource * unserialise(const std::string &serialised) const
Create object given string serialisation returned by serialise().
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
Xapian::doccount get_termfreq_max() const
An upper bound on the number of documents this object can return.
#define Assert(COND)
Definition: omassert.h:122
bool check(Xapian::docid min_docid, double min_wt)
Check if the specified docid occurs.
void skip_to(Xapian::docid min_docid, double min_wt)
Advance to the specified docid.
void init(const Database &db_)
Set this PostingSource to the start of the list of postings.
ValueMapPostingSource * clone() const
Clone the posting source.
length encoded as a string
double max_weight_in_map
The maximum weight in weight_map.
std::string serialise() const
Serialise object parameters into a string.
Xapian::doccount get_termfreq_min() const
A lower bound on the number of documents this object can return.
This class is used to access a database, or a group of databases.
Definition: database.h:68
ValueMapPostingSource * unserialise(const std::string &serialised) const
Create object given string serialisation returned by serialise().
void set_maxweight(double max_weight)
Specify an upper bound on what get_weight() will return from now on.
class for performing a match
Xapian::PostingIterator it
Iterator over all documents.
std::string get_description() const
Return a string describing this object.
Xapian::docid get_docid() const
Return the current docid.
A posting source which looks up weights in a map using values as the key.
#define usual(COND)
Definition: config.h:544
bool check(Xapian::docid min_docid, double min_wt)
Check if the specified docid occurs.
double get_weight() const
Return the weight contribution for the current document.
std::string name() const
Name of the posting source class.
bool at_end() const
Return true if the current position is past the last entry in this list.
void init(const Database &db_)
Set this PostingSource to the start of the list of postings.
External sources of posting information.
ValueWeightPostingSource(Xapian::valueno slot_)
Construct a ValueWeightPostingSource.
bool & started
Flag indicating if we've started (true if we have).
std::string serialise() const
Serialise object parameters into a string.
STL namespace.
Convert types to std::string.
double get_weight() const
Return the weight contribution for the current document.
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
Definition: omdatabase.cc:386
std::string encode_length(T len)
Encode a length as a variable-length string.
Definition: length.h:36
#define false
Definition: header.h:9
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
void clear_mappings()
Clear all mappings.
std::string serialise() const
Serialise object parameters into a string.
Hierarchy of classes which Xapian can throw as exceptions.
Xapian::doccount get_termfreq_est() const
An estimate of the number of documents this object can return.
functions to serialise and unserialise a double
std::string get_description() const
Return a string describing this object.
std::string get_value() const
Read current value.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
ValueIterator valuestream_end(Xapian::valueno) const
Return end iterator corresponding to valuestream_begin().
Definition: database.h:359
ValueMapPostingSource(Xapian::valueno slot_)
Construct a ValueMapPostingSource.
bool get_started() const
Flag indicating if we've started (true if we have).
Xapian::doccount get_termfreq_est() const
An estimate of the number of documents this object can return.
Registry for user subclasses.
Definition: registry.h:47
void init(const Database &db_)
Set this PostingSource to the start of the list of postings.
Xapian::doccount termfreq
Number of documents in the posting source.
Xapian::doccount real_termfreq_min
A posting source which returns a fixed weight for all documents.
bool at_end() const
Return true if the current position is past the last entry in this list.
void recalc_maxweight()
Called by postlists to indicate that they've rearranged themselves and the maxweight now possible is ...
Definition: multimatch.h:136
string str(int value)
Convert int to std::string.
Definition: str.cc:90
void skip_to(Xapian::docid min_docid, double min_wt)
Advance to the specified docid.
ValueWeightPostingSource * clone() const
Clone the posting source.
bool started
Flag indicating if we've started (true if we have).
double get_weight() const
Return the weight contribution for the current document.
std::string name() const
Name of the posting source class.
void init(const Database &db_)
Set this PostingSource to the start of the list of postings.
Base class which provides an "external" source of postings.
Definition: postingsource.h:47
std::map< std::string, double > weight_map
The value -> weight map.
FixedWeightPostingSource * clone() const
Clone the posting source.
ValueIterator valuestream_begin(Xapian::valueno slot) const
Return an iterator over the value in slot slot for each document.
Definition: omdatabase.cc:450
A posting source which generates weights from a value slot.
double sortable_unserialise(const std::string &serialised)
Convert a string encoded using sortable_serialise back to a floating point number.
bool check(Xapian::docid docid)
Check if the specified docid occurs.
Xapian::ValueIterator real_value_it
A posting source which reads weights from a value slot.
Xapian::doccount get_termfreq_min() const
A lower bound on the number of documents this object can return.
void set_default_weight(double wt)
Set a default weight for document values not in the map.
std::string serialise_double(double v)
Serialise a double to a string.
void decode_length_and_check(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:112
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
void add_mapping(const std::string &key, double wt)
Add a mapping.
FixedWeightPostingSource(double wt)
Construct a FixedWeightPostingSource.
Indicates a problem communicating with a remote database.
Definition: error.h:803
Xapian::docid get_docid() const
Return the current docid.
void skip_to(Xapian::docid did)
Advance the iterator to document did.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
Xapian::doccount real_termfreq_est
Definition: quest.cc:110
Xapian::docid get_docid() const
Return the docid at the current position.
Xapian::doccount get_termfreq_max() const
An upper bound on the number of documents this object can return.
Xapian::Database db
The database we're reading documents from.
double get_maxweight() const
Return the currently set upper bound on what get_weight() can return.
std::string get_description() const
Return a string describing this object.
double default_weight
The default weight.
void next(double min_wt)
Advance the current position to the next matching document.
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
API for working with documents.
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Definition: database.h:225
std::string name() const
Name of the posting source class.
Xapian::doccount real_termfreq_max
Xapian::docid check_docid
The docid last passed to check() (0 if check() wasn't the last move).
Xapian::Database real_db
void decode_length(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:94
ValueWeightPostingSource * unserialise(const std::string &serialised) const
Create object given string serialisation returned by serialise().
Xapian::doccount get_value_freq(Xapian::valueno slot) const
Return the frequency of a given value slot.
Definition: omdatabase.cc:355
Wrapper around standard unique_ptr template.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Definition: omdatabase.cc:162
parsing a user query string to build a Xapian::Query object
Xapian::valueno get_slot() const
The slot we're reading values from.
void skip_to(Xapian::docid docid_or_slot)
Advance the iterator to document id or value slot docid_or_slot.
Xapian::Database get_database() const
The database we're reading values from.
void next(double min_wt)
Advance the current position to the next matching document.