xapian-core  2.0.0
honey_postlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2017,2018,2022,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "honey_postlist.h"
24 
25 #include "honey_cursor.h"
26 #include "honey_database.h"
27 #include "honey_positionlist.h"
29 #include "overflow.h"
30 #include "pack.h"
31 
32 #include <string>
33 #include <string_view>
34 
35 using namespace Honey;
36 using namespace std;
37 
38 bool
40 {
41  Xapian::docid chunk_last = docid_from_key(term, cursor->current_key);
42  if (!chunk_last) return false;
43 
44  cursor->read_tag();
45  const string& tag = cursor->current_tag;
46  reader.assign(tag.data(), tag.size(), chunk_last);
47  return true;
48 }
49 
50 // Return T with just its top bit set (for unsigned T).
51 #define TOP_BIT_SET(T) ((static_cast<T>(-1) >> 1) + 1)
52 
54  string_view term_,
55  HoneyCursor* cursor_)
56  : LeafPostList(term_), cursor(cursor_), db(db_)
57 {
58  if (!cursor) {
59  // Term not present in db.
60  reader.init();
61  last_did = 0;
62  wdf_max = 0;
63  termfreq = 0;
64  collfreq = 0;
65  return;
66  }
67 
68  cursor->read_tag();
69  const string& chunk = cursor->current_tag;
70 
71  const char* p = chunk.data();
72  const char* pend = p + chunk.size();
73  // FIXME: Make use of [first,last] ranges to calculate better estimates and
74  // potentially to spot subqueries that can't match anything.
77  Xapian::docid first_did;
78  Xapian::termcount first_wdf;
79  Xapian::docid chunk_last;
80  if (!decode_initial_chunk_header(&p, pend, tf, cf,
81  first_did, last_did,
82  chunk_last, first_wdf, wdf_max))
83  throw Xapian::DatabaseCorruptError("Postlist initial chunk header");
84 
85  Xapian::termcount cf_info = cf;
86  if (cf == 0) {
87  // wdf must always be zero.
88  } else if (tf <= 2) {
89  // No further postlist data stored.
90  } else if (cf == tf - 1 + first_wdf) {
91  // wdf must be 1 for second and subsequent entries.
92  cf_info = 1 | TOP_BIT_SET(decltype(cf_info));
93  } else {
94  cf_info = 1;
95  // wdf_max can only be zero if cf == 0 (and
96  // decode_initial_chunk_header() should ensure this).
97  Assert(wdf_max != 0);
98  Xapian::termcount remaining_cf_for_flat_wdf;
99  if (!mul_overflows(tf - 1, wdf_max, remaining_cf_for_flat_wdf) &&
100  cf - first_wdf == remaining_cf_for_flat_wdf) {
101  // Set cl_info to the flat wdf value with the top bit set to
102  // signify that this is a flat wdf value.
103  cf_info = wdf_max;
104  // It shouldn't be possible for the top bit to already be set since
105  // tf > 2 so cf must be at least 2 * remaining_cf_for_flat_wdf.
106  Assert((cf_info & TOP_BIT_SET(decltype(cf_info))) == 0);
107  cf_info |= TOP_BIT_SET(decltype(cf_info));
108  }
109  }
110 
111  termfreq = tf;
112  collfreq = cf;
113  reader.init(tf, cf_info);
114  reader.assign(p, pend - p, first_did, last_did, first_wdf);
115 }
116 
118 {
119  delete cursor;
120 }
121 
122 bool
123 HoneyPostList::open_nearby_postlist(std::string_view term_,
124  bool need_read_pos,
125  LeafPostList*& pl) const
126 {
127  Assert(!term_.empty());
128  if (!cursor) return false;
129  // FIXME: Once Honey supports writing, we need to return false here if the
130  // DB is writable and has uncommitted modifications.
131 
132  unique_ptr<HoneyCursor> new_cursor(new HoneyCursor(*cursor));
133  if (!new_cursor->find_exact(Honey::make_postingchunk_key(term_))) {
134  pl = nullptr;
135  return true;
136  }
137 
138  if (need_read_pos)
139  pl = new HoneyPosPostList(db, term_, new_cursor.release());
140  else
141  pl = new HoneyPostList(db, term_, new_cursor.release());
142  return true;
143 }
144 
147 {
148  return reader.get_docid();
149 }
150 
153 {
154  return reader.get_wdf();
155 }
156 
157 bool
159 {
160  return cursor == NULL;
161 }
162 
165 {
167 }
168 
169 PostList*
171 {
172  if (!started) {
173  started = true;
174  return NULL;
175  }
176 
177  Assert(!reader.at_end());
178 
179  if (reader.next())
180  return NULL;
181 
182  if (reader.get_docid() >= last_did) {
183  // We've reached the end.
184  delete cursor;
185  cursor = NULL;
186  return NULL;
187  }
188 
189  if (rare(!cursor->next()))
190  throw Xapian::DatabaseCorruptError("Hit end of table looking for "
191  "postlist chunk");
192 
193  if (rare(!update_reader()))
194  throw Xapian::DatabaseCorruptError("Missing postlist chunk");
195 
196  return NULL;
197 }
198 
199 PostList*
201 {
202  if (!started) {
203  started = true;
204  }
205 
206  if (rare(!cursor)) {
207  // No-op if already at_end.
208  return NULL;
209  }
210 
211  Assert(!reader.at_end());
212 
213  if (reader.skip_to(did))
214  return NULL;
215 
216  if (did > last_did) {
217  // We've reached the end.
218  delete cursor;
219  cursor = NULL;
220  return NULL;
221  }
222 
223  // At this point we know that skip_to() must succeed since last_did
224  // satisfies the requirements.
225 
226  // find_entry_ge() returns true for an exact match, which isn't interesting
227  // here.
229 
230  if (rare(cursor->after_end()))
231  throw Xapian::DatabaseCorruptError("Hit end of table looking for "
232  "postlist chunk");
233 
234  if (rare(!update_reader()))
235  throw Xapian::DatabaseCorruptError("Missing postlist chunk");
236 
237  if (rare(!reader.skip_to(did)))
238  throw Xapian::DatabaseCorruptError("Postlist chunk doesn't contain "
239  "its last entry");
240 
241  return NULL;
242 }
243 
246 {
247  return wdf_max;
248 }
249 
250 void
252 {
253  Assert(!started);
254  if (termfreq) {
255  first = reader.get_docid();
256  last = last_did;
257  AssertRel(first,<=,last);
258  } else {
259  last = 0;
260  }
261 }
262 
263 string
265 {
266  string desc = "HoneyPostList(";
267  desc += term;
268  desc += ')';
269  return desc;
270 }
271 
273  std::string_view term_,
274  HoneyCursor* cursor_)
275  : HoneyPostList(db_, term_, cursor_),
276  position_list(db_->position_table) {}
277 
280 {
282  // FIXME: Consider returning NULL if there's no positional data - callers
283  // need fixing up, but this may be a rare case and the costs of checking
284  // for NULL may outweigh any gains. Need to profile.
285  return &position_list;
286 }
287 
288 string
290 {
291  string desc = "HoneyPosPostList(";
292  desc += term;
293  desc += ')';
294  return desc;
295 }
296 
297 namespace Honey {
298 
299 void
300 PostingChunkReader::assign(const char* p_, size_t len,
301  Xapian::docid chunk_last)
302 {
303  const char* pend = p_ + len;
304  if (collfreq_info ?
305  !decode_delta_chunk_header(&p_, pend, chunk_last, did, wdf) :
306  !decode_delta_chunk_header_no_wdf(&p_, pend, chunk_last, did)) {
307  throw Xapian::DatabaseCorruptError("Postlist delta chunk header");
308  }
309  p = p_;
310  end = pend;
311  last_did = chunk_last;
312 }
313 
314 void
315 PostingChunkReader::assign(const char* p_, size_t len, Xapian::docid did_,
316  Xapian::docid last_did_in_chunk,
317  Xapian::termcount wdf_)
318 {
319  p = p_;
320  end = p_ + len;
321  did = did_;
322  last_did = last_did_in_chunk;
323  wdf = wdf_;
324 }
325 
326 bool
328 {
329  if (p == end) {
330  if (termfreq == 2 && did != last_did) {
331  did = last_did;
332  wdf = collfreq_info - wdf;
333  return true;
334  }
335  p = NULL;
336  return false;
337  }
338 
339  // The "constant wdf apart from maybe the first entry" case.
340  if (collfreq_info & TOP_BIT_SET(decltype(collfreq_info))) {
342  collfreq_info = 0;
343  }
344 
345  Xapian::docid delta;
346  if (!unpack_uint(&p, end, &delta)) {
347  throw Xapian::DatabaseCorruptError("postlist docid delta");
348  }
349  did += delta + 1;
350  if (collfreq_info) {
351  if (!unpack_uint(&p, end, &wdf)) {
352  throw Xapian::DatabaseCorruptError("postlist wdf");
353  }
354  }
355 
356  return true;
357 }
358 
359 bool
361 {
362  if (p == NULL)
363  return false;
364 
365  if (target <= did)
366  return true;
367 
368  if (target > last_did) {
369  p = NULL;
370  return false;
371  }
372 
373  if (p == end) {
374  // Given the checks above, this must be the termfreq == 2 case with the
375  // current position being on the first entry, and so skip_to() must
376  // move to last_did.
377  AssertEq(termfreq, 2);
378  did = last_did;
379  wdf = collfreq_info - wdf;
380  return true;
381  }
382 
383  // The "constant wdf apart from maybe the first entry" case.
384  if (collfreq_info & TOP_BIT_SET(decltype(collfreq_info))) {
386  collfreq_info = 0;
387  }
388 
389  if (target == last_did) {
390  if (collfreq_info) {
391  if (!unpack_uint_backwards(&end, p, &wdf))
392  throw Xapian::DatabaseCorruptError("postlist final wdf");
393  }
394  did = last_did;
395  p = end;
396  return true;
397  }
398 
399  do {
400  if (rare(p == end)) {
401  // FIXME: Shouldn't happen unless last_did was wrong.
402  p = NULL;
403  return false;
404  }
405 
406  Xapian::docid delta;
407  if (!unpack_uint(&p, end, &delta)) {
408  throw Xapian::DatabaseCorruptError("postlist docid delta");
409  }
410  did += delta + 1;
411  if (collfreq_info) {
412  if (!unpack_uint(&p, end, &wdf)) {
413  throw Xapian::DatabaseCorruptError("postlist wdf");
414  }
415  }
416  } while (target > did);
417 
418  return true;
419 }
420 
421 }
bool read_tag(bool keep_compressed=false)
bool after_end() const
Definition: honey_cursor.h:94
bool find_entry_ge(std::string_view key)
Definition: honey_cursor.h:110
std::string current_tag
Definition: honey_cursor.h:43
bool next()
Definition: honey_cursor.h:96
Database using honey backend.
HoneyPositionTable position_table
PostList in a honey database with positions.
HoneyRePositionList position_list
PositionList object to reuse for OP_NEAR and OP_PHRASE.
std::string get_description() const
Return a string description of this object.
HoneyPosPostList(const HoneyDatabase *db_, std::string_view term_, HoneyCursor *cursor_)
PositionList * read_position_list()
Read the position list for the term in the current document and return a pointer to it (owned by the ...
HoneyPositionList * open_position_list(Xapian::docid did, std::string_view term) const
PostList in a honey database.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Xapian::docid last_did
The highest document id in this posting list.
PositionList * open_position_list() const
Read the position list for the term in the current document and return a pointer to it (not owned by ...
Xapian::termcount wdf_max
Maximum wdf for this postlist.
bool open_nearby_postlist(std::string_view term_, bool need_read_pos, LeafPostList *&pl) const
Open another postlist from the same database.
HoneyCursor * cursor
Cursor on the postlist table.
Xapian::termcount get_wdf_upper_bound() const
const HoneyDatabase * db
HoneyDatabase to get position table object from.
void get_docid_range(Xapian::docid &first, Xapian::docid &last) const
Get the bounds on the range of docids this PostList can return.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Honey::PostingChunkReader reader
bool update_reader()
Update reader to use the chunk currently pointed to by cursor.
HoneyPostList(const HoneyPostList &)=delete
Don't allow copying.
Xapian::docid get_docid() const
Return the current docid.
bool started
Needed so that first next() does nothing.
std::string get_description() const
Return a string description of this object.
bool at_end() const
Return true if the current position is past the last entry in this list.
void read_data(Xapian::docid did, const std::string &term)
Fill list with data, and move the position to the start.
Xapian::doccount termfreq
void init()
Initialise already at_end().
Xapian::termcount wdf
Xapian::docid get_docid() const
Xapian::docid last_did
The last docid in this chunk.
Xapian::termcount collfreq_info
Value "to do with" collection frequency.
void assign(const char *p_, size_t len, Xapian::docid did)
Xapian::termcount get_wdf() const
bool next()
Advance, returning false if we've run out of data.
bool skip_to(Xapian::docid target)
Skip ahead, returning false if we've run out of data.
Abstract base class for leaf postlists.
Definition: leafpostlist.h:40
Xapian::termcount collfreq
The collection frequency of the term.
Definition: leafpostlist.h:57
std::string term
The term name for this postlist (empty for an alldocs postlist).
Definition: leafpostlist.h:51
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Abstract base class for postlists.
Definition: postlist.h:40
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
HoneyCursor class.
Database using honey backend.
A position list in a honey database.
#define TOP_BIT_SET(T)
PostList in a honey database.
Encoding and decoding functions for honey postlists.
bool decode_delta_chunk_header(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first, Xapian::termcount &chunk_first_wdf)
bool decode_initial_chunk_header(const char **p, const char *end, Xapian::doccount &termfreq, Xapian::termcount &collfreq, Xapian::docid &first, Xapian::docid &last, Xapian::docid &chunk_last, Xapian::termcount &first_wdf, Xapian::termcount &wdf_max)
bool decode_delta_chunk_header_no_wdf(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first)
std::string make_postingchunk_key(std::string_view term)
Generate a key for a posting initial chunk.
Xapian::docid docid_from_key(const std::string &key)
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Arithmetic operations with overflow checks.
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > mul_overflows(T1 a, T2 b, R &res)
Multiplication with overflow checking.
Definition: overflow.h:188
Pack types into strings and unpack them again.
bool unpack_uint_backwards(const char **p, const char *start, U *result)
Decode an unsigned integer from a string, going backwards.
Definition: pack.h:412
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346