xapian-core  2.0.0
honey_alldocspostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2008,2009,2018 Olly Betts
5  * Copyright (C) 2008 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 #include "honey_alldocspostlist.h"
24 
25 #include "honey_database.h"
26 #include "honey_defs.h"
27 
28 #include "debuglog.h"
29 #include "str.h"
30 #include "wordaccess.h"
31 
32 #include <string>
33 
34 using namespace Honey;
35 using namespace std;
36 
// HoneyAllDocsPostList constructor (its signature line is absent from this
// listing).  Takes a postlist cursor from db, seeks it with find_entry_ge()
// using the two-byte doclen key prefix, and records doccount as both
// termfreq and collfreq.
39  : LeafPostList({}),
40  cursor(db->get_postlist_cursor())
41 {
42  LOGCALL_CTOR(DB, "HoneyAllDocsPostList", db | doccount);
43  static const char doclen_key_prefix[2] = {
// NOTE(review): the initializer contents (listing line 44) are missing from
// this extract; restore them from the original source.
45  };
46  cursor->find_entry_ge(string(doclen_key_prefix, 2));
47  /* For an all documents postlist the term frequency is the number of
48  * documents in the database.
49  */
50  termfreq = doccount;
51  collfreq = doccount;
52 }
53 
// Presumably the destructor body (the signature line is absent from this
// listing).  Releases the postlist cursor.  next() and skip_to() may already
// have deleted it and set it to NULL; deleting a null pointer is a no-op.
55 {
56  delete cursor;
57 }
58 
// Return the current docid, as tracked by the doclen chunk reader.
// (The signature line is absent from this listing.)
61 {
62  return reader.get_docid();
63 }
64 
// get_wdf(): always returns 1 for the document at the current position.
// (The signature line is absent from this listing.)
67 {
68  LOGCALL(DB, Xapian::termcount, "HoneyAllDocsPostList::get_wdf", NO_ARGS);
// NOTE(review): listing line 69 is missing from this extract; restore it from
// the original source.
70  RETURN(1);
71 }
72 
// Return true once iteration has finished.  The end state is represented by
// cursor being NULL: next() and skip_to() delete the cursor and null the
// pointer when they run out of entries.
// (The signature line is absent from this listing.)
73 bool
75 {
76  return cursor == NULL;
77 }
78 
// Advance to the next document.  Always returns NULL; running out of entries
// is signalled by deleting the cursor and setting it to NULL, which is what
// at_end() tests.  Must not be called once at_end() is true (see the Assert).
// (The signature line is absent from this listing.)
79 PostList*
81 {
82  Assert(cursor);
// If the reader already has a chunk loaded, try the next entry in it.  When
// the chunk is exhausted, step the cursor on to the following table entry.
// The reader can also report at_end() before any chunk has been loaded, in
// which case the entry currently under the cursor is used as-is below.
83  if (!reader.at_end()) {
84  if (reader.next()) return NULL;
85  cursor->next();
86  }
87 
// Load the chunk under the cursor.  update() returns false when the key does
// not yield a docid, which is treated like running off the end.
88  if (!cursor->after_end()) {
89  if (reader.update(cursor)) {
90  if (!reader.at_end()) return NULL;
91  }
92  }
93 
94  // We've reached the end.
95  delete cursor;
96  cursor = NULL;
97  return NULL;
98 }
99 
// Skip forward to the first document with docid >= did.  Always returns
// NULL; if no such document exists the cursor is deleted and set to NULL so
// that at_end() becomes true.
// (The signature line is absent from this listing.)
100 PostList*
102 {
103  if (rare(!cursor)) {
104  // No-op if already at_end.
105  return NULL;
106  }
107 
108  if (reader.at_end()) {
109  // This happens if the first operation is a skip_to().
// NOTE(review): listing line 110 is missing from this extract; presumably it
// loads a chunk into the reader so that the Assert below holds.  Restore it
// from the original source.
111  Assert(!reader.at_end());
112  }
113 
// Fast path: the target lies within the chunk that is already loaded.
114  if (reader.skip_to(did))
115  return NULL;
116 
// NOTE(review): listing line 117 is missing from this extract.  It must open
// the "if (...) {" whose body follows and which is closed by the "} else if"
// below; judging by the comments it looks up a chunk whose last docid is
// exactly did.  Restore it from the original source.
118  // Exact match.
119  if (rare(!reader.update(cursor))) {
120  // Shouldn't be possible.
121  Assert(false);
122  }
123  if (reader.skip_to(did)) return NULL;
124  // The chunk's last docid is did, so skip_to() should always succeed.
125  Assert(false);
126  } else if (!cursor->after_end()) {
127  if (reader.update(cursor)) {
128  if (reader.skip_to(did)) return NULL;
129  // The chunk's last docid is >= did, so skip_to() should always
130  // succeed.
131  Assert(false);
132  }
133  }
134 
135  // We've reached the end.
136  delete cursor;
137  cursor = NULL;
138  return NULL;
139 }
140 
// Check whether did occurs in this postlist.  Always returns NULL and reports
// the outcome through valid: true when the list is already at its end or has
// been positioned by one of the lookups below, false when the chunk that was
// found could not be used to position on did.
// (The signature line is absent from this listing.)
141 PostList*
143 {
144  if (rare(!cursor)) {
145  // Already at_end.
146  valid = true;
147  return NULL;
148  }
149 
150  if (!reader.at_end()) {
151  // Check for the requested docid in the current block.
152  if (reader.skip_to(did)) {
153  valid = true;
154  return NULL;
155  }
156  }
157 
158  // Try moving to the appropriate chunk.
// NOTE(review): listing line 159 is missing from this extract.  It must open
// the "if (...) {" that is closed on listing line 169; the code after that
// brace handles the exact-key-match case, so this branch is presumably the
// inexact-match one.  Restore it from the original source.
160  // We're in a chunk which might contain the docid.
161  if (reader.update(cursor)) {
162  if (reader.skip_to(did)) {
163  valid = true;
164  return NULL;
165  }
166  }
167  valid = false;
168  return NULL;
169  }
170 
171  // We had an exact match for a chunk starting with specified docid.
172  Assert(!cursor->after_end());
173  if (!reader.update(cursor)) {
174  // We found the exact key we built so it must be a doclen chunk.
175  // Therefore reader.update() "can't possibly fail".
176  Assert(false);
177  }
178 
179  valid = true;
180  return NULL;
181 }
182 
// get_wdf_upper_bound(): always returns 1, the same constant that get_wdf()
// returns for every document.
// (The signature line is absent from this listing.)
185 {
186  LOGCALL(DB, Xapian::termcount, "HoneyAllDocsPostList::get_wdf_upper_bound", NO_ARGS);
187  RETURN(1);
188 }
189 
// Return a string description of this object, of the form
// "HoneyAllDocsPostList(doccount=N)" where N is termfreq (which the
// constructor sets to the document count).
// (The signature line is absent from this listing.)
190 string
192 {
193  string desc = "HoneyAllDocsPostList(doccount=";
194  desc += str(termfreq);
195  desc += ')';
196  return desc;
197 }
198 
199 namespace Honey {
200 
201 bool
202 DocLenChunkReader::read_doclen(const unsigned char* q)
203 {
204  switch (width) {
205  case 1:
206  doclen = *q;
207  return doclen != 0xff;
208  case 2:
209  doclen = unaligned_read2(q);
210  return doclen != 0xffff;
211  case 3:
212  // q - 1 is always a valid byte - either the leading byte holding
213  // the data width, or else the last byte of the previous value.
214  // unaligned_read4() uses bigendian order, so we just need to mask
215  // off the most significant byte.
216  doclen = unaligned_read4(q - 1) & 0xffffff;
217  return doclen != 0xffffff;
218  default:
219  doclen = unaligned_read4(q);
220  return doclen != 0xffffffff;
221  }
222 }
223 
// Update the reader to use the chunk currently pointed to by cursor.
//
// Returns false if no docid can be extracted from the cursor's current key;
// otherwise reads the tag, validates its header and positions the reader on
// the chunk's first entry, returning true.  Throws
// Xapian::DatabaseCorruptError if the chunk is empty, has an unsupported
// width, has trailing bytes which don't make up a whole entry, or has a
// missing first value.
// (The signature line is absent from this listing.)
224 bool
226 {
// The key holds the docid of the chunk's last entry.
227  Xapian::docid last_did = docid_from_key(cursor->current_key);
228  if (!last_did) return false;
229 
230  cursor->read_tag();
231 
232  size_t len = cursor->current_tag.size();
233  if (rare(len == 0))
234  throw Xapian::DatabaseCorruptError("Doclen data chunk is empty");
235 
236  p = reinterpret_cast<const unsigned char*>(cursor->current_tag.data());
237  end = p + len;
// The leading byte gives the entry width in bits.
238  width = *p++;
// (width - 8) may only have bits 0x08 and 0x10 set, i.e. width must be one of
// 8, 16, 24 or 32.
239  if (((width - 8) &~ 0x18) != 0) {
240  throw Xapian::DatabaseCorruptError("Invalid doclen width - currently "
241  "8, 16, 24 and 32 are supported");
242  }
// From here on width is in bytes.
243  width /= 8;
244  if ((len - 1) % width != 0)
245  throw Xapian::DatabaseCorruptError("Doclen data chunk has junk at end");
// (len - 1) / width is the number of entries after the header byte; entries
// are one per consecutive docid ending at last_did.
246  Xapian::docid first_did = last_did - (len - 1) / width + 1;
247 
248  did = first_did;
249  if (!read_doclen(p)) {
250  // The first doclen value shouldn't be missing.
251  throw Xapian::DatabaseCorruptError("Invalid first doclen value");
252  }
253  return true;
254 }
255 
// Advance to the next entry in the current chunk which has a value, skipping
// entries for which read_doclen() reports a missing value.
//
// Returns true if positioned on such an entry.  Returns false, with p set to
// NULL, when the end of the chunk is reached first.
// (The signature line is absent from this listing.)
256 bool
258 {
259  do {
260  p += width;
261  if (p == end) {
262  p = NULL;
263  return false;
264  }
265 
// Entries are for consecutive docids, so each step forward is one docid.
266  ++did;
267  } while (!read_doclen(p));
268  return true;
269 }
270 
// Skip forward within the current chunk to the first entry with a value whose
// docid is >= target.  Never moves backwards.
//
// Returns true if positioned on such an entry (including when the current
// position already satisfies target).  Returns false if the reader has no
// current position (p is NULL) or if target lies beyond this chunk, in which
// case p is set to NULL.
// (The signature line is absent from this listing.)
271 bool
273 {
274  if (p == NULL)
275  return false;
276 
277  if (target <= did)
278  return true;
279 
280  Xapian::docid delta = target - did;
// (end - p) / width counts the entries from the current one to the end of the
// chunk, so a delta that large would land past the last entry.
281  if (delta >= Xapian::docid(end - p) / width) {
282  p = NULL;
283  return false;
284  }
285 
286  did = target;
287  p += delta * width;
288 
// If the entry for target itself has no value, move on to the next one which
// does.
289  return read_doclen(p) || next();
290 }
291 
292 // FIXME: Add check() method, which doesn't advance when read_doclen() returns
293 // false?
294 
// Look up the entry for target in the current chunk without moving the
// reader's position: p and did are left untouched, and only doclen is
// overwritten (by read_doclen()).
//
// Returns false if target is before the current docid, if it lies beyond the
// end of the chunk, or if its entry has no value; otherwise returns true with
// the decoded value in doclen.
//
// NOTE(review): unlike skip_to() there is no p == NULL guard here, so this
// presumably relies on callers only using it on a reader that is positioned
// within a chunk - confirm against the callers.
// (The signature line is absent from this listing.)
295 bool
297 {
298  if (target < did)
299  return false;
300 
301  Xapian::docid delta = target - did;
302  Assert(width > 0);
303  if (delta >= Xapian::docid(end - p) / width) {
304  return false;
305  }
306 
307  return read_doclen(p + delta * width);
308 }
309 
310 }
HoneyAllDocsPostList(const HoneyAllDocsPostList &)=delete
Don't allow copying.
Xapian::docid get_docid() const
Return the current docid.
Honey::DocLenChunkReader reader
Xapian::termcount get_wdf_upper_bound() const
bool at_end() const
Return true if the current position is past the last entry in this list.
std::string get_description() const
Return a string description of this object.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
PostList * check(Xapian::docid did, double w_min, bool &valid)
Check if the specified docid occurs in this postlist.
HoneyCursor * cursor
Cursor on the postlist table.
bool read_tag(bool keep_compressed=false)
bool after_end() const
Definition: honey_cursor.h:94
bool find_entry_ge(std::string_view key)
Definition: honey_cursor.h:110
std::string current_tag
Definition: honey_cursor.h:43
bool next()
Definition: honey_cursor.h:96
std::string current_key
Definition: honey_cursor.h:43
Database using honey backend.
bool find_doclength(Xapian::docid target)
Searches the whole chunk (skip_to() only advances).
bool update(HoneyCursor *cursor)
Update to use the chunk currently pointed to by cursor.
unsigned const char * end
Xapian::docid get_docid() const
bool read_doclen(const unsigned char *q)
bool skip_to(Xapian::docid target)
Abstract base class for leaf postlists.
Definition: leafpostlist.h:40
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Abstract base class for postlists.
Definition: postlist.h:40
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
#define rare(COND)
Definition: config.h:607
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:480
A PostList which iterates over all documents in a HoneyDatabase.
Database using honey backend.
Definitions, types, etc for use inside honey.
std::string make_doclenchunk_key(Xapian::docid last_did)
Generate a key for a doclen chunk.
Xapian::docid docid_from_key(const std::string &key)
@ KEY_DOCLEN_CHUNK
Definition: honey_defs.h:88
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define AssertParanoid(COND)
Definition: omassert.h:129
#define Assert(COND)
Definition: omassert.h:122
Convert types to std::string.
functions for reading and writing different width words
uint32_t unaligned_read4(const unsigned char *ptr)
Definition: wordaccess.h:147
uint16_t unaligned_read2(const unsigned char *ptr)
Definition: wordaccess.h:159