xapian-core  2.0.0
honey_alltermslist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2005,2007,2008,2009,2010,2017,2018,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "honey_alltermslist.h"
24 #include "honey_cursor.h"
25 #include "honey_postlist.h"
27 
28 #include "debuglog.h"
29 #include "pack.h"
30 #include "stringutils.h"
31 
32 #include "xapian/error.h"
33 
34 using namespace std;
35 
36 void
38 {
39  LOGCALL_VOID(DB, "HoneyAllTermsList::read_termfreq", NO_ARGS);
40  Assert(cursor != NULL);
41 
42  // Unpack the termfreq from the tag.
43  Xapian::termcount collfreq;
44  cursor->read_tag();
45  const char* p = cursor->current_tag.data();
46  const char* pend = p + cursor->current_tag.size();
48  termfreq, collfreq)) {
49  throw Xapian::DatabaseCorruptError("Postlist initial chunk header not "
50  "as expected");
51  }
52  // Not used.
53  (void)collfreq;
54 }
55 
57 {
58  LOGCALL_DTOR(DB, "HoneyAllTermsList");
59  delete cursor;
60 }
61 
64 {
65  // This is an over-estimate and not entirely proportional between shards,
66  // but we only use this value to build a balanced or-tree, and it'll at
67  // least tend to distinguish large databases from small ones.
68  return database->postlist_table.get_approx_entry_count();
69 }
70 
73 {
74  LOGCALL(DB, Xapian::doccount, "HoneyAllTermsList::get_termfreq", NO_ARGS);
75  Assert(cursor != NULL);
76  if (termfreq == 0) read_termfreq();
77  RETURN(termfreq);
78 }
79 
80 TermList*
82 {
83  LOGCALL(DB, TermList*, "HoneyAllTermsList::next", NO_ARGS);
84  // Set termfreq to 0 to indicate no termfreq has been read for the current
85  // term.
86  termfreq = 0;
87 
88  if (rare(!cursor)) {
89  Assert(database);
90  cursor = database->postlist_table.cursor_get();
91  Assert(cursor); // The postlist table isn't optional.
92 
93  if (prefix.empty()) {
94  (void)cursor->find_entry_ge(string("\x00\xff", 2));
95  } else {
96  const string& key = pack_honey_postlist_key(prefix);
97  if (cursor->find_entry_ge(key)) {
98  // The exact term we asked for is there, so just copy it rather
99  // than wasting effort unpacking it from the key.
100  current_term = prefix;
101  RETURN(NULL);
102  }
103  }
104  if (cursor->after_end()) {
105  RETURN(this);
106  }
107  goto first_time;
108  }
109 
110  while (true) {
111  if (!cursor->next()) {
112  RETURN(this);
113  }
114 
115 first_time:
116  // Fast check for terms without any zero bytes. ~8.4% faster for
117  // glass.
118  auto nul = cursor->current_key.find('\0');
119  if (nul == string::npos) {
120  current_term = cursor->current_key;
121  break;
122  }
123  if (cursor->current_key[nul + 1] != '\xff') {
124  continue;
125  }
126 
127  const char* p = cursor->current_key.data();
128  const char* pend = p + cursor->current_key.size();
129  if (!unpack_string_preserving_sort(&p, pend, current_term)) {
130  throw Xapian::DatabaseCorruptError("PostList table key has "
131  "unexpected format");
132  }
133 
134  // If this key is for the first chunk of a postlist, we're done.
135  // Otherwise we need to skip past continuation chunks until we find the
136  // first chunk of the next postlist.
137  if (p == pend) break;
138  }
139 
140  if (!startswith(current_term, prefix)) {
141  // We've reached the end of the prefixed terms.
142  RETURN(this);
143  }
144 
145  RETURN(NULL);
146 }
147 
148 TermList*
150 {
151  LOGCALL(DB, TermList*, "HoneyAllTermsList::skip_to", term);
152  // Set termfreq to 0 to indicate no termfreq has been read for the current
153  // term.
154  termfreq = 0;
155 
156  if (rare(!cursor)) {
157  if (rare(term.empty())) {
158  RETURN(next());
159  }
160  cursor = database->postlist_table.cursor_get();
161  Assert(cursor); // The postlist table isn't optional.
162  }
163 
164  if (rare(term.empty())) {
165  RETURN(NULL);
166  }
167 
168  string key = pack_honey_postlist_key(term);
169  if (cursor->find_entry_ge(key)) {
170  // The exact term we asked for is there, so just copy it rather than
171  // wasting effort unpacking it from the key.
172  current_term = term;
173  } else {
174  if (cursor->after_end()) {
175  RETURN(this);
176  }
177 
178  const char* p = cursor->current_key.data();
179  const char* pend = p + cursor->current_key.size();
180  if (!unpack_string_preserving_sort(&p, pend, current_term) ||
181  p != pend) {
182  throw Xapian::DatabaseCorruptError("PostList table key has "
183  "unexpected format");
184  }
185  }
186 
187  if (!startswith(current_term, prefix)) {
188  // We've reached the end of the prefixed terms.
189  RETURN(this);
190  }
191 
192  RETURN(NULL);
193 }
Xapian::doccount get_termfreq() const
Returns the term frequency of the current term.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
TermList * next()
Advance to the next term in the list.
void read_termfreq() const
Read and cache the term frequency.
TermList * skip_to(std::string_view term)
Advance to the first term which is >= term.
~HoneyAllTermsList()
Destructor.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Abstract base class for termlists.
Definition: termlist.h:42
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
#define LOGCALL_DTOR(CATEGORY, CLASS)
Definition: debuglog.h:481
Hierarchy of classes which Xapian can throw as exceptions.
A termlist containing all terms in a honey database.
HoneyCursor class.
PostList in a honey database.
Encoding and decoding functions for honey postlists.
bool decode_initial_chunk_header_freqs(const char **p, const char *end, Xapian::doccount &termfreq, Xapian::termcount &collfreq)
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
std::string pack_honey_postlist_key(std::string_view term)
Definition: pack.h:602
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:551
Various handy string-related helpers.
bool startswith(std::string_view s, char pfx)
Definition: stringutils.h:56