xapian-core  2.0.0
glass_alltermslist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2005,2007,2008,2009,2010,2017,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "glass_alltermslist.h"
24 #include "glass_postlist.h"
25 
26 #include "debuglog.h"
27 #include "pack.h"
28 #include "stringutils.h"
29 
30 #include <string_view>
31 
32 using namespace std;
33 
// GlassAllTermsList::read_termfreq(): read the tag of the entry the cursor is
// positioned on and cache the term frequency decoded from it in `termfreq`.
//
// NOTE(review): the declarator line (listing line 35) is missing from this
// listing; the function name is taken from the LOGCALL_VOID string below.
//
// Preconditions (asserted): a current term has been set, and the cursor
// exists and is not positioned after the end.
34 void
36 {
37  LOGCALL_VOID(DB, "GlassAllTermsList::read_termfreq", NO_ARGS);
38  Assert(!current_term.empty());
39  Assert(cursor);
40  Assert(!cursor->after_end());
41 
42  // Unpack the termfreq from the tag.
43  cursor->read_tag();
44  const char *p = cursor->current_tag.data();
45  const char *pend = p + cursor->current_tag.size();
    // Only the first value is stored (into termfreq).  NULL is passed for the
    // final pointer argument - presumably that tells read_freqs() the
    // collection frequency isn't wanted; confirm against read_freqs().
46  GlassPostList::read_freqs(&p, pend, &termfreq, NULL);
47 }
48 
// Destructor body for GlassAllTermsList (class name as given to LOGCALL_DTOR);
// the declarator line (listing line 49) is missing from this listing.
//
// Releases the cursor with delete.  next() and skip_to() only obtain the
// cursor on first use, so it may still be null here; deleting a null pointer
// is a no-op in C++, so no check is needed.
50 {
51  LOGCALL_DTOR(DB, "GlassAllTermsList");
52  delete cursor;
53 }
54 
// Returns the number of entries in the postlist table as a rough size
// estimate.
//
// NOTE(review): both declarator lines (listing lines 55-56) are missing from
// this listing.  This is presumably GlassAllTermsList::get_approx_size()
// const, returning Xapian::termcount - confirm against the header.
57 {
58  // This is an over-estimate and not entirely proportional between shards,
59  // but we only use this value to build a balanced or-tree, and it'll at
60  // least tend to distinguish large databases from small ones.
61  return database->postlist_table.get_entry_count();
62 }
63 
// GlassAllTermsList::get_termfreq(): return the term frequency of the current
// term as a Xapian::doccount (name and type as given to LOGCALL below).
//
// NOTE(review): the declarator lines (listing lines 64-65) are missing from
// this listing.
//
// Preconditions (asserted): a current term has been set, and the cursor
// exists and is not positioned after the end.
66 {
67  LOGCALL(DB, Xapian::doccount, "GlassAllTermsList::get_termfreq", NO_ARGS);
68  Assert(!current_term.empty());
69  Assert(cursor);
70  Assert(!cursor->after_end());
    // termfreq == 0 is the "not read yet" marker (next() and skip_to() reset
    // it to 0 when they move), so the tag is only decoded on first request
    // for each term.
71  if (termfreq == 0) read_termfreq();
72  RETURN(termfreq);
73 }
74 
// GlassAllTermsList::next(): advance to the next term in the list.
//
// NOTE(review): the declarator line (listing line 76) is missing from this
// listing; the function name is taken from the LOGCALL string below.
//
// On success the new term is stored in current_term and NULL is returned.
// `this` is returned when there are no more entries, or when the term reached
// no longer starts with `prefix`.
//
// Throws Xapian::DatabaseCorruptError if a key can't be unpacked.
75 TermList *
77 {
78  LOGCALL(DB, TermList *, "GlassAllTermsList::next", NO_ARGS);
79  // Set termfreq to 0 to indicate no termfreq has been read for the current
80  // term.
81  termfreq = 0;
82 
    // First call: there is no cursor yet, so obtain one and position it on
    // the first candidate entry instead of stepping forwards.
83  if (rare(!cursor)) {
84  cursor = database->postlist_table.cursor_get();
85  Assert(cursor); // The postlist table isn't optional.
86 
87  if (prefix.empty()) {
    // NOTE(review): the two-byte key "\x00\xff" is presumably chosen so that
    // the cursor starts after entries which aren't term postlists - confirm
    // against the glass postlist table key format.  The result is ignored
    // since wherever the cursor lands is examined at first_time below.
88  (void)cursor->find_entry_ge(string("\x00\xff", 2));
89  } else {
90  const string & key = pack_glass_postlist_key(prefix);
91  if (cursor->find_entry_ge(key)) {
92  // The exact term we asked for is there, so just copy it rather
93  // than wasting effort unpacking it from the key.
94  current_term = prefix;
95  RETURN(NULL);
96  }
97  }
98  if (cursor->after_end()) {
99  RETURN(this);
100  }
    // The cursor is already on an entry we haven't looked at, so jump into
    // the loop body past the cursor->next() call.
101  goto first_time;
102  }
103 
104  Assert(!cursor->after_end());
105  while (true) {
106  if (!cursor->next()) {
107  RETURN(this);
108  }
109 
110 first_time:
111  // Fast check for terms without any zero bytes. ~8.4% faster.
112  auto nul = cursor->current_key.find('\0');
113  if (nul == string::npos) {
114  current_term = cursor->current_key;
115  break;
116  }
    // Keys whose first zero byte isn't followed by '\xff' are skipped.  If the
    // zero byte is the last character, operator[] at size() yields '\0', so
    // such keys are skipped too.
    // NOTE(review): presumably these are continuation chunks or non-term
    // entries - confirm against the key format.
117  if (cursor->current_key[nul + 1] != '\xff') {
118  continue;
119  }
120 
121  const char *p = cursor->current_key.data();
122  const char *pend = p + cursor->current_key.size();
123  if (!unpack_string_preserving_sort(&p, pend, current_term)) {
124  throw Xapian::DatabaseCorruptError("PostList table key has unexpected format");
125  }
126 
127  // If this key is for the first chunk of a postlist, we're done.
128  // Otherwise we need to skip past continuation chunks until we find the
129  // first chunk of the next postlist.
130  if (p == pend) break;
131  }
132 
133  if (!startswith(current_term, prefix)) {
134  // We've reached the end of the prefixed terms.
135  RETURN(this);
136  }
137 
138  RETURN(NULL);
139 }
140 
// GlassAllTermsList::skip_to(): advance to the first term which is >= `term`.
//
// NOTE(review): the declarator line (listing line 142) is missing from this
// listing; the function name is taken from the LOGCALL string below and the
// parameter name `term` from its uses in the body.
//
// On success the term found is stored in current_term and NULL is returned.
// `this` is returned when the cursor runs off the end, or when the term
// reached no longer starts with `prefix`.
//
// Throws Xapian::DatabaseCorruptError if the key can't be unpacked.
141 TermList*
143 {
144  LOGCALL(DB, TermList *, "GlassAllTermsList::skip_to", term);
145  // Set termfreq to 0 to indicate no termfreq has been read for the current
146  // term.
147  termfreq = 0;
148 
    // The cursor is obtained on first use, whether that is here or in next().
149  if (rare(!cursor)) {
150  cursor = database->postlist_table.cursor_get();
151  Assert(cursor); // The postlist table isn't optional.
152  }
153  Assert(!cursor->after_end());
154 
155  string key = pack_glass_postlist_key(term);
156  if (cursor->find_entry_ge(key)) {
157  // The exact term we asked for is there, so just copy it rather than
158  // wasting effort unpacking it from the key.
159  current_term = term;
160  } else {
    // No exact match: either nothing is left, or the cursor is on a different
    // entry whose term has to be decoded from its key.
161  if (cursor->after_end()) {
162  RETURN(this);
163  }
164 
165  const char *p = cursor->current_key.data();
166  const char *pend = p + cursor->current_key.size();
167  if (!unpack_string_preserving_sort(&p, pend, current_term)) {
168  throw Xapian::DatabaseCorruptError("PostList table key has unexpected format");
169  }
170  }
171 
172  if (!startswith(current_term, prefix)) {
173  // We've reached the end of the prefixed terms.
174  RETURN(this);
175  }
176 
177  RETURN(NULL);
178 }
Xapian::doccount get_termfreq() const
Returns the term frequency of the current term.
TermList * next()
Advance to the next term in the list.
TermList * skip_to(std::string_view tname)
Advance to the first term which is >= tname.
void read_termfreq() const
Read and cache the term frequency.
~GlassAllTermsList()
Destructor.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
static void read_freqs(const char **posptr, const char *end, Xapian::doccount *number_of_entries_ptr, Xapian::termcount *collection_freq_ptr)
Read the term frequency and collection frequency.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Abstract base class for termlists.
Definition: termlist.h:42
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
#define LOGCALL_DTOR(CATEGORY, CLASS)
Definition: debuglog.h:481
A termlist containing all terms in a glass database.
Postlists in glass databases.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:551
std::string pack_glass_postlist_key(std::string_view term)
Definition: pack.h:574
Various handy string-related helpers.
bool startswith(std::string_view s, char pfx)
Definition: stringutils.h:56