xapian-core  2.0.0
honey_termlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2010,2011,2018,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "honey_termlist.h"
24 
25 #include "expand/expandweight.h"
26 
27 using namespace std;
28 
29 [[noreturn]]
30 static void
31 throw_database_corrupt(const char* item, const char* pos)
32 {
33  string message;
34  if (pos != NULL) {
35  message = "Value overflow unpacking termlist: ";
36  } else {
37  message = "Out of data unpacking termlist: ";
38  }
39  message += item;
40  throw Xapian::DatabaseCorruptError(message);
41 }
42 
44  : db(db_), did(did_)
45 {
47  data)) {
48  // Document with no terms or values, or one which doesn't exist.
49  termlist_size = 0;
50  doclen = 0;
51  pos = end = data.data();
52  return;
53  }
54 
55  pos = data.data();
56  end = pos + data.size();
57 
58  if (pos == end)
59  throw_database_corrupt("No termlist data", pos);
60 
61  size_t slot_enc_size = *pos++;
62 
63  // If the top bit is clear we have a 7-bit bitmap of slots used.
64  if (slot_enc_size & 0x80) {
65  slot_enc_size &= 0x7f;
66  if (slot_enc_size == 0) {
67  if (!unpack_uint(&pos, end, &slot_enc_size)) {
68  throw Xapian::DatabaseCorruptError("Termlist encoding corrupt");
69  }
70  }
71 
72  // Skip encoded slot data.
73  pos += slot_enc_size;
74  }
75 
76  if (pos == end) {
77  // Document with values but no terms.
78  termlist_size = 0;
79  doclen = 0;
80  return;
81  }
82 
83  if (!unpack_uint(&pos, end, &termlist_size)) {
84  throw_database_corrupt("termlist length", pos);
85  }
86  ++termlist_size;
87 
88  if (!unpack_uint(&pos, end, &doclen)) {
89  throw_database_corrupt("doclen", pos);
90  }
91 }
92 
95 {
96  return termlist_size;
97 }
98 
99 void
101 {
102  Assert(pos != NULL);
103  stats.accumulate(shard_index,
104  current_wdf,
105  doclen,
106  get_termfreq(),
107  db->get_doccount());
108 }
109 
112 {
113  Assert(pos != NULL);
114  return current_wdf;
115 }
116 
119 {
120  Assert(pos != NULL);
121  if (current_termfreq == 0)
123  return current_termfreq;
124 }
125 
126 TermList*
128 {
129  Assert(pos != NULL);
130 
131  if (pos == end) {
132  return this;
133  }
134 
135  current_wdf = 0;
136 
137  if (!current_term.empty()) {
138  size_t reuse = static_cast<unsigned char>(*pos++);
139  if (reuse > current_term.size()) {
140  current_wdf = reuse / (current_term.size() + 1);
141  reuse = reuse % (current_term.size() + 1);
142  }
143  current_term.resize(reuse);
144  }
145 
146  if (current_wdf) {
147  --current_wdf;
148  } else {
149  if (!unpack_uint(&pos, end, &current_wdf)) {
150  throw_database_corrupt("wdf", pos);
151  }
152  }
153 
154  if (pos == end)
155  throw_database_corrupt("term", NULL);
156 
157  size_t append = static_cast<unsigned char>(*pos++);
158  if (size_t(end - pos) < append)
159  throw_database_corrupt("term", NULL);
160 
161  current_term.append(pos, append);
162  pos += append;
163 
164  // Indicate that termfreq hasn't been read for the current term.
165  current_termfreq = 0;
166 
167  return NULL;
168 }
169 
170 TermList*
171 HoneyTermList::skip_to(std::string_view term)
172 {
173  while (current_term < term) {
174  if (HoneyTermList::next())
175  return this;
176  }
177  return NULL;
178 }
179 
182 {
184 }
185 
188 {
190 }
Database using honey backend.
void get_freqs(std::string_view term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
Returns frequencies for a term.
PositionList * open_position_list(Xapian::docid did, std::string_view term) const
HoneyTermListTable termlist_table
HoneyPositionTable position_table
Xapian::doccount get_doccount() const
Xapian::termcount positionlist_count(Xapian::docid did, std::string_view term) const
Return the number of entries in specified position list.
bool get_exact_entry(std::string_view key, std::string *tag) const
Definition: honey_table.cc:247
static std::string make_key(Xapian::docid did)
TermList * skip_to(std::string_view term)
Skip forward to the specified term.
PositionList * positionlist_begin() const
Return a PositionIterator for the current position.
const char * pos
Current position within the encoded tag value held in data.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
const char * end
Pointer to the end of the encoded tag value.
Xapian::doccount current_termfreq
The term frequency for the term at the current position.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
HoneyTermList(const HoneyTermList &)
Don't allow copying.
Xapian::termcount doclen
The length of document did.
std::string data
The tag value from the termlist table which holds the encoded termlist.
Xapian::termcount termlist_size
The number of entries in this termlist.
Xapian::termcount current_wdf
The wdf for the term at the current position.
Xapian::Internal::intrusive_ptr< const HoneyDatabase > db
The database we're reading data from.
Xapian::docid did
The document id that this TermList is for.
TermList * next()
Advance the current position to the next term in the termlist.
void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
void accumulate(size_t shard_index, Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:71
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Abstract base class for termlists.
Definition: termlist.h:42
std::string current_term
The current term.
Definition: termlist.h:54
size_t shard_index
Which shard of a multidatabase this is from.
Definition: termlist.h:126
string term
Xapian::termpos pos
Collate statistics and calculate the term weights for the ESet.
static void throw_database_corrupt(const char *item, const char *pos)
A TermList in a honey database.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define Assert(COND)
Definition: omassert.h:122
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346