xapian-core  2.0.0
glass_termlist.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2014,2019,2024 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 #include "glass_termlist.h"
25 
26 #include "xapian/error.h"
27 
28 #include "expand/expandweight.h"
29 #include "glass_positionlist.h"
30 #include "debuglog.h"
31 #include "omassert.h"
32 #include "pack.h"
33 #include "str.h"
34 
35 using namespace std;
37 
39  Xapian::docid did_,
40  bool throw_if_not_present)
41  : db(db_), did(did_), current_wdf(0), current_termfreq(0)
42 {
43  LOGCALL_CTOR(DB, "GlassTermList", db_ | did_ | throw_if_not_present);
44 
46  data)) {
47  if (!throw_if_not_present) {
48  pos = NULL;
49  return;
50  }
51  throw Xapian::DocNotFoundError("No termlist for document " + str(did));
52  }
53 
54  pos = data.data();
55  end = pos + data.size();
56 
57  if (pos == end) {
58  doclen = 0;
59  termlist_size = 0;
60  return;
61  }
62 
63  // Read doclen
64  if (!unpack_uint(&pos, end, &doclen)) {
65  const char *msg;
66  if (pos == 0) {
67  msg = "Too little data for doclen in termlist";
68  } else {
69  msg = "Overflowed value for doclen in termlist";
70  }
72  }
73 
74  // Read termlist_size
75  if (!unpack_uint(&pos, end, &termlist_size)) {
76  const char *msg;
77  if (pos == 0) {
78  msg = "Too little data for list size in termlist";
79  } else {
80  msg = "Overflowed value for list size in termlist";
81  }
83  }
84 }
85 
88 {
89  LOGCALL(DB, Xapian::termcount, "GlassTermList::get_doclength", NO_ARGS);
90  RETURN(doclen);
91 }
92 
95 {
96  LOGCALL(DB, Xapian::termcount, "GlassTermList::get_unique_terms", NO_ARGS);
97  // get_unique_terms() really ought to only count terms with wdf > 0, but
98  // that's expensive to calculate on demand, so for now let's just ensure
99  // unique_terms <= doclen.
100  RETURN(min(termlist_size, doclen));
101 }
102 
105 {
106  LOGCALL(DB, Xapian::termcount, "GlassTermList::get_approx_size", NO_ARGS);
108 }
109 
110 void
112 {
113  LOGCALL_VOID(DB, "GlassTermList::accumulate_stats", stats);
114  Assert(pos != NULL);
115  stats.accumulate(shard_index,
117 }
118 
121 {
122  LOGCALL(DB, Xapian::termcount, "GlassTermList::get_wdf", NO_ARGS);
124 }
125 
128 {
129  LOGCALL(DB, Xapian::doccount, "GlassTermList::get_termfreq", NO_ARGS);
130  if (current_termfreq == 0)
133 }
134 
135 TermList *
137 {
138  LOGCALL(DB, TermList *, "GlassTermList::next", NO_ARGS);
139  Assert(pos != NULL);
140  if (pos == end) {
141  RETURN(this);
142  }
143 
144  // Reset to 0 to indicate that the termfreq needs to be read.
145  current_termfreq = 0;
146 
147  bool wdf_in_reuse = false;
148  if (!current_term.empty()) {
149  // Find out how much of the previous term to reuse.
150  size_t len = static_cast<unsigned char>(*pos++);
151  if (len > current_term.size()) {
152  // The wdf is also stored in the "reuse" byte.
153  wdf_in_reuse = true;
154  size_t divisor = current_term.size() + 1;
155  current_wdf = len / divisor - 1;
156  len %= divisor;
157  }
158  current_term.resize(len);
159  }
160 
161  // Append the new tail to form the next term.
162  size_t append_len = static_cast<unsigned char>(*pos++);
163  current_term.append(pos, append_len);
164  pos += append_len;
165 
166  // Read the wdf if it wasn't packed into the reuse byte.
167  if (!wdf_in_reuse && !unpack_uint(&pos, end, &current_wdf)) {
168  const char *msg;
169  if (pos == 0) {
170  msg = "Too little data for wdf in termlist";
171  } else {
172  msg = "Overflowed value for wdf in termlist";
173  }
174  throw Xapian::DatabaseCorruptError(msg);
175  }
176 
177  RETURN(NULL);
178 }
179 
180 TermList*
182 {
183  LOGCALL(API, TermList *, "GlassTermList::skip_to", term);
184  while (current_term < term) {
185  if (GlassTermList::next())
186  RETURN(this);
187  }
188  RETURN(NULL);
189 }
190 
193 {
194  LOGCALL(DB, Xapian::termcount, "GlassTermList::positionlist_count", NO_ARGS);
196 }
197 
200 {
201  LOGCALL(DB, PositionList*, "GlassTermList::positionlist_begin", NO_ARGS);
203 }
204 
205 #ifdef DISABLE_GPL_LIBXAPIAN
206 # error GPL source we cannot relicense included in libxapian
207 #endif
void get_freqs(std::string_view term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
Returns frequencies for a term.
PositionList * open_position_list(Xapian::docid did, std::string_view term) const
GlassTermListTable termlist_table
Table storing term lists.
Xapian::doccount get_doccount() const
Virtual methods of Database::Internal.
virtual Xapian::termcount positionlist_count(Xapian::docid did, std::string_view term) const
bool get_exact_entry(std::string_view key, std::string &tag) const
Read an entry from the table, if and only if it is exactly that being asked for.
static std::string make_key(Xapian::docid did)
Xapian::termcount get_doclength() const
Return the length of this document.
Xapian::termcount get_unique_terms() const
Return the number of unique terms.
Xapian::termcount termlist_size
The number of entries in this termlist.
std::string data
The tag value from the termlist table which holds the encoded termlist.
PositionList * positionlist_begin() const
Return a PositionList for the term at the current position.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
TermList * next()
Advance the current position to the next term in the termlist.
TermList * skip_to(std::string_view term)
Skip forward to the specified term.
Xapian::termcount current_wdf
The wdf for the term at the current position.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Xapian::docid did
The document id that this TermList is for.
const char * end
Pointer to the end of the encoded tag value.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Xapian::doccount current_termfreq
The term frequency for the term at the current position.
Xapian::Internal::intrusive_ptr< const GlassDatabase > db
The database we're reading data from.
const char * pos
Current position within the encoded tag value held in data.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
GlassTermList(const GlassTermList &)
Don't allow copying.
Xapian::termcount doclen
The length of document did.
void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Indicates an attempt to access a document not present in the database.
Definition: error.h:662
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
void accumulate(size_t shard_index, Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:71
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:83
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Abstract base class for termlists.
Definition: termlist.h:42
std::string current_term
The current term.
Definition: termlist.h:54
size_t shard_index
Which shard of a multidatabase this is from.
Definition: termlist.h:126
string term
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:480
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Hierarchy of classes which Xapian can throw as exceptions.
Collate statistics and calculate the term weights for the ESet.
A position list in a glass database.
A TermList in a glass database.
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
Convert types to std::string.