xapian-core  1.4.20
chert_termlist.cc
Go to the documentation of this file.
1 /* chert_termlist.cc: Termlists in a chert database
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2002 Ananova Ltd
5  * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2014 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 #include "chert_termlist.h"
25 
26 #include "xapian/error.h"
27 
28 #include "expand/expandweight.h"
29 #include "chert_positionlist.h"
30 #include "debuglog.h"
31 #include "omassert.h"
32 #include "pack.h"
33 #include "str.h"
34 
35 using namespace std;
37 
39  Xapian::docid did_)
40  : db(db_), did(did_), current_wdf(0), current_termfreq(0)
41 {
42  LOGCALL_CTOR(DB, "ChertTermList", db_ | did_);
43 
45  data))
46  throw Xapian::DocNotFoundError("No termlist for document " + str(did));
47 
48  pos = data.data();
49  end = pos + data.size();
50 
51  if (pos == end) {
52  doclen = 0;
53  termlist_size = 0;
54  return;
55  }
56 
57  // Read doclen
58  if (!unpack_uint(&pos, end, &doclen)) {
59  const char *msg;
60  if (pos == 0) {
61  msg = "Too little data for doclen in termlist";
62  } else {
63  msg = "Overflowed value for doclen in termlist";
64  }
66  }
67 
68  // Read termlist_size
69  if (!unpack_uint(&pos, end, &termlist_size)) {
70  const char *msg;
71  if (pos == 0) {
72  msg = "Too little data for list size in termlist";
73  } else {
74  msg = "Overflowed value for list size in termlist";
75  }
77  }
78 }
79 
82 {
83  LOGCALL(DB, chert_doclen_t, "ChertTermList::get_doclength", NO_ARGS);
84  RETURN(doclen);
85 }
86 
89 {
90  LOGCALL(DB, Xapian::termcount, "ChertTermList::get_approx_size", NO_ARGS);
92 }
93 
94 void
96 {
97  LOGCALL_VOID(DB, "ChertTermList::accumulate_stats", stats);
98  Assert(!at_end());
99  stats.accumulate(shard_index,
101 }
102 
103 string
105 {
106  LOGCALL(DB, string, "ChertTermList::get_termname", NO_ARGS);
108 }
109 
112 {
113  LOGCALL(DB, Xapian::termcount, "ChertTermList::get_wdf", NO_ARGS);
115 }
116 
119 {
120  LOGCALL(DB, Xapian::doccount, "ChertTermList::get_termfreq", NO_ARGS);
121  if (current_termfreq == 0)
124 }
125 
126 TermList *
128 {
129  LOGCALL(DB, TermList *, "ChertTermList::next", NO_ARGS);
130  Assert(!at_end());
131  if (pos == end) {
132  pos = NULL;
133  RETURN(NULL);
134  }
135 
136  // Reset to 0 to indicate that the termfreq needs to be read.
137  current_termfreq = 0;
138 
139  bool wdf_in_reuse = false;
140  if (!current_term.empty()) {
141  // Find out how much of the previous term to reuse.
142  size_t len = static_cast<unsigned char>(*pos++);
143  if (len > current_term.size()) {
144  // The wdf is also stored in the "reuse" byte.
145  wdf_in_reuse = true;
146  size_t divisor = current_term.size() + 1;
147  current_wdf = len / divisor - 1;
148  len %= divisor;
149  }
150  current_term.resize(len);
151  }
152 
153  // Append the new tail to form the next term.
154  size_t append_len = static_cast<unsigned char>(*pos++);
155  current_term.append(pos, append_len);
156  pos += append_len;
157 
158  // Read the wdf if it wasn't packed into the reuse byte.
159  if (!wdf_in_reuse && !unpack_uint(&pos, end, &current_wdf)) {
160  const char *msg;
161  if (pos == 0) {
162  msg = "Too little data for wdf in termlist";
163  } else {
164  msg = "Overflowed value for wdf in termlist";
165  }
166  throw Xapian::DatabaseCorruptError(msg);
167  }
168 
169  RETURN(NULL);
170 }
171 
172 TermList *
173 ChertTermList::skip_to(const string & term)
174 {
175  LOGCALL(API, TermList *, "ChertTermList::skip_to", term);
176  while (pos != NULL && current_term < term) {
177  (void)ChertTermList::next();
178  }
179  RETURN(NULL);
180 }
181 
182 bool
184 {
185  LOGCALL(DB, bool, "ChertTermList::at_end", NO_ARGS);
186  RETURN(pos == NULL);
187 }
188 
191 {
192  LOGCALL(DB, Xapian::termcount, "ChertTermList::positionlist_count", NO_ARGS);
194 }
195 
198 {
199  LOGCALL(DB, Xapian::PositionIterator, "ChertTermList::positionlist_begin", NO_ARGS);
202 }
std::string current_term
The termname at the current position.
#define RETURN(A)
Definition: debuglog.h:482
#define Assert(COND)
Definition: omassert.h:122
A position list in a chert database.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
chert_doclen_t doclen
The length of document did.
Xapian::doccount get_doccount() const
Virtual methods of Database::Internal.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:477
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Convert types to std::string.
ChertPositionListTable position_table
Table storing position lists.
const char * end
Pointer to the end of the encoded tag value.
Hierarchy of classes which Xapian can throw as exceptions.
std::string get_termname() const
Return the termname at the current position.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
void accumulate(size_t shard_index, Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:76
bool at_end() const
Return true if the current position is past the last term in this list.
Collate statistics and calculate the term weights for the ESet.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::termcount positionlist_count(Xapian::docid did, const string &term) const
Return the number of entries in specified position list.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
TermList * skip_to(const std::string &term)
Skip forward to the specified term.
bool get_exact_entry(const std::string &key, std::string &tag) const
Read an entry from the table, if and only if it is exactly that being asked for.
Xapian::termcount termlist_size
The number of entries in this termlist.
string str(int value)
Convert int to std::string.
Definition: str.cc:90
Class for iterating over term positions.
static std::string make_key(Xapian::docid did)
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:478
A position list in a chert database.
Indicates an attempt to access a document not present in the database.
Definition: error.h:674
A TermList in a chert database.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:409
ChertTermListTable termlist_table
Table storing term lists.
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
ChertTermList(const ChertTermList &)
Don't allow copying.
size_t shard_index
Which shard of a multidatabase this is from.
Definition: termlist.h:114
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
Pack types into strings and unpack them again.
std::string data
The tag value from the termlist table which holds the encoded termlist.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:413
Various assertion macros.
void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Xapian::termcount current_wdf
The wdf for the term at the current position.
const char * pos
Current position within the encoded tag value held in data.
unsigned int chert_doclen_t
An integer type for storing the length of a document - i.e., the sum of the wdfs of the terms in the document.
Definition: chert_types.h:51
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:81
chert_doclen_t get_doclength() const
Return the length of this document.
Xapian::Internal::intrusive_ptr< const ChertDatabase > db
The database we're reading data from.
Xapian::doccount current_termfreq
The term frequency for the term at the current position.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:476
void get_freqs(const string &term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
Virtual methods of Database::Internal.
Xapian::docid did
The document id that this TermList is for.