chert_termlist.cc
Go to the documentation of this file.
1 /* chert_termlist.cc: Termlists in a chert database
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2002 Ananova Ltd
5  * Copyright 2002,2003,2004,2006,2007,2008,2010,2011,2014 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 #include "chert_termlist.h"
25 
26 #include "xapian/error.h"
27 
28 #include "expand/expandweight.h"
29 #include "chert_positionlist.h"
30 #include "debuglog.h"
31 #include "omassert.h"
32 #include "pack.h"
33 #include "str.h"
34 
35 using namespace std;
37 
// Constructor: stores the database handle and document id, then decodes the
// header of the encoded termlist held in `data` (the document length followed
// by the number of entries).  pos is left pointing at the first entry.
//
// NOTE(review): this listing has lost several source lines (listing lines 38,
// 44, 65 and 76), including the start of the signature - confirm the details
// below against the upstream file.
39  Xapian::docid did_)
40  : db(db_), did(did_), current_wdf(0), current_termfreq(0)
41 {
42  LOGCALL_CTOR(DB, "ChertTermList", db_ | did_);
43 
// NOTE(review): the start of this condition (listing line 44) is missing;
// presumably it is a lookup which fills `data` with the termlist entry for
// `did` - confirm.  When the condition holds the document is reported as not
// found.
45  data))
46  throw Xapian::DocNotFoundError("No termlist for document " + str(did));
47 
// pos and end delimit the encoded bytes which are still to be decoded.
48  pos = data.data();
49  end = pos + data.size();
50 
// An empty entry is accepted: it is treated as zero length with no terms.
51  if (pos == end) {
52  doclen = 0;
53  termlist_size = 0;
54  return;
55  }
56 
57  // Read doclen
58  if (!unpack_uint(&pos, end, &doclen)) {
59  const char *msg;
// A null pos after a failed unpack is reported as truncated data, anything
// else as an overflowed value.
60  if (pos == 0) {
61  msg = "Too little data for doclen in termlist";
62  } else {
63  msg = "Overflowed value for doclen in termlist";
64  }
// NOTE(review): listing line 65 is missing here; next() throws
// Xapian::DatabaseCorruptError(msg) at the equivalent point - presumably the
// same happens here, confirm.
66  }
67 
68  // Read termlist_size
69  if (!unpack_uint(&pos, end, &termlist_size)) {
70  const char *msg;
71  if (pos == 0) {
72  msg = "Too little data for list size in termlist";
73  } else {
74  msg = "Overflowed value for list size in termlist";
75  }
// NOTE(review): listing line 76 is missing here; presumably msg is thrown as
// for doclen above - confirm.
77  }
78 }
79 
// ChertTermList::get_doclength(): return the length of this document, i.e. the
// doclen value decoded by the constructor.
// NOTE(review): the signature lines are missing from this listing.
82 {
83  LOGCALL(DB, chert_doclen_t, "ChertTermList::get_doclength", NO_ARGS);
84  RETURN(doclen);
85 }
86 
// ChertTermList::get_approx_size(): return the approximate size of this
// termlist as a Xapian::termcount.
// NOTE(review): the signature and the return statement (listing line 91) are
// missing from this listing; presumably termlist_size is returned - confirm.
89 {
90  LOGCALL(DB, Xapian::termcount, "ChertTermList::get_approx_size", NO_ARGS);
92 }
93 
// ChertTermList::accumulate_stats(): collate weighting information for the
// current term into `stats`.  Must not be called once at_end() is true.
// NOTE(review): the signature (listing line 95) and the statement which
// updates `stats` (listing line 99) are missing from this listing.
94 void
96 {
97  LOGCALL_VOID(DB, "ChertTermList::accumulate_stats", stats);
98  Assert(!at_end());
100 }
101 
// ChertTermList::get_termname(): return the term name at the current position.
// NOTE(review): the signature and the return statement (listing line 106) are
// missing from this listing; presumably current_term is returned - confirm.
102 string
104 {
105  LOGCALL(DB, string, "ChertTermList::get_termname", NO_ARGS);
107 }
108 
// ChertTermList::get_wdf(): return the wdf for the term at the current
// position.
// NOTE(review): the signature and the return statement (listing line 113) are
// missing from this listing; presumably current_wdf is returned - confirm.
111 {
112  LOGCALL(DB, Xapian::termcount, "ChertTermList::get_wdf", NO_ARGS);
114 }
115 
// ChertTermList::get_termfreq(): return the term frequency for the term at the
// current position.
// NOTE(review): the signature and listing lines 121-122 (the body of the `if`
// and the return) are missing from this listing.
118 {
119  LOGCALL(DB, Xapian::doccount, "ChertTermList::get_termfreq", NO_ARGS);
// next() resets current_termfreq to 0 to mark it as "needs to be read", so
// this branch is where the value gets fetched on first use for each term.
120  if (current_termfreq == 0)
123 }
124 
125 TermList *
127 {
128  LOGCALL(DB, TermList *, "ChertTermList::next", NO_ARGS);
129  Assert(!at_end());
130  if (pos == end) {
131  pos = NULL;
132  RETURN(NULL);
133  }
134 
135  // Reset to 0 to indicate that the termfreq needs to be read.
136  current_termfreq = 0;
137 
138  bool wdf_in_reuse = false;
139  if (!current_term.empty()) {
140  // Find out how much of the previous term to reuse.
141  size_t len = static_cast<unsigned char>(*pos++);
142  if (len > current_term.size()) {
143  // The wdf is also stored in the "reuse" byte.
144  wdf_in_reuse = true;
145  size_t divisor = current_term.size() + 1;
146  current_wdf = len / divisor - 1;
147  len %= divisor;
148  }
149  current_term.resize(len);
150  }
151 
152  // Append the new tail to form the next term.
153  size_t append_len = static_cast<unsigned char>(*pos++);
154  current_term.append(pos, append_len);
155  pos += append_len;
156 
157  // Read the wdf if it wasn't packed into the reuse byte.
158  if (!wdf_in_reuse && !unpack_uint(&pos, end, &current_wdf)) {
159  const char *msg;
160  if (pos == 0) {
161  msg = "Too little data for wdf in termlist";
162  } else {
163  msg = "Overflowed value for wdf in termlist";
164  }
165  throw Xapian::DatabaseCorruptError(msg);
166  }
167 
168  RETURN(NULL);
169 }
170 
171 TermList *
172 ChertTermList::skip_to(const string & term)
173 {
174  LOGCALL(API, TermList *, "ChertTermList::skip_to", term);
175  while (pos != NULL && current_term < term) {
176  (void)ChertTermList::next();
177  }
178  RETURN(NULL);
179 }
180 
// ChertTermList::at_end(): return true if the current position is past the
// last term in this list.  next() sets pos to NULL once the encoded data is
// exhausted, which is what is tested here.
// NOTE(review): the signature (listing line 182) is missing from this listing.
181 bool
183 {
184  LOGCALL(DB, bool, "ChertTermList::at_end", NO_ARGS);
185  RETURN(pos == NULL);
186 }
187 
// ChertTermList::positionlist_count(): return the length of the position list
// for the current position.
// NOTE(review): the signature and the return statement (listing line 192) are
// missing from this listing; presumably the count is looked up in the
// database's position table for (did, current_term) - confirm.
190 {
191  LOGCALL(DB, Xapian::termcount, "ChertTermList::positionlist_count", NO_ARGS);
193 }
194 
// ChertTermList::positionlist_begin(): return a PositionIterator for the
// current position.
// NOTE(review): the signature and the statements which build the iterator
// (listing lines 199-200) are missing from this listing.
197 {
198  LOGCALL(DB, Xapian::PositionIterator, "ChertTermList::positionlist_begin", NO_ARGS);
201 }
std::string current_term
The termname at the current position.
bool at_end() const
Return true if the current position is past the last term in this list.
#define RETURN(A)
Definition: debuglog.h:459
#define Assert(COND)
Definition: omassert.h:122
A position list in a chert database.
bool get_exact_entry(const std::string &key, std::string &tag) const
Read an entry from the table, if and only if it is exactly that being asked for.
chert_doclen_t doclen
The length of document did.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Xapian::doccount get_doccount() const
Virtual methods of Database::Internal.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:454
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Convert types to std::string.
void accumulate(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:80
ChertPositionListTable position_table
Table storing position lists.
const char * end
Pointer to the end of the encoded tag value.
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
std::string get_termname() const
Return the termname at the current position.
Collate statistics and calculate the term weights for the ESet.
void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
TermList * skip_to(const std::string &term)
Skip forward to the specified term.
Xapian::termcount termlist_size
The number of entries in this termlist.
string str(int value)
Convert int to std::string.
Definition: str.cc:84
Class for iterating over term positions.
Xapian::termcount positionlist_count(Xapian::docid did, const string &term) const
Return the number of entries in specified position list.
static std::string make_key(Xapian::docid did)
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
void get_freqs(const string &term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
Virtual methods of Database::Internal.
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:455
A position list in a chert database.
Indicates an attempt to access a document not present in the database.
Definition: error.h:658
A TermList in a chert database.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
ChertTermListTable termlist_table
Table storing term lists.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
ChertTermList(const ChertTermList &)
Don't allow copying.
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
Pack types into strings and unpack them again.
std::string data
The tag value from the termlist table which holds the encoded termlist.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:395
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Xapian::termcount current_wdf
The wdf for the term at the current position.
const char * pos
Current position within the encoded tag value held in data.
unsigned int chert_doclen_t
An integer type for storing the length of a document - i.e., the sum of the wdfs of the terms in the document.
Definition: chert_types.h:51
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:81
Xapian::Internal::intrusive_ptr< const ChertDatabase > db
The database we're reading data from.
Xapian::doccount current_termfreq
The term frequency for the term at the current position.
chert_doclen_t get_doclength() const
Return the length of this document.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:453
Xapian::docid did
The document id that this TermList is for.

Documentation for Xapian (version 1.4.1).
Generated on Sun Oct 23 2016 by Doxygen 1.8.8.