xapian-core  1.4.25
chert_termlisttable.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "chert_termlisttable.h"
24 
25 #include <xapian/document.h>
26 #include <xapian/error.h>
27 #include <xapian/termiterator.h>
28 
29 #include "debuglog.h"
30 #include "omassert.h"
31 #include "pack.h"
32 #include "stringutils.h"
33 
34 #include <string>
35 
36 using namespace std;
37 
38 void
40  const Xapian::Document & doc,
41  chert_doclen_t doclen)
42 {
43  LOGCALL_VOID(DB, "ChertTermListTable::set_termlist", did | doc | doclen);
44 
45  string tag;
46  pack_uint(tag, doclen);
47 
48  Xapian::doccount termlist_size = doc.termlist_count();
49  if (termlist_size == 0) {
50  // doclen is sum(wdf) so should be zero if there are no terms.
51  Assert(doclen == 0);
52  Assert(doc.termlist_begin() == doc.termlist_end());
53  add(make_key(did), string());
54  return;
55  }
56 
58  if (t != doc.termlist_end()) {
59  pack_uint(tag, termlist_size);
60  string prev_term = *t;
61 
62  tag += prev_term.size();
63  tag += prev_term;
64  pack_uint(tag, t.get_wdf());
65  --termlist_size;
66 
67  while (++t != doc.termlist_end()) {
68  const string & term = *t;
69  // If there's a shared prefix with the previous term, we don't
70  // store it explicitly, but just store the length of the shared
71  // prefix. In general, this is a big win.
72  size_t reuse = common_prefix_length(prev_term, term);
73 
74  // reuse must be <= prev_term.size(), and we know that value while
75  // decoding. So if the wdf is small enough that we can multiply it
76  // by (prev_term.size() + 1), add reuse and fit the result in a
77  // byte, then we can pack reuse and the wdf into a single byte and
78  // save ourselves a byte. We actually need to add one to the wdf
79  // before multiplying so that a wdf of 0 can be detected by the
80  // decoder.
81  size_t packed = 0;
82  Xapian::termcount wdf = t.get_wdf();
83  // If wdf >= 128, then we aren't going to be able to pack it in so
84  // don't even try to avoid the calculation overflowing and making
85  // us think we can.
86  if (wdf < 127)
87  packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
88 
89  if (packed && packed < 256) {
90  // We can pack the wdf into the same byte.
91  tag += char(packed);
92  tag += char(term.size() - reuse);
93  tag.append(term.data() + reuse, term.size() - reuse);
94  } else {
95  tag += char(reuse);
96  tag += char(term.size() - reuse);
97  tag.append(term.data() + reuse, term.size() - reuse);
98  // FIXME: pack wdf after reuse next time we rejig the format
99  // incompatibly.
100  pack_uint(tag, wdf);
101  }
102 
103  prev_term = *t;
104  --termlist_size;
105  }
106  }
107  Assert(termlist_size == 0);
108  add(make_key(did), tag);
109 }
Subclass of ChertTable which holds termlists.
#define Assert(COND)
Definition: omassert.h:122
void set_termlist(Xapian::docid did, const Xapian::Document &doc, chert_doclen_t doclen)
Set the termlist data for document did.
Xapian::termcount termlist_count() const
The length of the termlist - i.e.
Definition: omdocument.cc:191
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:488
STL namespace.
TermIterator termlist_end() const
Equivalent end iterator for termlist_begin().
Definition: document.h:270
Hierarchy of classes which Xapian can throw as exceptions.
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:382
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
std::string::size_type common_prefix_length(const std::string &a, const std::string &b)
Definition: stringutils.h:123
Pack types into strings and unpack them again.
Various handy helpers which std::string really should provide.
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
unsigned int chert_doclen_t
An integer type for storing the length of a document - ie, the sum of the wdfs of the terms in the do...
Definition: chert_types.h:51
API for working with documents.
Class for iterating over a list of terms.
TermIterator termlist_begin() const
Start iterating the terms in this document.
Definition: omdocument.cc:197
A handle representing a document in a Xapian database.
Definition: document.h:61
string make_key(Xapian::docid did)
Definition: chert_record.cc:35
Debug logging macros.