xapian-core  2.0.0
honey_termlisttable.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2010,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "honey_termlisttable.h"
24 
25 #include <xapian/document.h>
26 #include <xapian/error.h>
27 #include <xapian/termiterator.h>
28 
29 #include "debuglog.h"
30 #include "omassert.h"
31 #include "pack.h"
32 #include "stringutils.h"
33 
34 #include <string>
35 
36 using namespace std;
37 
38 void
40  const Xapian::Document& doc,
41  Xapian::termcount doclen)
42 {
43  LOGCALL_VOID(DB, "HoneyTermListTable::set_termlist", did | doc | doclen);
44 
45  Xapian::doccount termlist_size = doc.termlist_count();
46 
47  string tag;
48  // FIXME: Need to encode value slots used here. For now, just encode no
49  // slots as used.
50  tag += '\0';
51 
52  if (usual(termlist_size != 0)) {
53  pack_uint(tag, termlist_size - 1);
54  pack_uint(tag, doclen);
56  string prev_term = *t;
57 
58  tag += prev_term.size();
59  pack_uint(tag, t.get_wdf());
60  tag += prev_term;
61 
62  while (++t != doc.termlist_end()) {
63  const string& term = *t;
64  // If there's a shared prefix with the previous term, we don't
65  // store it explicitly, but just store the length of the shared
66  // prefix. In general, this is a big win.
67  size_t reuse = common_prefix_length(prev_term, term);
68 
69  // reuse must be <= prev_term.size(), and we know that value while
70  // decoding. So if the wdf is small enough that we can multiply it
71  // by (prev_term.size() + 1), add reuse and fit the result in a
72  // byte, then we can pack reuse and the wdf into a single byte and
73  // save ourselves a byte. We actually need to add one to the wdf
74  // before multiplying so that a wdf of 0 can be detected by the
75  // decoder.
76  Xapian::termcount wdf = t.get_wdf();
77  // If wdf >= 127, then we aren't going to be able to pack it in so
78  // don't even try to avoid any risk of the calculation overflowing
79  // and making us think we can.
80  size_t packed;
81  if (wdf < 127 &&
82  (packed = (wdf + 1) * (prev_term.size() + 1) + reuse) < 256) {
83  // We can pack the wdf into the same byte.
84  tag += char(packed);
85  } else {
86  tag += char(reuse);
87  pack_uint(tag, wdf);
88  }
89  tag += char(term.size() - reuse);
90  tag.append(term.data() + reuse, term.size() - reuse);
91 
92  prev_term = *t;
93  }
94  } else {
95  Assert(doclen == 0);
96  Assert(doc.termlist_begin() == doc.termlist_end());
97  }
98 
99  if (rare(tag.size() == 1 && tag[0] == '\0')) {
100  // No slots used or terms.
101  del(make_key(did));
102  } else {
103  add(make_key(did), tag);
104  }
105 }
void set_termlist(Xapian::docid did, const Xapian::Document &doc, Xapian::termcount doclen)
Set the termlist data for document did.
Class representing a document.
Definition: document.h:64
TermIterator termlist_end() const noexcept
End iterator corresponding to termlist_begin().
Definition: document.h:219
Xapian::termcount termlist_count() const
Return the number of distinct terms in this document.
Definition: document.cc:174
TermIterator termlist_begin() const
Start iterating the terms in this document.
Definition: document.cc:179
Class for iterating over a list of terms.
Definition: termiterator.h:41
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
string term
Debug logging macros.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Class representing a document.
Hierarchy of classes which Xapian can throw as exceptions.
Subclass of HoneyTable which holds termlists.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
Various handy string-related helpers.
std::string::size_type common_prefix_length(std::string_view a, std::string_view b)
Definition: stringutils.h:128
Class for iterating over a list of terms.