xapian-core  2.0.0
serialise.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/document.h>
25 #include <xapian/termiterator.h>
26 #include <xapian/valueiterator.h>
27 
28 #include "omassert.h"
29 #include "api/rsetinternal.h"
30 #include "pack.h"
31 #include "serialise.h"
32 #include "serialise-double.h"
33 #include "stringutils.h"
34 #include "weight/weightinternal.h"
35 
36 #include <string>
37 #include <string_view>
38 
39 using namespace std;
40 
// Serialise a stats object into a string.  The index at the end of this page
// gives the signature as
// `string serialise_stats(const Xapian::Weight::Internal &stats)`.
//
// NOTE(review): this is a rendered listing with gaps.  Listing line 42 (the
// function name and parameter list) and listing lines 51-54 (the rest of the
// `db_doclength_upper_bound -` expression and whatever follows it) are not
// shown here - consult the real serialise.cc before relying on this text.
//
// Layout written, as far as the visible lines show: a run of collection-wide
// header fields, then the number of entries in stats.termfreqs, then one
// prefix-compressed record per entry.
41 string
43 {
44  string result;
45 
    // Header fields.  unserialise_stats() unpacks its header in a fixed
    // order, so the order used here has to stay in step with it.
46  pack_uint(result, stats.total_length);
47  pack_uint(result, stats.collection_size);
48  pack_uint(result, stats.rset_size);
49  pack_uint(result, stats.db_doclength_lower_bound);
50  pack_uint(result, stats.db_doclength_upper_bound -
    // NOTE(review): the right-hand side of the subtraction above is on a
    // listing line which is not shown.
55  pack_bool(result, stats.have_max_part);
56 
    // Number of per-term records which follow.
57  pack_uint(result, stats.termfreqs.size());
    // View of the previous entry's key (empty for the first entry); it refers
    // to the map key itself, so no copy of the term is made.
58  string_view prev_term;
59  for (auto&& i : stats.termfreqs) {
60  const string& term = i.first;
61  // We reduce the size of the encoding by reusing any prefix which is in
62  // common with the previous term. This is much more compact if there
63  // are a lot of terms, especially if they share a prefix such as an
64  // OP_WILDCARD which expands to a lot of terms.
    // The third argument (255) presumably caps the returned length so that
    // it fits in the single byte written below - TODO confirm against
    // stringutils.h.
65  size_t reuse = common_prefix_length(prev_term, term, 255);
    // Number of bytes of `term` which are not shared with prev_term.
66  size_t append = term.size() - reuse;
67  if (reuse == prev_term.size() && usual(term.size() <= 255)) {
68  // Reuse the whole of the previous term. In this case we store the
69  // new length for the reuse count, which is longer than a valid
70  // reuse count. This saves a byte (or two if the new term is
71  // >= 128 bytes long).
    // The byte written is therefore strictly greater than the previous
    // term's length, which is how unserialise_stats() recognises this form.
72  AssertRel(term.size(), >, reuse);
73  result += char(term.size());
74  } else {
    // General form: one byte of reuse count, then the length of the
    // new suffix as a packed unsigned integer.
75  result += char(reuse);
76  pack_uint(result, append);
77  }
    // The suffix bytes themselves are written in both forms.
78  result.append(term.data() + reuse, append);
79  pack_uint(result, i.second.termfreq);
    // reltermfreq is only written when there is a non-empty RSet; the
    // decoder applies the same test to know whether to read it.
80  if (stats.rset_size != 0)
81  pack_uint(result, i.second.reltermfreq);
82  pack_uint(result, i.second.collfreq);
    // Likewise max_part is only written when have_max_part is set.
83  if (stats.have_max_part)
84  result += serialise_double(i.second.max_part);
85  prev_term = term;
86  }
87 
88  return result;
89 }
90 
// Unserialise a serialised stats object from the bytes in [p, p_end),
// storing the decoded values into `stat`.  The index at the end of this page
// gives the signature as `void unserialise_stats(const char *p,
// const char *p_end, Xapian::Weight::Internal &stat)`.
//
// NOTE(review): this is a rendered listing with gaps.  Listing line 93 (the
// rest of the parameter list), the bodies of the failure branches (listing
// lines 105, 116, 124 and 134) and listing lines 107-108 are not shown.  The
// page index lists unpack_throw_serialisation_error() ("Throw appropriate
// SerialisationError"), which is presumably what the failure branches call -
// confirm against the real serialise.cc.
91 void
92 unserialise_stats(const char* p, const char* p_end,
94 {
    // Receives the number of per-term records which follow the header.
95  size_t n;
    // Header fields, unpacked in a fixed order; any failed unpack takes the
    // (not shown) failure branch.
96  if (!unpack_uint(&p, p_end, &stat.total_length) ||
97  !unpack_uint(&p, p_end, &stat.collection_size) ||
98  !unpack_uint(&p, p_end, &stat.rset_size) ||
99  !unpack_uint(&p, p_end, &stat.db_doclength_lower_bound) ||
100  !unpack_uint(&p, p_end, &stat.db_doclength_upper_bound) ||
101  !unpack_uint(&p, p_end, &stat.db_unique_terms_lower_bound) ||
102  !unpack_uint(&p, p_end, &stat.db_unique_terms_upper_bound) ||
103  !unpack_bool(&p, p_end, &stat.have_max_part) ||
104  !unpack_uint(&p, p_end, &n)) {
106  }
109 
    // `term` persists across iterations because each record is encoded
    // relative to the term decoded before it.
110  string term;
111  for ( ; n; --n) {
112  Xapian::doccount termfreq;
    // Stays 0 when the stream carries no reltermfreq (rset_size == 0).
113  Xapian::doccount reltermfreq = 0;
114  Xapian::termcount collfreq;
    // Input exhausted while records are still expected.
115  if (p == p_end) {
117  }
    // First byte of a record: either the number of bytes to keep from the
    // previous term, or (when larger than the previous term's length) the
    // full length of the new term.
118  size_t reuse = static_cast<unsigned char>(*p++);
119  size_t append;
120  if (reuse <= term.size()) {
    // Keep the first `reuse` bytes, then read the explicit suffix length
    // and check that many bytes are still available.
121  term.resize(reuse);
122  if (!unpack_uint(&p, p_end, &append) ||
123  size_t(p_end - p) < append) {
125  }
126  } else {
    // The whole previous term is kept and the byte is the new total
    // length, so the suffix length is the difference.
    // NOTE(review): unlike the branch above, no check that `append`
    // bytes remain before p_end is visible here - verify that truncated
    // input cannot make the append below read past p_end.
127  append = reuse - term.size();
128  }
129  term.append(p, append);
130  p += append;
    // reltermfreq is only present in the stream when rset_size != 0.
131  if (!unpack_uint(&p, p_end, &termfreq) ||
132  (stat.rset_size != 0 && !unpack_uint(&p, p_end, &reltermfreq)) ||
133  !unpack_uint(&p, p_end, &collfreq)) {
135  }
    // max_part is only present in the stream when have_max_part is set.
136  double max_part = 0.0;
137  if (stat.have_max_part)
138  max_part = unserialise_double(&p, p_end);
139  stat.termfreqs.insert(make_pair(term,
140  TermFreqs(termfreq,
141  reltermfreq,
142  collfreq,
143  max_part)));
144  }
145 }
146 
// Serialise a Xapian::RSet object into a string.  The index at the end of
// this page gives the signature as
// `string serialise_rset(const Xapian::RSet &rset)`.
//
// NOTE(review): listing line 148 (the function name and parameter list) is
// not shown in this rendered listing.
147 string
149 {
150  string result;
    // An RSet with no internal object yields an empty string.
151  if (rset.internal) {
152  Xapian::docid lastdid = 0;
153  for (Xapian::docid did : rset.internal->docs) {
    // Each docid is stored as the gap from the previous one, minus one.
    // NOTE(review): this relies on `docs` being iterated in strictly
    // increasing order with docids >= 1, otherwise the unsigned
    // subtraction would wrap - confirm against RSet::Internal.
154  pack_uint(result, did - lastdid - 1);
155  lastdid = did;
156  }
157  }
158  return result;
159 }
160 
// Unserialise a serialised Xapian::RSet object from `s`.  The index at the
// end of this page gives the signature as
// `Xapian::RSet unserialise_rset(const string &s)`.
//
// NOTE(review): listing line 161 (the return type) and listing line 173 (the
// body of the failure branch) are not shown in this rendered listing; the
// latter presumably calls unpack_throw_serialisation_error() - confirm.
162 unserialise_rset(const string &s)
163 {
164  Xapian::RSet rset;
165 
166  const char * p = s.data();
167  const char * p_end = p + s.size();
168 
    // Running docid, rebuilt from the stored gaps; the whole of `s` is
    // consumed, so an empty string gives an empty RSet.
169  Xapian::docid did = 0;
170  while (p != p_end) {
171  Xapian::docid inc;
172  if (!unpack_uint(&p, p_end, &inc)) {
174  }
    // Undo the "gap minus one" encoding used by serialise_rset().
175  did += inc + 1;
176  rset.add_document(did);
177  }
178 
179  return rset;
180 }
181 
// Serialise a Xapian::Document object into a string.  The index at the end
// of this page gives the signature as
// `string serialise_document(const Xapian::Document &doc)`.
//
// NOTE(review): this is a rendered listing with gaps.  Listing line 183 (the
// function name and parameter list), listing line 199 (presumably the
// declaration of the iterator `term`) and listing line 206 (presumably the
// declaration of the iterator `pos`) are not shown.
//
// Layout written: value count, then (slot, value) pairs; term count, then
// for each term its name, wdf, position count and positions; finally the
// document data.
182 string
184 {
185  string result;
186 
    // `n` is counted down while iterating purely so the Assert below can
    // check that values_count() agrees with the number of values visited.
187  size_t n = doc.values_count();
188  pack_uint(result, n);
189  Xapian::ValueIterator value;
190  for (value = doc.values_begin(); value != doc.values_end(); ++value) {
191  pack_uint(result, value.get_valueno());
192  pack_string(result, *value);
193  --n;
194  }
195  Assert(n == 0);
196 
    // Same counting check for the terms.
197  n = doc.termlist_count();
198  pack_uint(result, n);
200  for (term = doc.termlist_begin(); term != doc.termlist_end(); ++term) {
201  pack_string(result, *term);
202  pack_uint(result, term.get_wdf());
203 
204  size_t x = term.positionlist_count();
205  pack_uint(result, x);
    // Positions are stored as the difference from the previous position,
    // with the first one relative to 0.
207  Xapian::termpos oldpos = 0;
208  for (pos = term.positionlist_begin(); pos != term.positionlist_end(); ++pos) {
209  pack_uint(result, *pos - oldpos);
210  oldpos = *pos;
211  --x;
212  }
213  Assert(x == 0);
214  --n;
215  }
216  AssertEq(n, 0);
217 
    // The document data goes last with no length prefix;
    // unserialise_document() takes everything left over as the data.
218  result += doc.get_data();
219  return result;
220 }
221 
// Unserialise a serialised Xapian::Document object from `s`.  The index at
// the end of this page gives the signature as
// `Xapian::Document unserialise_document(string_view s)`.
//
// NOTE(review): this is a rendered listing with gaps.  Listing line 222 (the
// return type) and the bodies of the failure branches (listing lines 231,
// 238, 245, 252, 259 and 265) are not shown; those branches presumably call
// unpack_throw_serialisation_error() - confirm against the real file.
223 unserialise_document(string_view s)
224 {
225  Xapian::Document doc;
226  const char * p = s.data();
227  const char * p_end = p + s.size();
228 
    // Values: a count followed by that many (slot, value) pairs.
229  size_t n_values;
230  if (!unpack_uint(&p, p_end, &n_values)) {
232  }
    // Declared outside the loop so the same string is refilled each time.
233  string value;
234  for ( ; n_values; --n_values) {
235  Xapian::valueno slot;
236  if (!unpack_uint(&p, p_end, &slot) ||
237  !unpack_string(&p, p_end, value)) {
239  }
240  doc.add_value(slot, value);
241  }
242 
    // Terms: a count followed by one record per term.
243  size_t n_terms;
244  if (!unpack_uint(&p, p_end, &n_terms)) {
246  }
247  string term;
248  for ( ; n_terms; --n_terms) {
249  Xapian::termcount wdf;
250  if (!unpack_string(&p, p_end, term) ||
251  !unpack_uint(&p, p_end, &wdf)) {
253  }
254  // Set all the wdf using add_term, then pass wdf_inc 0 to add_posting.
    // (add_posting()'s wdf_inc defaults to 1 according to the page index,
    // so the explicit 0 keeps the positions from adding to the wdf again.)
255  doc.add_term(term, wdf);
256 
257  size_t n_pos;
258  if (!unpack_uint(&p, p_end, &n_pos)) {
260  }
    // Positions were stored as differences, the first relative to 0, so
    // accumulate them to recover the absolute positions.
261  Xapian::termpos pos = 0;
262  for ( ; n_pos; --n_pos) {
263  Xapian::termpos inc;
264  if (!unpack_uint(&p, p_end, &inc)) {
266  }
267  pos += inc;
268  doc.add_posting(term, pos, 0);
269  }
270  }
271 
    // Whatever is left after the terms is the document data.
272  doc.set_data(string_view(p, p_end - p));
273  return doc;
274 }
Class representing a document.
Definition: document.h:64
void set_data(std::string_view data)
Set the document data.
Definition: document.cc:81
std::string get_data() const
Get the document data.
Definition: document.cc:75
void add_term(std::string_view term, Xapian::termcount wdf_inc=1)
Add a term to this document.
Definition: document.cc:87
Xapian::valueno values_count() const
Count the value slots used in this document.
Definition: document.cc:203
ValueIterator values_begin() const
Start iterating the values in this document.
Definition: document.cc:208
TermIterator termlist_end() const noexcept
End iterator corresponding to termlist_begin().
Definition: document.h:219
Xapian::termcount termlist_count() const
Return the number of distinct terms in this document.
Definition: document.cc:174
TermIterator termlist_begin() const
Start iterating the terms in this document.
Definition: document.cc:179
ValueIterator values_end() const noexcept
End iterator corresponding to values_begin().
Definition: document.h:259
void add_value(Xapian::valueno slot, std::string_view value)
Add a value to a slot in this document.
Definition: document.cc:191
void add_posting(std::string_view term, Xapian::termpos term_pos, Xapian::termcount wdf_inc=1)
Add a posting for a term.
Definition: document.cc:111
Class for iterating over term positions.
Class representing a set of documents judged as relevant.
Definition: rset.h:39
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: rset.h:42
void add_document(Xapian::docid did)
Mark a document as relevant.
Definition: rset.cc:55
Class for iterating over a list of terms.
Definition: termiterator.h:41
Class for iterating over document values.
Definition: valueiterator.h:39
Xapian::valueno get_valueno() const
Return the value slot number for the current position.
Class to hold statistics for a given collection.
Xapian::totallength total_length
Total length of all documents in the collection.
Xapian::termcount db_doclength_upper_bound
An upper bound on the maximum length of any document in the database.
bool have_max_part
Has max_part been set for any term?
Xapian::termcount db_doclength_lower_bound
A lower bound on the minimum length of any document in the database.
Xapian::termcount db_unique_terms_lower_bound
A lower bound on the number of unique terms in any document.
Xapian::doccount rset_size
Number of relevant documents in the collection.
Xapian::doccount collection_size
Number of documents in the collection.
std::map< std::string, TermFreqs, std::less<> > termfreqs
Map of term frequencies and relevant term frequencies for the collection.
Xapian::termcount db_unique_terms_upper_bound
An upper bound on the number of unique terms in any document.
#define usual(COND)
Definition: config.h:608
string term
PositionList * p
Xapian::termpos pos
Class representing a document.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
void unpack_throw_serialisation_error(const char *p)
Throw appropriate SerialisationError.
Definition: pack.cc:29
Pack types into strings and unpack them again.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:76
void pack_bool(std::string &s, bool value)
Append an encoded bool to a string.
Definition: pack.h:64
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
void pack_string(std::string &s, std::string_view value)
Append an encoded std::string to a string.
Definition: pack.h:442
Class for iterating over term positions.
Set of documents judged as relevant.
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
string serialise_document(const Xapian::Document &doc)
Serialise a Xapian::Document object.
Definition: serialise.cc:183
string serialise_stats(const Xapian::Weight::Internal &stats)
Serialise a stats object.
Definition: serialise.cc:42
Xapian::Document unserialise_document(string_view s)
Unserialise a serialised Xapian::Document object.
Definition: serialise.cc:223
Xapian::RSet unserialise_rset(const string &s)
Unserialise a serialised Xapian::RSet object.
Definition: serialise.cc:162
void unserialise_stats(const char *p, const char *p_end, Xapian::Weight::Internal &stat)
Unserialise a serialised stats object.
Definition: serialise.cc:92
string serialise_rset(const Xapian::RSet &rset)
Serialise a Xapian::RSet object.
Definition: serialise.cc:148
functions to convert classes to strings and back
Various handy string-related helpers.
std::string::size_type common_prefix_length(std::string_view a, std::string_view b)
Definition: stringutils.h:128
The frequencies for a term.
Class for iterating over a list of terms.
Class for iterating over document values.
Xapian::Weight::Internal class, holding database and term statistics.