xapian-core  1.4.19
serialise.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2014,2015,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
#include <config.h>

#include <xapian/document.h>
#include <xapian/positioniterator.h>
#include <xapian/termiterator.h>
#include <xapian/valueiterator.h>

#include "omassert.h"
#include "api/omenquireinternal.h"
#include "length.h"
#include "serialise.h"
#include "serialise-double.h"
#include "weight/weightinternal.h"

#include "autoptr.h"
#include <set>
#include <string>
39 using namespace std;
40 
41 string
43 {
44  string result;
45 
46  result += encode_length(stats.total_length);
47  result += encode_length(stats.collection_size);
48  result += encode_length(stats.rset_size);
49  // This is redundant, but sent to keep the protocol compatible with older
50  // 1.4.x releases.
51  result += encode_length(stats.total_length);
52  result += static_cast<char>(stats.have_max_part);
53 
54  result += encode_length(stats.termfreqs.size());
55  map<string, TermFreqs>::const_iterator i;
56  for (i = stats.termfreqs.begin(); i != stats.termfreqs.end(); ++i) {
57  result += encode_length(i->first.size());
58  result += i->first;
59  result += encode_length(i->second.termfreq);
60  if (stats.rset_size != 0)
61  result += encode_length(i->second.reltermfreq);
62  result += encode_length(i->second.collfreq);
63  if (stats.have_max_part)
64  result += serialise_double(i->second.max_part);
65  }
66 
67  return result;
68 }
69 
70 void
71 unserialise_stats(const char* p, const char* p_end,
73 {
75  decode_length(&p, p_end, stat.total_length);
76  decode_length(&p, p_end, stat.collection_size);
77  decode_length(&p, p_end, stat.rset_size);
78  // Ignored - only present to keep the protocol compatible with older
79  // 1.4.x releases.
80  decode_length(&p, p_end, dummy);
81  // If p == p_end, the next decode_length() will report it.
82  stat.have_max_part = (p != p_end && *p++);
83 
84  size_t n;
85  decode_length(&p, p_end, n);
86  while (n--) {
87  size_t len;
88  decode_length_and_check(&p, p_end, len);
89  string term(p, len);
90  p += len;
91  Xapian::doccount termfreq;
92  decode_length(&p, p_end, termfreq);
93  Xapian::doccount reltermfreq;
94  if (stat.rset_size == 0) {
95  reltermfreq = 0;
96  } else {
97  decode_length(&p, p_end, reltermfreq);
98  }
99  Xapian::termcount collfreq;
100  decode_length(&p, p_end, collfreq);
101  double max_part = 0.0;
102  if (stat.have_max_part)
103  max_part = unserialise_double(&p, p_end);
104  stat.termfreqs.insert(make_pair(term,
105  TermFreqs(termfreq,
106  reltermfreq,
107  collfreq,
108  max_part)));
109  }
110 }
111 
112 string
114 {
115  string result;
116 
117  result += encode_length(mset.get_firstitem());
118  // Send back the raw matches_* values. MSet::get_matches_estimated()
119  // rounds the estimate lazily, but MSetPostList::get_termfreq_est()
120  // returns the estimate, and the raw estimate is better for that.
121  //
122  // It is also cleaner that a round-trip through serialisation gives you an
123  // object which is as close to the original as possible.
124  result += encode_length(mset.internal->matches_lower_bound);
125  result += encode_length(mset.internal->matches_estimated);
126  result += encode_length(mset.internal->matches_upper_bound);
130  result += serialise_double(mset.get_max_possible());
131  result += serialise_double(mset.get_max_attained());
132 
133  result += serialise_double(mset.internal->percent_factor);
134 
135  result += encode_length(mset.size());
136  for (size_t i = 0; i != mset.size(); ++i) {
137  const Xapian::Internal::MSetItem & item = mset.internal->items[i];
138  result += serialise_double(item.wt);
139  result += encode_length(item.did);
140  result += encode_length(item.sort_key.size());
141  result += item.sort_key;
142  result += encode_length(item.collapse_key.size());
143  result += item.collapse_key;
144  result += encode_length(item.collapse_count);
145  }
146 
147  if (mset.internal->stats)
148  result += serialise_stats(*(mset.internal->stats));
149 
150  return result;
151 }
152 
154 unserialise_mset(const char * p, const char * p_end)
155 {
156  Xapian::doccount firstitem;
157  decode_length(&p, p_end, firstitem);
158  Xapian::doccount matches_lower_bound;
159  decode_length(&p, p_end, matches_lower_bound);
160  Xapian::doccount matches_estimated;
161  decode_length(&p, p_end, matches_estimated);
162  Xapian::doccount matches_upper_bound;
163  decode_length(&p, p_end, matches_upper_bound);
164  Xapian::doccount uncollapsed_lower_bound;
165  decode_length(&p, p_end, uncollapsed_lower_bound);
166  Xapian::doccount uncollapsed_estimated;
167  decode_length(&p, p_end, uncollapsed_estimated);
168  Xapian::doccount uncollapsed_upper_bound;
169  decode_length(&p, p_end, uncollapsed_upper_bound);
170  double max_possible = unserialise_double(&p, p_end);
171  double max_attained = unserialise_double(&p, p_end);
172 
173  double percent_factor = unserialise_double(&p, p_end);
174 
175  vector<Xapian::Internal::MSetItem> items;
176  size_t msize;
177  decode_length(&p, p_end, msize);
178  while (msize-- > 0) {
179  double wt = unserialise_double(&p, p_end);
180  Xapian::docid did;
181  decode_length(&p, p_end, did);
182  size_t len;
183  decode_length_and_check(&p, p_end, len);
184  string sort_key(p, len);
185  p += len;
186  decode_length_and_check(&p, p_end, len);
187  string key(p, len);
188  p += len;
189  Xapian::doccount collapse_cnt;
190  decode_length(&p, p_end, collapse_cnt);
191  items.push_back(Xapian::Internal::MSetItem(wt, did, key, collapse_cnt));
192  swap(items.back().sort_key, sort_key);
193  }
194 
195  AutoPtr<Xapian::Weight::Internal> stats;
196  if (p != p_end) {
197  stats.reset(new Xapian::Weight::Internal());
198  unserialise_stats(p, p_end, *(stats.get()));
199  }
200 
201  Xapian::MSet mset;
202  mset.internal = new Xapian::MSet::Internal(
203  firstitem,
204  matches_upper_bound,
205  matches_lower_bound,
206  matches_estimated,
207  uncollapsed_upper_bound,
208  uncollapsed_lower_bound,
209  uncollapsed_estimated,
210  max_possible, max_attained,
211  items, percent_factor);
212  mset.internal->stats = stats.release();
213  return mset;
214 }
215 
216 string
218 {
219  string result;
220  const set<Xapian::docid> & items = rset.internal->get_items();
221  set<Xapian::docid>::const_iterator i;
222  Xapian::docid lastdid = 0;
223  for (i = items.begin(); i != items.end(); ++i) {
224  Xapian::docid did = *i;
225  result += encode_length(did - lastdid - 1);
226  lastdid = did;
227  }
228  return result;
229 }
230 
232 unserialise_rset(const string &s)
233 {
234  Xapian::RSet rset;
235 
236  const char * p = s.data();
237  const char * p_end = p + s.size();
238 
239  Xapian::docid did = 0;
240  while (p != p_end) {
241  Xapian::docid inc;
242  decode_length(&p, p_end, inc);
243  did += inc + 1;
244  rset.add_document(did);
245  }
246 
247  return rset;
248 }
249 
250 string
252 {
253  string result;
254 
255  size_t n = doc.values_count();
256  result += encode_length(n);
257  Xapian::ValueIterator value;
258  for (value = doc.values_begin(); value != doc.values_end(); ++value) {
259  result += encode_length(value.get_valueno());
260  result += encode_length((*value).size());
261  result += *value;
262  --n;
263  }
264  Assert(n == 0);
265 
266  n = doc.termlist_count();
267  result += encode_length(n);
269  for (term = doc.termlist_begin(); term != doc.termlist_end(); ++term) {
270  result += encode_length((*term).size());
271  result += *term;
272  result += encode_length(term.get_wdf());
273 
274  size_t x = term.positionlist_count();
275  result += encode_length(x);
277  Xapian::termpos oldpos = 0;
278  for (pos = term.positionlist_begin(); pos != term.positionlist_end(); ++pos) {
279  Xapian::termpos diff = *pos - oldpos;
280  string delta = encode_length(diff);
281  result += delta;
282  oldpos = *pos;
283  --x;
284  }
285  Assert(x == 0);
286  --n;
287  }
288  AssertEq(n, 0);
289 
290  result += doc.get_data();
291  return result;
292 }
293 
295 unserialise_document(const string &s)
296 {
297  Xapian::Document doc;
298  const char * p = s.data();
299  const char * p_end = p + s.size();
300 
301  size_t n_values;
302  decode_length(&p, p_end, n_values);
303  while (n_values--) {
304  Xapian::valueno slot;
305  decode_length(&p, p_end, slot);
306  size_t len;
307  decode_length_and_check(&p, p_end, len);
308  doc.add_value(slot, string(p, len));
309  p += len;
310  }
311 
312  size_t n_terms;
313  decode_length(&p, p_end, n_terms);
314  while (n_terms--) {
315  size_t len;
316  decode_length_and_check(&p, p_end, len);
317  string term(p, len);
318  p += len;
319 
320  // Set all the wdf using add_term, then pass wdf_inc 0 to add_posting.
321  Xapian::termcount wdf;
322  decode_length(&p, p_end, wdf);
323  doc.add_term(term, wdf);
324 
325  size_t n_pos;
326  decode_length(&p, p_end, n_pos);
327  Xapian::termpos pos = 0;
328  while (n_pos--) {
329  Xapian::termpos inc;
330  decode_length(&p, p_end, inc);
331  pos += inc;
332  doc.add_posting(term, pos, 0);
333  }
334  }
335 
336  doc.set_data(string(p, p_end - p));
337  return doc;
338 }
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: omenquire.cc:318
#define Assert(COND)
Definition: omassert.h:122
Xapian::doccount size() const
The number of documents in this R-Set.
Definition: omenquire.cc:92
void add_value(Xapian::valueno slot, const std::string &value)
Add a new value.
Definition: omdocument.cc:107
double get_max_possible() const
The maximum possible weight any document could achieve.
Definition: omenquire.cc:290
Xapian::RSet unserialise_rset(const string &s)
Unserialise a serialised Xapian::RSet object.
Definition: serialise.cc:232
length encoded as a string
#define AssertEq(A, B)
Definition: omassert.h:124
Xapian::termcount termlist_count() const
The length of the termlist - i.e.
Definition: omdocument.cc:191
Xapian::Document unserialise_document(const string &s)
Unserialise a serialised Xapian::Document object.
Definition: serialise.cc:295
Xapian::doccount collapse_count
Count of collapses done on collapse_key so far.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
Xapian::docid did
Document id.
ValueIterator values_begin() const
Iterator for the values in this document.
Definition: omdocument.cc:210
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: mset.h:52
double get_max_attained() const
The maximum weight attained by any document.
Definition: omenquire.cc:297
Class for iterating over document values.
Definition: valueiterator.h:40
Class representing a list of search results.
Definition: mset.h:44
STL namespace.
std::string encode_length(T len)
Encode a length as a variable-length string.
Definition: length.h:36
TermIterator termlist_end() const
Equivalent end iterator for termlist_begin().
Definition: document.h:260
Class for iterating over term positions.
std::map< std::string, TermFreqs > termfreqs
Map of term frequencies and relevant term frequencies for the collection.
Xapian::doccount collection_size
Number of documents in the collection.
string sort_key
Used when sorting by value.
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: enquire.h:63
Xapian::doccount rset_size
Number of relevant documents in the collection.
Xapian::MSet unserialise_mset(const char *p, const char *p_end)
Unserialise a serialised Xapian::MSet object.
Definition: serialise.cc:154
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
functions to serialise and unserialise a double
PositionIterator positionlist_end() const
Return an end PositionIterator for the current term.
Definition: termiterator.h:104
Xapian::doccount get_uncollapsed_matches_estimated() const
Estimate of the total number of matching documents before collapsing.
Definition: omenquire.cc:276
const char * dummy[]
Definition: version_h.cc:7
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Xapian::doccount get_firstitem() const
Rank of first item in this MSet.
Definition: omenquire.cc:239
bool have_max_part
Has max_part been set for any term?
Xapian::Weight::Internal class, holding database and term statistics.
double wt
Weight calculated.
Class to hold statistics for a given collection.
string collapse_key
Value which was used to collapse upon.
An item resulting from a query.
Class for iterating over document values.
Xapian::termcount values_count() const
Count the values in this document.
Definition: omdocument.cc:204
Class for iterating over term positions.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
ValueIterator values_end() const
Equivalent end iterator for values_begin().
Definition: document.h:271
string serialise_stats(const Xapian::Weight::Internal &stats)
Serialise a stats object.
Definition: serialise.cc:42
functions to convert classes to strings and back
string serialise_rset(const Xapian::RSet &rset)
Serialise a Xapian::RSet object.
Definition: serialise.cc:217
void add_posting(const std::string &tname, Xapian::termpos tpos, Xapian::termcount wdfinc=1)
Add an occurrence of a term at a particular position.
Definition: omdocument.cc:128
void add_document(Xapian::docid did)
Add a document to the relevance set.
Definition: omenquire.cc:104
string serialise_document(const Xapian::Document &doc)
Serialise a Xapian::Document object.
Definition: serialise.cc:251
The frequencies for a term.
void unserialise_stats(const char *p, const char *p_end, Xapian::Weight::Internal &stat)
Unserialise a serialised stats object.
Definition: serialise.cc:71
std::string serialise_double(double v)
Serialise a double to a string.
void decode_length_and_check(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:112
string serialise_mset(const Xapian::MSet &mset)
Serialise a Xapian::MSet object.
Definition: serialise.cc:113
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::doccount get_uncollapsed_matches_upper_bound() const
Upper bound on the total number of matching documents before collapsing.
Definition: omenquire.cc:283
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Definition: quest.cc:110
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
API for working with documents.
Xapian::totallength total_length
Total length of all documents in the collection.
Class for iterating over a list of terms.
void set_data(const std::string &data)
Set data stored in the document.
Definition: omdocument.cc:78
void decode_length(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:94
TermIterator termlist_begin() const
Iterator for the terms in this document.
Definition: omdocument.cc:197
PositionIterator positionlist_begin() const
Return a PositionIterator for the current term.
A handle representing a document in a Xapian database.
Definition: document.h:61
Wrapper around standard unique_ptr template.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Xapian::valueno get_valueno() const
Return the value slot number for the current position.
A relevance set (R-Set).
Definition: enquire.h:60
Xapian::doccount get_uncollapsed_matches_lower_bound() const
Lower bound on the total number of matching documents before collapsing.
Definition: omenquire.cc:269
void add_term(const std::string &tname, Xapian::termcount wdfinc=1)
Add a term to the document, without positional information.
Definition: omdocument.cc:140