xapian-core  2.0.0
honey_postlist_encodings.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2015,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_HONEY_POSTLIST_ENCODINGS_H
22 #define XAPIAN_INCLUDED_HONEY_POSTLIST_ENCODINGS_H
23 
24 #include "pack.h"
25 
26 inline void
28  Xapian::termcount collfreq,
29  Xapian::docid first,
30  Xapian::docid last,
31  Xapian::docid chunk_last,
32  Xapian::termcount first_wdf,
33  Xapian::termcount wdf_max,
34  std::string& out)
35 {
36  Assert(termfreq != 0);
37  pack_uint(out, first - 1);
38  if (termfreq == 1) {
39  // Special case for a term which only occurs in one document. By
40  // Zipf's Law we expect these to be common in natural language
41  // (typically 40-60% of words in a large corpus:
42  // https://en.wikipedia.org/wiki/Hapax_legomenon ). Another common
43  // example is unique ID boolean terms.
44  //
45  // There's no postlist data after the chunk header for such terms
46  // (since we know the first docid and its wdf from the data in
47  // the chunk header) so we can just encode the collfreq if it is
48  // non-zero - when decoding if there's no more data we'll know the
49  // collfreq is zero.
50  if (collfreq) {
51  pack_uint(out, collfreq);
52  }
53  AssertEq(first, last);
54  AssertEq(last, chunk_last);
55  AssertEq(collfreq, wdf_max);
56  AssertEq(collfreq, first_wdf);
57  } else if (termfreq == 2) {
58  // A term which only occurs in two documents. By Zipf's Law these
59  // are also fairly common (typically 10-15% of words in a large
60  // corpus: https://en.wikipedia.org/wiki/Hapax_legomenon )
61  //
62  // We have to encode collfreq == 0 explicitly or else the decoder can't
63  // distinguish between the cases:
64  //
65  // tf = 1; collfreq = x
66  // tf = 2; last = first + x + 1; collfreq = 0
67  //
68  // We turn this to our advantage to encode tf = 2 lists with constant
69  // wdf or where the second wdf is one more than the first (which are
70  // common cases) more compactly, so instead of:
71  //
72  // <first - 1> <collfreq> <last - first - 1> <first_wdf>
73  //
74  // We encode these as:
75  //
76  // <first - 1> <collfreq> <last - first - 1>
77  //
78  // And then when it's omitted: first_wdf = collfreq >> 1
79  //
80  // The collfreq = 0 case is then a particular example of this.
81  pack_uint(out, collfreq);
82  AssertRel(last, >, first);
83  pack_uint(out, last - first - 1);
84  if (first_wdf != (collfreq / 2)) {
85  pack_uint(out, first_wdf);
86  AssertEq(std::max(first_wdf, collfreq - first_wdf), wdf_max);
87  } else {
88  AssertEq(collfreq - first_wdf, wdf_max);
89  }
90  } else if (collfreq == 0) {
91  AssertEq(first_wdf, 0);
92  AssertEq(wdf_max, 0);
93  pack_uint(out, 0u);
94  pack_uint(out, termfreq - 3);
95  pack_uint(out, last - first - (termfreq - 1));
96  pack_uint(out, chunk_last - first);
97  } else {
98  AssertRel(collfreq, >=, termfreq);
99  pack_uint(out, collfreq - termfreq + 1);
100  pack_uint(out, termfreq - 3);
101  pack_uint(out, last - first - (termfreq - 1));
102  pack_uint(out, chunk_last - first);
103  pack_uint(out, first_wdf - 1);
104 
105  if (first_wdf >= collfreq - first_wdf - (termfreq - 2)) {
106  AssertEq(wdf_max, first_wdf);
107  } else {
108  AssertRel(wdf_max, >=, first_wdf);
109  pack_uint(out, wdf_max - first_wdf);
110  }
111  }
112 }
113 
114 inline bool
115 decode_initial_chunk_header(const char** p, const char* end,
116  Xapian::doccount& termfreq,
117  Xapian::termcount& collfreq,
118  Xapian::docid& first,
119  Xapian::docid& last,
120  Xapian::docid& chunk_last,
121  Xapian::termcount& first_wdf,
122  Xapian::termcount& wdf_max)
123 {
124  if (!unpack_uint(p, end, &first)) {
125  return false;
126  }
127  ++first;
128  if (*p == end) {
129  collfreq = 0;
130  } else if (!unpack_uint(p, end, &collfreq)) {
131  return false;
132  }
133  if (*p == end) {
134  // Single occurrence term.
135  termfreq = 1;
136  chunk_last = last = first;
137  wdf_max = first_wdf = collfreq;
138  return true;
139  }
140 
141  if (!unpack_uint(p, end, &termfreq)) {
142  return false;
143  }
144  if (*p == end) {
145  // Double occurrence term with first_wdf = floor(collfreq / 2).
146  chunk_last = last = first + termfreq + 1;
147  termfreq = 2;
148  first_wdf = collfreq / 2;
149  wdf_max = std::max(first_wdf, collfreq - first_wdf);
150  return true;
151  }
152 
153  if (!unpack_uint(p, end, &last)) {
154  return false;
155  }
156  if (*p == end) {
157  // Double occurrence term.
158  Assert(collfreq != 0);
159  first_wdf = last;
160  chunk_last = last = first + termfreq + 1;
161  termfreq = 2;
162  wdf_max = std::max(first_wdf, collfreq - first_wdf);
163  return true;
164  }
165 
166  if (!unpack_uint(p, end, &chunk_last)) {
167  return false;
168  }
169  termfreq += 3;
170  last += first + termfreq - 1;
171  chunk_last += first;
172 
173  if (collfreq == 0) {
174  wdf_max = first_wdf = 0;
175  } else {
176  collfreq += (termfreq - 1);
177  if (!unpack_uint(p, end, &first_wdf)) {
178  return false;
179  }
180  ++first_wdf;
181  if (first_wdf >= collfreq - first_wdf - (termfreq - 2)) {
182  wdf_max = first_wdf;
183  } else {
184  if (!unpack_uint(p, end, &wdf_max)) {
185  return false;
186  }
187  wdf_max += first_wdf;
188  }
189  }
190 
191  return true;
192 }
193 
194 inline bool
195 decode_initial_chunk_header_freqs(const char** p, const char* end,
196  Xapian::doccount& termfreq,
197  Xapian::termcount& collfreq)
198 {
199  Xapian::docid first;
200  if (!unpack_uint(p, end, &first)) {
201  return false;
202  }
203  // Not used in this case.
204  (void)first;
205  if (*p == end) {
206  collfreq = 0;
207  } else if (!unpack_uint(p, end, &collfreq)) {
208  return false;
209  }
210  if (*p == end) {
211  // Single occurrence term.
212  termfreq = 1;
213  return true;
214  }
215 
216  if (!unpack_uint(p, end, &termfreq)) {
217  return false;
218  }
219  if (*p == end) {
220  // Double occurrence term with first_wdf = floor(collfreq / 2).
221  termfreq = 2;
222  return true;
223  }
224 
225  Xapian::docid last;
226  if (!unpack_uint(p, end, &last)) {
227  return false;
228  }
229  // Not used in this case.
230  (void)last;
231  if (*p == end) {
232  // Double occurrence term.
233  Assert(collfreq != 0);
234  termfreq = 2;
235  return true;
236  }
237 
238  termfreq += 3;
239  if (collfreq != 0) {
240  collfreq += (termfreq - 1);
241  }
242 
243  return true;
244 }
245 
246 inline void
248  Xapian::docid chunk_last,
249  Xapian::termcount chunk_first_wdf,
250  std::string& out)
251 {
252  Assert(chunk_first_wdf != 0);
253  pack_uint(out, chunk_last - chunk_first);
254  pack_uint(out, chunk_first_wdf - 1);
255 }
256 
257 inline bool
258 decode_delta_chunk_header(const char** p, const char* end,
259  Xapian::docid chunk_last,
260  Xapian::docid& chunk_first,
261  Xapian::termcount& chunk_first_wdf)
262 {
263  if (!unpack_uint(p, end, &chunk_first) ||
264  !unpack_uint(p, end, &chunk_first_wdf)) {
265  return false;
266  }
267  chunk_first = chunk_last - chunk_first;
268  ++chunk_first_wdf;
269  return true;
270 }
271 
272 inline void
274  Xapian::docid chunk_last,
275  std::string& out)
276 {
277  pack_uint(out, chunk_last - chunk_first);
278 }
279 
280 inline bool
281 decode_delta_chunk_header_no_wdf(const char** p, const char* end,
282  Xapian::docid chunk_last,
283  Xapian::docid& chunk_first)
284 {
285  if (!unpack_uint(p, end, &chunk_first)) {
286  return false;
287  }
288  chunk_first = chunk_last - chunk_first;
289  return true;
290 }
291 
292 #endif // XAPIAN_INCLUDED_HONEY_POSTLIST_ENCODINGS_H
PositionList * p
bool decode_delta_chunk_header(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first, Xapian::termcount &chunk_first_wdf)
void encode_delta_chunk_header(Xapian::docid chunk_first, Xapian::docid chunk_last, Xapian::termcount chunk_first_wdf, std::string &out)
void encode_delta_chunk_header_no_wdf(Xapian::docid chunk_first, Xapian::docid chunk_last, std::string &out)
bool decode_initial_chunk_header(const char **p, const char *end, Xapian::doccount &termfreq, Xapian::termcount &collfreq, Xapian::docid &first, Xapian::docid &last, Xapian::docid &chunk_last, Xapian::termcount &first_wdf, Xapian::termcount &wdf_max)
bool decode_delta_chunk_header_no_wdf(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first)
bool decode_initial_chunk_header_freqs(const char **p, const char *end, Xapian::doccount &termfreq, Xapian::termcount &collfreq)
void encode_initial_chunk_header(Xapian::doccount termfreq, Xapian::termcount collfreq, Xapian::docid first, Xapian::docid last, Xapian::docid chunk_last, Xapian::termcount first_wdf, Xapian::termcount wdf_max, std::string &out)
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315