xapian-core  2.0.0
pack.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2015,2016,2017,2018,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
23 
24 #ifndef PACKAGE
25 # error config.h must be included first in each C++ source file
26 #endif
27 
28 #include <string>
29 #include <string_view>
30 #include <type_traits>
31 
32 #include "omassert.h"
33 
34 #include "xapian/types.h"
35 
42 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES = 2;
43 
46 
48 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK =
50 
55 [[noreturn]]
56 void unpack_throw_serialisation_error(const char* p);
57 
63 inline void
64 pack_bool(std::string& s, bool value)
65 {
66  s += char('0' | static_cast<char>(value));
67 }
68 
75 inline bool
76 unpack_bool(const char** p, const char* end, bool* result)
77 {
78  Assert(result);
79  const char*& ptr = *p;
80  Assert(ptr);
81  char ch;
82  if (rare(ptr == end || ((ch = *ptr++ - '0') &~ 1))) {
83  ptr = NULL;
84  return false;
85  }
86  *result = static_cast<bool>(ch);
87  return true;
88 }
89 
98 template<class U>
99 inline void
100 pack_uint_last(std::string& s, U value)
101 {
102  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
103 
104  while (value) {
105  s += char(value & 0xff);
106  value >>= 8;
107  }
108 }
109 
116 template<class U>
117 inline bool
118 unpack_uint_last(const char** p, const char* end, U* result)
119 {
120  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
121  Assert(result);
122 
123  const char* ptr = *p;
124  Assert(ptr);
125  *p = end;
126 
127  // Check for overflow.
128  if (rare(end - ptr > int(sizeof(U)))) {
129  return false;
130  }
131 
132  *result = 0;
133  while (end != ptr) {
134  *result = (*result << 8) | U(static_cast<unsigned char>(*--end));
135  }
136 
137  return true;
138 }
139 
140 #if HAVE_DECL___BUILTIN_CLZ && \
141  HAVE_DECL___BUILTIN_CLZL && \
142  HAVE_DECL___BUILTIN_CLZLL
143 template<typename T>
144 inline int
145 do_clz(T value) {
146  extern int no_clz_builtin_for_this_type(T);
147  return no_clz_builtin_for_this_type(value);
148 }
149 
150 template<>
151 inline int
152 do_clz(unsigned value) {
153  return __builtin_clz(value);
154 }
155 
156 template<>
157 inline int
158 do_clz(unsigned long value) {
159  return __builtin_clzl(value);
160 }
161 
162 template<>
163 inline int
164 do_clz(unsigned long long value) {
165  return __builtin_clzll(value);
166 }
167 
168 # define HAVE_DO_CLZ
169 #endif
170 
202 template<class U>
203 inline void
204 pack_uint_preserving_sort(std::string& s, U value)
205 {
206  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
207  static_assert(sizeof(U) <= 8,
208  "Template type U too wide for database format");
209  // The clz() functions are undefined for 0, so handle the smallest band
210  // as a special case.
211  if (value < 0x8000) {
212  s.resize(s.size() + 2);
213  s[s.size() - 2] = static_cast<unsigned char>(value >> 8);
214  Assert(s[s.size() - 2] != '\xff');
215  s[s.size() - 1] = static_cast<unsigned char>(value);
216  return;
217  }
218 
219 #ifdef HAVE_DO_CLZ
220  size_t len = ((sizeof(U) * 8 + 5) - do_clz(value)) / 7;
221 #else
222  size_t len = 3;
223  for (auto x = value >> 22; x; x >>= 7) ++len;
224 #endif
225  unsigned mask = 0xff << (10 - len);
226 
227  s.resize(s.size() + len);
228  for (size_t i = 1; i != len; ++i) {
229  s[s.size() - i] = static_cast<unsigned char>(value);
230  value >>= 8;
231  }
232 
233  s[s.size() - len] = static_cast<unsigned char>(value | mask);
234  Assert(s[s.size() - len] != '\xff');
235 
236  AssertRel(len, >, 2);
237  AssertRel(len, <=, 9);
238 }
239 
249 template<class U>
250 inline bool
251 unpack_uint_preserving_sort(const char** p, const char* end, U* result)
252 {
253  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
254  static_assert(sizeof(U) <= 8,
255  "Template type U too wide for database format");
256  Assert(result);
257 
258  const char* ptr = *p;
259  Assert(ptr);
260 
261  if (rare(ptr == end)) {
262  return false;
263  }
264 
265  unsigned char len_byte = static_cast<unsigned char>(*ptr++);
266  if (len_byte < 0x80) {
267  *result = (U(len_byte) << 8) | static_cast<unsigned char>(*ptr++);
268  *p = ptr;
269  return true;
270  }
271 
272  if (len_byte == 0xff) {
273  return false;
274  }
275 
276  // len is how many bytes there are after the length byte.
277 #ifdef HAVE_DO_CLZ
278  size_t len = do_clz(len_byte ^ 0xffu) + 9 - sizeof(unsigned) * 8;
279 #else
280  size_t len = 2;
281  for (unsigned char m = 0x40; len_byte & m; m >>= 1) ++len;
282 #endif
283  if (rare(size_t(end - ptr) < len)) {
284  return false;
285  }
286  unsigned mask = 0xff << (9 - len);
287  len_byte &= ~mask;
288 
289  // Check for overflow.
290  if (rare(len > int(sizeof(U)))) return false;
291  if constexpr(sizeof(U) != 8) {
292  // Need to check the top byte too.
293  if (rare(len == int(sizeof(U)) && len_byte != 0)) return false;
294  }
295 
296  end = ptr + len;
297  *p = end;
298 
299  U r = len_byte;
300  while (ptr != end) {
301  r = (r << 8) | U(static_cast<unsigned char>(*ptr++));
302  }
303  *result = r;
304 
305  return true;
306 }
307 
313 template<class U>
314 inline void
315 pack_uint(std::string& s, U value)
316 {
317  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
318 
319  while (value >= 128) {
320  s += static_cast<char>(static_cast<unsigned char>(value) | 0x80);
321  value >>= 7;
322  }
323  s += static_cast<char>(value);
324 }
325 
331 template<>
332 inline void
333 pack_uint(std::string& s, bool value)
334 {
335  s += static_cast<char>(value);
336 }
337 
344 template<class U>
345 inline bool
346 unpack_uint(const char** p, const char* end, U* result)
347 {
348  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
349 
350  const char* ptr = *p;
351  Assert(ptr);
352  const char* start = ptr;
353 
354  // Check the length of the encoded integer first.
355  do {
356  if (rare(ptr == end)) {
357  // Out of data.
358  *p = NULL;
359  return false;
360  }
361  } while (static_cast<unsigned char>(*ptr++) >= 128);
362 
363  *p = ptr;
364 
365  if (!result) return true;
366 
367  *result = U(*--ptr);
368  if (ptr == start) {
369  // Special case for small values.
370  return true;
371  }
372 
373  size_t maxbits = size_t(ptr - start) * 7;
374  if (maxbits <= sizeof(U) * 8) {
375  // No possibility of overflow.
376  do {
377  unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
378  *result = (*result << 7) | U(chunk);
379  } while (ptr != start);
380  return true;
381  }
382 
383  size_t minbits = maxbits - 6;
384  if (rare(minbits > sizeof(U) * 8)) {
385  // Overflow.
386  return false;
387  }
388 
389  while (--ptr != start) {
390  unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
391  *result = (*result << 7) | U(chunk);
392  }
393 
394  U tmp = *result;
395  *result <<= 7;
396  if (rare(*result < tmp)) {
397  // Overflow.
398  return false;
399  }
400  *result |= U(static_cast<unsigned char>(*ptr) & 0x7f);
401  return true;
402 }
403 
410 template<class U>
411 inline bool
412 unpack_uint_backwards(const char** p, const char* start, U* result)
413 {
414  static_assert(std::is_unsigned_v<U>, "Unsigned type required");
415 
416  const char* ptr = *p;
417  Assert(ptr);
418 
419  // Check it's not empty and that the final byte is valid.
420  if (rare(ptr == start || static_cast<unsigned char>(ptr[-1]) >= 128)) {
421  // Out of data.
422  *p = NULL;
423  return false;
424  }
425 
426  do {
427  if (rare(--ptr == start))
428  break;
429  } while (static_cast<unsigned char>(ptr[-1]) >= 128);
430 
431  const char* end = *p;
432  *p = ptr;
433  return unpack_uint(&ptr, end, result);
434 }
435 
441 inline void
442 pack_string(std::string& s, std::string_view value)
443 {
444  pack_uint(s, value.size());
445  s += value;
446 }
447 
455 inline void
456 pack_string_empty(std::string& s)
457 {
458  s += '\0';
459 }
460 
467 inline bool
468 unpack_string(const char** p, const char* end, std::string& result)
469 {
470  size_t len;
471  if (rare(!unpack_uint(p, end, &len))) {
472  return false;
473  }
474 
475  const char*& ptr = *p;
476  if (rare(len > size_t(end - ptr))) {
477  ptr = NULL;
478  return false;
479  }
480 
481  result.assign(ptr, len);
482  ptr += len;
483  return true;
484 }
485 
492 inline bool
493 unpack_string_append(const char** p, const char* end, std::string& result)
494 {
495  size_t len;
496  if (rare(!unpack_uint(p, end, &len))) {
497  return false;
498  }
499 
500  const char*& ptr = *p;
501  if (rare(len > size_t(end - ptr))) {
502  ptr = NULL;
503  return false;
504  }
505 
506  result.append(ptr, len);
507  ptr += len;
508  return true;
509 }
510 
527 inline void
528 pack_string_preserving_sort(std::string& s, std::string_view value,
529  bool last = false)
530 {
531  std::string::size_type b = 0, e;
532  while ((e = value.find('\0', b)) != std::string::npos) {
533  ++e;
534  s.append(value, b, e - b);
535  s += '\xff';
536  b = e;
537  }
538  s.append(value, b, std::string::npos);
539  if (!last) s += '\0';
540 }
541 
550 inline bool
551 unpack_string_preserving_sort(const char** p, const char* end,
552  std::string& result)
553 {
554  result.resize(0);
555 
556  const char* ptr = *p;
557  Assert(ptr);
558 
559  while (ptr != end) {
560  char ch = *ptr++;
561  if (rare(ch == '\0')) {
562  if (usual(ptr == end || *ptr != '\xff')) {
563  break;
564  }
565  ++ptr;
566  }
567  result += ch;
568  }
569  *p = ptr;
570  return true;
571 }
572 
573 inline std::string
574 pack_glass_postlist_key(std::string_view term)
575 {
576  // Special case for doclen lists.
577  if (term.empty())
578  return std::string("\x00\xe0", 2);
579 
580  std::string key;
581  pack_string_preserving_sort(key, term, true);
582  return key;
583 }
584 
585 inline std::string
587 {
588  // Special case for doclen lists.
589  if (term.empty()) {
590  std::string key("\x00\xe0", 2);
591  pack_uint_preserving_sort(key, did);
592  return key;
593  }
594 
595  std::string key;
597  pack_uint_preserving_sort(key, did);
598  return key;
599 }
600 
601 inline std::string
602 pack_honey_postlist_key(std::string_view term)
603 {
604  Assert(!term.empty());
605  std::string key;
606  pack_string_preserving_sort(key, term, true);
607  return key;
608 }
609 
610 inline std::string
612 {
613  Assert(!term.empty());
614  std::string key;
616  pack_uint_preserving_sort(key, did);
617  return key;
618 }
619 
620 #endif // XAPIAN_INCLUDED_PACK_H
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
void unpack_throw_serialisation_error(const char *p)
Throw appropriate SerialisationError.
Definition: pack.cc:29
bool unpack_uint_backwards(const char **p, const char *start, U *result)
Decode an unsigned integer from a string, going backwards.
Definition: pack.h:412
const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES
How many bits to store the length of a sortable uint in.
Definition: pack.h:42
std::string pack_honey_postlist_key(std::string_view term)
Definition: pack.h:602
const unsigned int SORTABLE_UINT_1ST_BYTE_MASK
Calculated value used below.
Definition: pack.h:48
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:551
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
const unsigned int SORTABLE_UINT_MAX_BYTES
Calculated value used below.
Definition: pack.h:45
bool unpack_string_append(const char **p, const char *end, std::string &result)
Decode a std::string from a string and append.
Definition: pack.h:493
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:100
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:76
void pack_bool(std::string &s, bool value)
Append an encoded bool to a string.
Definition: pack.h:64
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_string_preserving_sort(std::string &s, std::string_view value, bool last=false)
Append an encoded std::string to a string, preserving the sort order.
Definition: pack.h:528
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
void pack_string(std::string &s, std::string_view value)
Append an encoded std::string to a string.
Definition: pack.h:442
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:251
void pack_string_empty(std::string &s)
Append an empty encoded std::string to a string.
Definition: pack.h:456
std::string pack_glass_postlist_key(std::string_view term)
Definition: pack.h:574
void pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
Definition: pack.h:204
typedefs for Xapian