xapian-core  1.4.27
pack.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2015,2016,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
23 
24 #ifndef PACKAGE
25 # error config.h must be included first in each C++ source file
26 #endif
27 
28 #include <cstring>
29 #include <string>
30 #include <type_traits>
31 
32 #include "omassert.h"
33 
34 #include "xapian/types.h"
35 
42 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES = 2;
43 
46 
48 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK =
50 
56 inline void
57 pack_bool(std::string& s, bool value)
58 {
59  s += char('0' | static_cast<char>(value));
60 }
61 
68 inline bool
69 unpack_bool(const char** p, const char* end, bool* result)
70 {
71  Assert(result);
72  const char*& ptr = *p;
73  Assert(ptr);
74  char ch;
75  if (rare(ptr == end || ((ch = *ptr++ - '0') &~ 1))) {
76  ptr = NULL;
77  return false;
78  }
79  *result = static_cast<bool>(ch);
80  return true;
81 }
82 
91 template<class U>
92 inline void
93 pack_uint_last(std::string& s, U value)
94 {
95  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
96 
97  while (value) {
98  s += char(value & 0xff);
99  value >>= 8;
100  }
101 }
102 
109 template<class U>
110 inline bool
111 unpack_uint_last(const char** p, const char* end, U* result)
112 {
113  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
114  Assert(result);
115 
116  const char* ptr = *p;
117  Assert(ptr);
118  *p = end;
119 
120  // Check for overflow.
121  if (rare(end - ptr > int(sizeof(U)))) {
122  return false;
123  }
124 
125  *result = 0;
126  while (end != ptr) {
127  *result = (*result << 8) | U(static_cast<unsigned char>(*--end));
128  }
129 
130  return true;
131 }
132 
147 template<class U>
148 inline void
149 C_pack_uint_preserving_sort(std::string & s, U value)
150 {
151  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
152 #if 0
153  // FIXME: Doesn't work with 64-bit Xapian::docid, etc.
154  static_assert(sizeof(U) <= SORTABLE_UINT_MAX_BYTES,
155  "Template type U too wide for database format");
156 #endif
157 
158  char tmp[sizeof(U) + 1];
159  char * p = tmp + sizeof(tmp);
160 
161  do {
162  *--p = char(value & 0xff);
163  value >>= 8;
164  } while (value &~ SORTABLE_UINT_1ST_BYTE_MASK);
165 
166  unsigned char len = static_cast<unsigned char>(tmp + sizeof(tmp) - p);
167  *--p = char((len - 1) << (8 - SORTABLE_UINT_LOG2_MAX_BYTES) | value);
168  s.append(p, len + 1);
169  Assert(s[0] != '\xff');
170 }
171 
183 template<class U>
184 inline bool
185 C_unpack_uint_preserving_sort(const char ** p, const char * end, U * result)
186 {
187  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
188  static_assert(sizeof(U) < 256,
189  "Template type U too wide for database format");
190  Assert(result);
191 
192  const char * ptr = *p;
193  Assert(ptr);
194 
195  if (rare(ptr == end)) {
196  return false;
197  }
198 
199  unsigned char len_byte = static_cast<unsigned char>(*ptr++);
200  *result = len_byte & SORTABLE_UINT_1ST_BYTE_MASK;
201  size_t len = (len_byte >> (8 - SORTABLE_UINT_LOG2_MAX_BYTES)) + 1;
202 
203  if (rare(size_t(end - ptr) < len)) {
204  return false;
205  }
206 
207  end = ptr + len;
208  *p = end;
209 
210  // Check for overflow.
211  if (rare(len > int(sizeof(U)))) {
212  return false;
213  }
214 
215  while (ptr != end) {
216  *result = (*result << 8) | U(static_cast<unsigned char>(*ptr++));
217  }
218 
219  return true;
220 }
221 
222 #if HAVE_DECL___BUILTIN_CLZ && \
223  HAVE_DECL___BUILTIN_CLZL && \
224  HAVE_DECL___BUILTIN_CLZLL
225 template<typename T>
226 inline int
227 do_clz(T value) {
228  extern int no_clz_builtin_for_this_type(T);
229  return no_clz_builtin_for_this_type(value);
230 }
231 
232 template<>
233 inline int
234 do_clz(unsigned value) {
235  return __builtin_clz(value);
236 }
237 
238 template<>
239 inline int
240 do_clz(unsigned long value) {
241  return __builtin_clzl(value);
242 }
243 
244 template<>
245 inline int
246 do_clz(unsigned long long value) {
247  return __builtin_clzll(value);
248 }
249 
250 # define HAVE_DO_CLZ
251 #endif
252 
267 template<class U>
268 inline void
269 pack_uint_preserving_sort(std::string& s, U value)
270 {
271  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
272  static_assert(sizeof(U) <= 8,
273  "Template type U too wide for database format");
274  // The clz() functions are undefined for 0, so handle the smallest band
275  // as a special case.
276  if (value < 0x8000) {
277  s.resize(s.size() + 2);
278  s[s.size() - 2] = static_cast<unsigned char>(value >> 8);
279  Assert(s[s.size() - 2] != '\xff');
280  s[s.size() - 1] = static_cast<unsigned char>(value);
281  return;
282  }
283 
284 #ifdef HAVE_DO_CLZ
285  size_t len = ((sizeof(U) * 8 + 5) - do_clz(value)) / 7;
286 #else
287  size_t len = 3;
288  for (auto x = value >> 22; x; x >>= 7) ++len;
289 #endif
290  unsigned mask = 0xff << (10 - len);
291 
292  s.resize(s.size() + len);
293  for (size_t i = 1; i != len; ++i) {
294  s[s.size() - i] = static_cast<unsigned char>(value);
295  value >>= 8;
296  }
297 
298  s[s.size() - len] = static_cast<unsigned char>(value | mask);
299  Assert(s[s.size() - len] != '\xff');
300 
301  AssertRel(len, >, 2);
302  AssertRel(len, <=, 9);
303 }
304 
316 template<class U>
317 inline bool
318 unpack_uint_preserving_sort(const char** p, const char* end, U* result)
319 {
320  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
321  static_assert(sizeof(U) <= 8,
322  "Template type U too wide for database format");
323  Assert(result);
324 
325  const char* ptr = *p;
326  Assert(ptr);
327 
328  if (rare(ptr == end)) {
329  return false;
330  }
331 
332  unsigned char len_byte = static_cast<unsigned char>(*ptr++);
333  if (len_byte < 0x80) {
334  *result = (U(len_byte) << 8) | static_cast<unsigned char>(*ptr++);
335  *p = ptr;
336  return true;
337  }
338 
339  if (len_byte == 0xff) {
340  return false;
341  }
342 
343  // len is how many bytes there are after the length byte.
344 #ifdef HAVE_DO_CLZ
345  size_t len = do_clz(len_byte ^ 0xffu) + 9 - sizeof(unsigned) * 8;
346 #else
347  size_t len = 2;
348  for (unsigned char m = 0x40; len_byte & m; m >>= 1) ++len;
349 #endif
350  if (rare(size_t(end - ptr) < len)) {
351  return false;
352  }
353  unsigned mask = 0xff << (9 - len);
354  len_byte &= ~mask;
355 
356  // Check for overflow.
357  if (rare(len > int(sizeof(U)))) return false;
358  if (sizeof(U) != 8) {
359  // Need to check the top byte too.
360  if (rare(len == int(sizeof(U)) && len_byte != 0)) return false;
361  }
362 
363  end = ptr + len;
364  *p = end;
365 
366  U r = len_byte;
367  while (ptr != end) {
368  r = (r << 8) | U(static_cast<unsigned char>(*ptr++));
369  }
370  *result = r;
371 
372  return true;
373 }
374 
380 template<class U>
381 inline void
382 pack_uint(std::string& s, U value)
383 {
384  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
385 
386  while (value >= 128) {
387  s += static_cast<char>(static_cast<unsigned char>(value) | 0x80);
388  value >>= 7;
389  }
390  s += static_cast<char>(value);
391 }
392 
398 template<>
399 inline void
400 pack_uint(std::string& s, bool value)
401 {
402  s += static_cast<char>(value);
403 }
404 
411 template<class U>
412 inline bool
413 unpack_uint(const char** p, const char* end, U* result)
414 {
415  static_assert(std::is_unsigned<U>::value, "Unsigned type required");
416 
417  const char* ptr = *p;
418  Assert(ptr);
419  const char* start = ptr;
420 
421  // Check the length of the encoded integer first.
422  do {
423  if (rare(ptr == end)) {
424  // Out of data.
425  *p = NULL;
426  return false;
427  }
428  } while (static_cast<unsigned char>(*ptr++) >= 128);
429 
430  *p = ptr;
431 
432  if (!result) return true;
433 
434  *result = U(*--ptr);
435  if (ptr == start) {
436  // Special case for small values.
437  return true;
438  }
439 
440  size_t maxbits = size_t(ptr - start) * 7;
441  if (maxbits <= sizeof(U) * 8) {
442  // No possibility of overflow.
443  do {
444  unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
445  *result = (*result << 7) | U(chunk);
446  } while (ptr != start);
447  return true;
448  }
449 
450  size_t minbits = maxbits - 6;
451  if (rare(minbits > sizeof(U) * 8)) {
452  // Overflow.
453  return false;
454  }
455 
456  while (--ptr != start) {
457  unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
458  *result = (*result << 7) | U(chunk);
459  }
460 
461  U tmp = *result;
462  *result <<= 7;
463  if (rare(*result < tmp)) {
464  // Overflow.
465  return false;
466  }
467  *result |= U(static_cast<unsigned char>(*ptr) & 0x7f);
468  return true;
469 }
470 
476 inline void
477 pack_string(std::string& s, const std::string& value)
478 {
479  pack_uint(s, value.size());
480  s += value;
481 }
482 
488 inline void
489 pack_string(std::string& s, const char* ptr)
490 {
491  Assert(ptr);
492  size_t len = std::strlen(ptr);
493  pack_uint(s, len);
494  s.append(ptr, len);
495 }
496 
503 inline bool
504 unpack_string(const char** p, const char* end, std::string& result)
505 {
506  size_t len;
507  if (rare(!unpack_uint(p, end, &len))) {
508  return false;
509  }
510 
511  const char*& ptr = *p;
512  if (rare(len > size_t(end - ptr))) {
513  ptr = NULL;
514  return false;
515  }
516 
517  result.assign(ptr, len);
518  ptr += len;
519  return true;
520 }
521 
538 inline void
539 pack_string_preserving_sort(std::string& s, const std::string& value,
540  bool last = false)
541 {
542  std::string::size_type b = 0, e;
543  while ((e = value.find('\0', b)) != std::string::npos) {
544  ++e;
545  s.append(value, b, e - b);
546  s += '\xff';
547  b = e;
548  }
549  s.append(value, b, std::string::npos);
550  if (!last) s += '\0';
551 }
552 
561 inline bool
562 unpack_string_preserving_sort(const char** p, const char* end,
563  std::string& result)
564 {
565  result.resize(0);
566 
567  const char* ptr = *p;
568  Assert(ptr);
569 
570  while (ptr != end) {
571  char ch = *ptr++;
572  if (rare(ch == '\0')) {
573  if (usual(ptr == end || *ptr != '\xff')) {
574  break;
575  }
576  ++ptr;
577  }
578  result += ch;
579  }
580  *p = ptr;
581  return true;
582 }
583 
584 inline std::string
585 pack_chert_postlist_key(const std::string& term)
586 {
587  // Special case for doclen lists.
588  if (term.empty())
589  return std::string("\x00\xe0", 2);
590 
591  std::string key;
592  pack_string_preserving_sort(key, term, true);
593  return key;
594 }
595 
596 inline std::string
597 pack_chert_postlist_key(const std::string& term, Xapian::docid did)
598 {
599  // Special case for doclen lists.
600  if (term.empty()) {
601  std::string key("\x00\xe0", 2);
602  C_pack_uint_preserving_sort(key, did);
603  return key;
604  }
605 
606  std::string key;
607  pack_string_preserving_sort(key, term);
608  C_pack_uint_preserving_sort(key, did);
609  return key;
610 }
611 
612 inline std::string
613 pack_glass_postlist_key(const std::string& term)
614 {
615  // Special case for doclen lists.
616  if (term.empty())
617  return std::string("\x00\xe0", 2);
618 
619  std::string key;
620  pack_string_preserving_sort(key, term, true);
621  return key;
622 }
623 
624 inline std::string
625 pack_glass_postlist_key(const std::string& term, Xapian::docid did)
626 {
627  // Special case for doclen lists.
628  if (term.empty()) {
629  std::string key("\x00\xe0", 2);
630  pack_uint_preserving_sort(key, did);
631  return key;
632  }
633 
634  std::string key;
635  pack_string_preserving_sort(key, term);
636  pack_uint_preserving_sort(key, did);
637  return key;
638 }
639 
640 #endif // XAPIAN_INCLUDED_PACK_H
void pack_bool(std::string &s, bool value)
Append an encoded bool to a string.
Definition: pack.h:57
#define Assert(COND)
Definition: omassert.h:122
const unsigned int SORTABLE_UINT_1ST_BYTE_MASK
Calculated value used below.
Definition: pack.h:48
typedefs for Xapian
const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES
How many bits to store the length of a sortable uint in.
Definition: pack.h:42
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define usual(COND)
Definition: config.h:576
const unsigned int SORTABLE_UINT_MAX_BYTES
Calculated value used below.
Definition: pack.h:45
#define rare(COND)
Definition: config.h:575
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:93
void C_pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
Definition: pack.h:149
bool C_unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode an "sort preserved" unsigned integer from a string.
Definition: pack.h:185
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:562
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:318
std::string pack_chert_postlist_key(const std::string &term)
Definition: pack.h:585
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:382
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:69
void pack_string(std::string &s, const std::string &value)
Append an encoded std::string to a string.
Definition: pack.h:477
std::string pack_glass_postlist_key(const std::string &term)
Definition: pack.h:613
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:413
Various assertion macros.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:504
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
void pack_string_preserving_sort(std::string &s, const std::string &value, bool last=false)
Append an encoded std::string to a string, preserving the sort order.
Definition: pack.h:539
void pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
Definition: pack.h:269