xapian-core  2.0.0
bitstream.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2017,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "bitstream.h"
24 
25 #include <xapian/types.h>
26 
27 #include "omassert.h"
28 #include "pack.h"
29 
30 using namespace std;
31 
32 // Find the position of the most significant set bit counting from 1 with
33 // 0 being returned if no bits are set (similar to how ffs() reports the least
34 // significant set bit).
35 template<typename T>
36 static inline int
38 {
39 #ifdef HAVE_DO_CLZ
40  return mask ? sizeof(T) * 8 - do_clz(mask) : 0;
41 #else
42  // Table of results for 8 bit inputs.
43  static const unsigned char hob_tab[256] = {
44  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
45  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
46  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
47  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
48  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
49  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
50  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
51  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
52  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
53  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
54  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
55  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
56  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
57  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
58  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
59  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
60  };
61 
62  int result = 0;
63  if constexpr(sizeof(T) > 4) {
64  if (mask >= 0x100000000ul) {
65  mask >>= 32;
66  result += 32;
67  }
68  }
69  if (mask >= 0x10000u) {
70  mask >>= 16;
71  result += 16;
72  }
73  if (mask >= 0x100u) {
74  mask >>= 8;
75  result += 8;
76  }
77  return result + hob_tab[mask];
78 #endif
79 }
80 
81 namespace Xapian {
82 
84 template<typename T, typename U>
85 static constexpr inline
86 T safe_shl(T x, U shift)
87 {
88  return (shift >= sizeof(T) * 8 ? 0 : x << shift);
89 }
90 
91 void
93 {
94  Assert(value < outof);
95  unsigned bits = highest_order_bit(outof - Xapian::termpos(1));
96  // If the top bit of (outof - Xapian::termpos(1)) is set then
97  // the shift will shift the bit out and give zero and the
98  // subtraction will result in an unsigned overflow.
99  const Xapian::termpos spare =
100  UNSIGNED_OVERFLOW_OK(safe_shl(Xapian::termpos(1), bits) - outof);
101  if (spare) {
102  /* If we have spare values, we can use one fewer bit to encode some
103  * values. We shorten the values in the middle of the range, as
104  * testing (on positional data) shows this works best. "Managing
105  * Gigabytes" suggests reversing this for the lowest level and encoding
106  * the end values of the range shorter, which is contrary to our
107  * testing (MG is talking about posting lists, which probably have
108  * different characteristics).
109  *
110  * For example, if outof is 11, the codes emitted are:
111  *
112  * value output
113  * 0 0000
114  * 1 0001
115  * 2 0010
116  * 3 011
117  * 4 100
118  * 5 101
119  * 6 110
120  * 7 111
121  * 8 1000
122  * 9 1001
123  * 10 1010
124  *
125  * Note the LSB comes first in the bitstream, so these codes need to be
126  * suffix-free to be decoded.
127  */
128  const Xapian::termpos mid_start = (outof - spare) / 2;
129  if (value >= mid_start + spare) {
130  value = (value - (mid_start + spare)) |
131  (Xapian::termpos(1) << (bits - 1));
132  } else if (value >= mid_start) {
133  --bits;
134  }
135  }
136 
137  if (bits + n_bits > sizeof(acc) * 8) {
138  // We need to write more bits than there's empty room for in
139  // the accumulator. So we arrange to shift out 8 bits, then
140  // adjust things so we're adding 8 fewer bits.
141  Assert(bits <= sizeof(acc) * 8);
142  acc |= (value << n_bits);
143  buf += char(acc);
144  acc >>= 8;
145  value >>= 8;
146  bits -= 8;
147  }
148  acc |= (value << n_bits);
149  n_bits += bits;
150  while (n_bits >= 8) {
151  buf += char(acc);
152  acc >>= 8;
153  n_bits -= 8;
154  }
155 }
156 
157 void
158 BitWriter::encode_interpolative(const Xapian::VecCOW<Xapian::termpos>& pos,
159  int j, int k)
160 {
161  // "Interpolative code" - for an algorithm description, see "Managing
162  // Gigabytes" - pages 126-127 in the second edition. You can probably
163  // view those pages in google books.
164  while (j + 1 < k) {
165  const Xapian::termpos mid = j + (k - j) / 2;
166  // Encode one out of (pos[k] - pos[j] + 1) values
167  // (less some at either end because we must be able to fit
168  // all the intervening pos in)
169  const Xapian::termpos outof = pos[k] - pos[j] + j - k + 1;
170  const Xapian::termpos lowest = pos[j] + mid - j;
171  encode(pos[mid] - lowest, outof);
172  encode_interpolative(pos, j, mid);
173  j = mid;
174  }
175 }
176 
179 {
180  (void)force;
181  Assert(force == di_current.is_initialized());
183  // If the top bit of (outof - Xapian::termpos(1)) is set then
184  // the shift will shift the bit out and give zero and the
185  // subtraction will result in an unsigned overflow.
186  const Xapian::termpos spare =
187  UNSIGNED_OVERFLOW_OK(safe_shl(Xapian::termpos(1), bits) - outof);
188  const Xapian::termpos mid_start = (outof - spare) / 2;
190  if (spare) {
191  pos = read_bits(bits - 1);
192  if (pos < mid_start) {
193  if (read_bits(1)) pos += mid_start + spare;
194  }
195  } else {
196  pos = read_bits(bits);
197  }
198  Assert(pos < outof);
199  return pos;
200 }
201 
203 BitReader::read_bits(int count)
204 {
205  Xapian::termpos result;
206  if (count > int(sizeof(acc) * 8 - 7)) {
207  // If we need more than 7 bits less than fit in acc do the read in two
208  // goes to ensure that we don't overflow acc. This is a little more
209  // conservative than it needs to be, but such large values will
210  // inevitably be rare (because you can't fit very many of them into
211  // the full Xapian::termpos range).
212  Assert(count <= int(sizeof(acc) * 8));
213  const size_t half_the_bits = sizeof(acc) * 4;
214  result = read_bits(half_the_bits);
215  return result | (read_bits(count - half_the_bits) << half_the_bits);
216  }
217  while (n_bits < count) {
218  Assert(p < end);
219  acc |= Xapian::termpos(static_cast<unsigned char>(*p++)) << n_bits;
220  n_bits += 8;
221  }
222  result = acc & ((Xapian::termpos(1) << count) - Xapian::termpos(1));
223  acc >>= count;
224  n_bits -= count;
225  return result;
226 }
227 
228 void
229 BitReader::decode_interpolative(int j, int k,
230  Xapian::termpos pos_j, Xapian::termpos pos_k)
231 {
232  Assert(!di_current.is_initialized());
233  di_stack.reserve(highest_order_bit(pos_k - pos_j));
234  di_current.set_j(j, pos_j);
235  di_current.set_k(k, pos_k);
236 }
237 
239 BitReader::decode_interpolative_next()
240 {
241  Assert(di_current.is_initialized());
242  while (!di_stack.empty() || di_current.is_next()) {
243  if (!di_current.is_next()) {
244  Xapian::termpos pos_ret = di_current.pos_k;
245  di_current = di_stack.back();
246  di_stack.pop_back();
247  int mid = (di_current.j + di_current.k) / 2;
248  di_current.set_j(mid, pos_ret);
249  return pos_ret;
250  }
251  di_stack.push_back(di_current);
252  int mid = (di_current.j + di_current.k) / 2;
253  Xapian::termpos pos_mid = decode(di_current.outof(), true) +
254  (di_current.pos_j + mid - di_current.j);
255  di_current.set_k(mid, pos_mid);
256  }
257 #ifdef XAPIAN_ASSERTIONS
258  di_current.uninit();
259 #endif
260  return di_current.pos_k;
261 }
262 
263 }
static int highest_order_bit(T mask)
Definition: bitstream.cc:37
Classes to encode/decode a bitstream.
Suitable for "simple" type T.
Definition: smallvector.h:62
#define UNSIGNED_OVERFLOW_OK(X)
Definition: config.h:626
PositionList * p
Xapian::termpos pos
bool encode(double lat, double lon, std::string &result)
Encode a coordinate and append it to a string.
Definition: geoencode.cc:73
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
static constexpr T safe_shl(T x, U shift)
Shift left that's safe for shifts wider than the type.
Definition: bitstream.cc:86
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
typedefs for Xapian
static int decode(const T(&table)[N], const char *s)
Decode a string to an integer.
Definition: xapian-quest.cc:70