xapian-core  1.4.19
bitstream.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include "bitstream.h"
25 
26 #include <xapian/types.h>
27 
28 #include "omassert.h"
29 #include "pack.h"
30 
31 #include <cmath>
32 #include <vector>
33 
34 using namespace std;
35 
36 // Highly optimised fls() implementation.
37 template<typename T>
38 static inline int
40 {
41 #ifdef HAVE_DO_CLZ
42  return mask ? sizeof(T) * 8 - do_clz(mask) : 0;
43 #else
44  static const unsigned char flstab[256] = {
45  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
46  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
47  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
48  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
49  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
50  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
51  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
52  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
53  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
54  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
55  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
56  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
57  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
58  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
59  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
60  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
61  };
62 
63  int result = 0;
64  if (sizeof(T) > 4) {
65  if (mask >= 0x100000000ul) {
66  mask >>= 32;
67  result += 32;
68  }
69  }
70  if (mask >= 0x10000u) {
71  mask >>= 16;
72  result += 16;
73  }
74  if (mask >= 0x100u) {
75  mask >>= 8;
76  result += 8;
77  }
78  return result + flstab[mask];
79 #endif
80 }
81 
82 namespace Xapian {
83 
85 template<typename T, typename U>
86 static constexpr inline
87 T safe_shl(T x, U shift)
88 {
89  return (shift >= sizeof(T) * 8 ? 0 : x << shift);
90 }
91 
92 void
94 {
95  Assert(value < outof);
96  unsigned bits = highest_order_bit(outof - Xapian::termpos(1));
97  const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
98  if (spare) {
99  /* If we have spare values, we can use one fewer bit to encode some
100  * values. We shorten the values in the middle of the range, as
101  * testing (on positional data) shows this works best. "Managing
102  * Gigabytes" suggests reversing this for the lowest level and encoding
103  * the end values of the range shorter, which is contrary to our
104  * testing (MG is talking about posting lists, which probably have
105  * different characteristics).
106  *
107  * For example, if outof is 11, the codes emitted are:
108  *
109  * value output
110  * 0 0000
111  * 1 0001
112  * 2 0010
113  * 3 011
114  * 4 100
115  * 5 101
116  * 6 110
117  * 7 111
118  * 8 1000
119  * 9 1001
120  * 10 1010
121  *
122  * Note the LSB comes first in the bitstream, so these codes need to be
123  * suffix-free to be decoded.
124  */
125  const Xapian::termpos mid_start = (outof - spare) / 2;
126  if (value >= mid_start + spare) {
127  value = (value - (mid_start + spare)) |
128  (Xapian::termpos(1) << (bits - 1));
129  } else if (value >= mid_start) {
130  --bits;
131  }
132  }
133 
134  if (bits + n_bits > sizeof(acc) * 8) {
135  // We need to write more bits than there's empty room for in
136  // the accumulator. So we arrange to shift out 8 bits, then
137  // adjust things so we're adding 8 fewer bits.
138  Assert(bits <= sizeof(acc) * 8);
139  acc |= (value << n_bits);
140  buf += char(acc);
141  acc >>= 8;
142  value >>= 8;
143  bits -= 8;
144  }
145  acc |= (value << n_bits);
146  n_bits += bits;
147  while (n_bits >= 8) {
148  buf += char(acc);
149  acc >>= 8;
150  n_bits -= 8;
151  }
152 }
153 
154 void
155 BitWriter::encode_interpolative(const vector<Xapian::termpos>& pos, int j, int k)
156 {
157  // "Interpolative code" - for an algorithm description, see "Managing
158  // Gigabytes" - pages 126-127 in the second edition. You can probably
159  // view those pages in google books.
160  while (j + 1 < k) {
161  const Xapian::termpos mid = j + (k - j) / 2;
162  // Encode one out of (pos[k] - pos[j] + 1) values
163  // (less some at either end because we must be able to fit
164  // all the intervening pos in)
165  const Xapian::termpos outof = pos[k] - pos[j] + j - k + 1;
166  const Xapian::termpos lowest = pos[j] + mid - j;
167  encode(pos[mid] - lowest, outof);
168  encode_interpolative(pos, j, mid);
169  j = mid;
170  }
171 }
172 
175 {
176  (void)force;
177  Assert(force == di_current.is_initialized());
179  const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
180  const Xapian::termpos mid_start = (outof - spare) / 2;
181  Xapian::termpos p;
182  if (spare) {
183  p = read_bits(bits - 1);
184  if (p < mid_start) {
185  if (read_bits(1)) p += mid_start + spare;
186  }
187  } else {
188  p = read_bits(bits);
189  }
190  Assert(p < outof);
191  return p;
192 }
193 
195 BitReader::read_bits(int count)
196 {
197  Xapian::termpos result;
198  if (count > int(sizeof(acc) * 8 - 7)) {
199  // If we need more than 7 bits less than fit in acc do the read in two
200  // goes to ensure that we don't overflow acc. This is a little more
201  // conservative than it needs to be, but such large values will
202  // inevitably be rare (because you can't fit very many of them into
203  // the full Xapian::termpos range).
204  Assert(count <= int(sizeof(acc) * 8));
205  const size_t half_the_bits = sizeof(acc) * 4;
206  result = read_bits(half_the_bits);
207  return result | (read_bits(count - half_the_bits) << half_the_bits);
208  }
209  while (n_bits < count) {
210  Assert(idx < buf.size());
211  unsigned char byte = buf[idx++];
212  acc |= Xapian::termpos(byte) << n_bits;
213  n_bits += 8;
214  }
215  result = acc & ((Xapian::termpos(1) << count) - Xapian::termpos(1));
216  acc >>= count;
217  n_bits -= count;
218  return result;
219 }
220 
221 void
222 BitReader::decode_interpolative(int j, int k,
223  Xapian::termpos pos_j, Xapian::termpos pos_k)
224 {
225  Assert(!di_current.is_initialized());
226  di_stack.reserve(highest_order_bit(pos_k - pos_j));
227  di_current.set_j(j, pos_j);
228  di_current.set_k(k, pos_k);
229 }
230 
232 BitReader::decode_interpolative_next()
233 {
234  Assert(di_current.is_initialized());
235  while (!di_stack.empty() || di_current.is_next()) {
236  if (!di_current.is_next()) {
237  Xapian::termpos pos_ret = di_current.pos_k;
238  di_current = di_stack.back();
239  di_stack.pop_back();
240  int mid = (di_current.j + di_current.k) / 2;
241  di_current.set_j(mid, pos_ret);
242  return pos_ret;
243  }
244  di_stack.push_back(di_current);
245  int mid = (di_current.j + di_current.k) / 2;
246  Xapian::termpos pos_mid = decode(di_current.outof(), true) +
247  (di_current.pos_j + mid - di_current.j);
248  di_current.set_k(mid, pos_mid);
249  }
250 #ifdef XAPIAN_ASSERTIONS
251  di_current.uninit();
252 #endif
253  return di_current.pos_k;
254 }
255 
256 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define Assert(COND)
Definition: omassert.h:122
typedefs for Xapian
static constexpr T safe_shl(T x, U shift)
Shift left that&#39;s safe for shifts wider than the type.
Definition: bitstream.cc:87
STL namespace.
void decode(const char *value, size_t len, double &lat_ref, double &lon_ref)
Decode a coordinate from a buffer.
Definition: geoencode.cc:135
unsigned char byte
Definition: header.h:5
static int highest_order_bit(T mask)
Definition: bitstream.cc:39
bool encode(double lat, double lon, std::string &result)
Encode a coordinate and append it to a string.
Definition: geoencode.cc:73
Classes to encode/decode a bitstream.
Pack types into strings and unpack them again.
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Various assertion macros.