xapian-core  1.4.25
bitstream.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include "bitstream.h"
25 
26 #include <xapian/types.h>
27 
28 #include "omassert.h"
29 #include "pack.h"
30 
31 #include <vector>
32 
33 using namespace std;
34 
35 // Find the position of the most significant set bit counting from 1 with
36 // 0 being returned if no bits are set (similar to how ffs() reports the least
37 // significant set bit).
38 template<typename T>
39 static inline int
41 {
42 #ifdef HAVE_DO_CLZ
43  return mask ? sizeof(T) * 8 - do_clz(mask) : 0;
44 #else
45  // Table of results for 8 bit inputs.
46  static const unsigned char hob_tab[256] = {
47  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
48  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
49  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
50  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
51  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
52  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
53  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
54  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
55  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
56  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
57  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
58  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
59  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
60  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
61  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
62  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
63  };
64 
65  int result = 0;
66  if (sizeof(T) > 4) {
67  if (mask >= 0x100000000ul) {
68  mask >>= 32;
69  result += 32;
70  }
71  }
72  if (mask >= 0x10000u) {
73  mask >>= 16;
74  result += 16;
75  }
76  if (mask >= 0x100u) {
77  mask >>= 8;
78  result += 8;
79  }
80  return result + hob_tab[mask];
81 #endif
82 }
83 
84 namespace Xapian {
85 
87 template<typename T, typename U>
88 static constexpr inline
89 T safe_shl(T x, U shift)
90 {
91  return (shift >= sizeof(T) * 8 ? 0 : x << shift);
92 }
93 
94 void
96 {
97  Assert(value < outof);
98  unsigned bits = highest_order_bit(outof - Xapian::termpos(1));
99  const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
100  if (spare) {
101  /* If we have spare values, we can use one fewer bit to encode some
102  * values. We shorten the values in the middle of the range, as
103  * testing (on positional data) shows this works best. "Managing
104  * Gigabytes" suggests reversing this for the lowest level and encoding
105  * the end values of the range shorter, which is contrary to our
106  * testing (MG is talking about posting lists, which probably have
107  * different characteristics).
108  *
109  * For example, if outof is 11, the codes emitted are:
110  *
111  * value output
112  * 0 0000
113  * 1 0001
114  * 2 0010
115  * 3 011
116  * 4 100
117  * 5 101
118  * 6 110
119  * 7 111
120  * 8 1000
121  * 9 1001
122  * 10 1010
123  *
124  * Note the LSB comes first in the bitstream, so these codes need to be
125  * suffix-free to be decoded.
126  */
127  const Xapian::termpos mid_start = (outof - spare) / 2;
128  if (value >= mid_start + spare) {
129  value = (value - (mid_start + spare)) |
130  (Xapian::termpos(1) << (bits - 1));
131  } else if (value >= mid_start) {
132  --bits;
133  }
134  }
135 
136  if (bits + n_bits > sizeof(acc) * 8) {
137  // We need to write more bits than there's empty room for in
138  // the accumulator. So we arrange to shift out 8 bits, then
139  // adjust things so we're adding 8 fewer bits.
140  Assert(bits <= sizeof(acc) * 8);
141  acc |= (value << n_bits);
142  buf += char(acc);
143  acc >>= 8;
144  value >>= 8;
145  bits -= 8;
146  }
147  acc |= (value << n_bits);
148  n_bits += bits;
149  while (n_bits >= 8) {
150  buf += char(acc);
151  acc >>= 8;
152  n_bits -= 8;
153  }
154 }
155 
156 void
157 BitWriter::encode_interpolative(const vector<Xapian::termpos>& pos, int j, int k)
158 {
159  // "Interpolative code" - for an algorithm description, see "Managing
160  // Gigabytes" - pages 126-127 in the second edition. You can probably
161  // view those pages in google books.
162  while (j + 1 < k) {
163  const Xapian::termpos mid = j + (k - j) / 2;
164  // Encode one out of (pos[k] - pos[j] + 1) values
165  // (less some at either end because we must be able to fit
166  // all the intervening pos in)
167  const Xapian::termpos outof = pos[k] - pos[j] + j - k + 1;
168  const Xapian::termpos lowest = pos[j] + mid - j;
169  encode(pos[mid] - lowest, outof);
170  encode_interpolative(pos, j, mid);
171  j = mid;
172  }
173 }
174 
177 {
178  (void)force;
179  Assert(force == di_current.is_initialized());
181  const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
182  const Xapian::termpos mid_start = (outof - spare) / 2;
183  Xapian::termpos p;
184  if (spare) {
185  p = read_bits(bits - 1);
186  if (p < mid_start) {
187  if (read_bits(1)) p += mid_start + spare;
188  }
189  } else {
190  p = read_bits(bits);
191  }
192  Assert(p < outof);
193  return p;
194 }
195 
197 BitReader::read_bits(int count)
198 {
199  Xapian::termpos result;
200  if (count > int(sizeof(acc) * 8 - 7)) {
201  // If we need more than 7 bits less than fit in acc do the read in two
202  // goes to ensure that we don't overflow acc. This is a little more
203  // conservative than it needs to be, but such large values will
204  // inevitably be rare (because you can't fit very many of them into
205  // the full Xapian::termpos range).
206  Assert(count <= int(sizeof(acc) * 8));
207  const size_t half_the_bits = sizeof(acc) * 4;
208  result = read_bits(half_the_bits);
209  return result | (read_bits(count - half_the_bits) << half_the_bits);
210  }
211  while (n_bits < count) {
212  Assert(idx < buf.size());
213  unsigned char byte = buf[idx++];
214  acc |= Xapian::termpos(byte) << n_bits;
215  n_bits += 8;
216  }
217  result = acc & ((Xapian::termpos(1) << count) - Xapian::termpos(1));
218  acc >>= count;
219  n_bits -= count;
220  return result;
221 }
222 
223 void
224 BitReader::decode_interpolative(int j, int k,
225  Xapian::termpos pos_j, Xapian::termpos pos_k)
226 {
227  Assert(!di_current.is_initialized());
228  di_stack.reserve(highest_order_bit(pos_k - pos_j));
229  di_current.set_j(j, pos_j);
230  di_current.set_k(k, pos_k);
231 }
232 
234 BitReader::decode_interpolative_next()
235 {
236  Assert(di_current.is_initialized());
237  while (!di_stack.empty() || di_current.is_next()) {
238  if (!di_current.is_next()) {
239  Xapian::termpos pos_ret = di_current.pos_k;
240  di_current = di_stack.back();
241  di_stack.pop_back();
242  int mid = (di_current.j + di_current.k) / 2;
243  di_current.set_j(mid, pos_ret);
244  return pos_ret;
245  }
246  di_stack.push_back(di_current);
247  int mid = (di_current.j + di_current.k) / 2;
248  Xapian::termpos pos_mid = decode(di_current.outof(), true) +
249  (di_current.pos_j + mid - di_current.j);
250  di_current.set_k(mid, pos_mid);
251  }
252 #ifdef XAPIAN_ASSERTIONS
253  di_current.uninit();
254 #endif
255  return di_current.pos_k;
256 }
257 
258 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define Assert(COND)
Definition: omassert.h:122
typedefs for Xapian
static constexpr T safe_shl(T x, U shift)
Shift left that&#39;s safe for shifts wider than the type.
Definition: bitstream.cc:89
static int decode(const T(&table)[N], const char *s)
Decode a string to an integer.
Definition: quest.cc:70
STL namespace.
unsigned char byte
Definition: header.h:5
static int highest_order_bit(T mask)
Definition: bitstream.cc:40
bool encode(double lat, double lon, std::string &result)
Encode a coordinate and append it to a string.
Definition: geoencode.cc:73
Classes to encode/decode a bitstream.
Pack types into strings and unpack them again.
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Various assertion macros.