xapian-core  2.0.0
honey_table.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2017,2018,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 //#define DEBUGGING
24 
25 #include "honey_table.h"
26 
27 #include "honey_cursor.h"
28 #include "stringutils.h"
29 
31 
32 #include <cerrno>
33 
34 #ifdef DEBUGGING
35 # include <iostream>
36 #endif
37 
38 using Honey::RootInfo;
39 
40 using namespace std;
41 
42 void
43 HoneyTable::create_and_open(int flags_, const RootInfo& root_info)
44 {
45  Assert(!single_file());
46  flags = flags_;
47  compress_min = root_info.get_compress_min();
48  if (read_only) {
49  num_entries = root_info.get_num_entries();
50  root = root_info.get_root();
51  // FIXME: levels
52  }
53  if (!store.open(path, read_only))
54  throw Xapian::DatabaseOpeningError("Failed to open HoneyTable", errno);
55 }
56 
57 void
58 HoneyTable::open(int flags_, const RootInfo& root_info, honey_revision_number_t)
59 {
60  flags = flags_;
61  compress_min = root_info.get_compress_min();
62  num_entries = root_info.get_num_entries();
63  offset = root_info.get_offset();
64  root = root_info.get_root();
65  if (!single_file() && !store.open(path, read_only)) {
66  if (!lazy)
67  throw Xapian::DatabaseOpeningError("Failed to open HoneyTable",
68  errno);
69  }
70  store.set_pos(offset);
71 }
72 
73 void
74 HoneyTable::add(std::string_view key,
75  const char* val,
76  size_t val_size,
77  bool compressed)
78 {
79  if (rare(val_size == 0))
80  throw Xapian::DatabaseError("HoneyTable::add() passed empty value");
81  if (store.was_forced_closed())
83  if (!compressed && compress_min > 0 && val_size > compress_min) {
84  size_t compressed_size = val_size;
85  CompressionStream comp_stream; // FIXME: reuse
86  const char* p = comp_stream.compress(val, &compressed_size);
87  if (p) {
88  add(key, p, compressed_size, true);
89  return;
90  }
91  }
92 
93  if (read_only)
94  throw Xapian::InvalidOperationError("add() on read-only HoneyTable");
95  if (key.size() == 0 || key.size() > HONEY_MAX_KEY_LENGTH)
96  throw Xapian::InvalidArgumentError("Invalid key size: " +
97  str(key.size()));
98  if (key <= last_key)
99  throw Xapian::InvalidOperationError("New key <= previous key");
100  size_t reuse = common_prefix_length(last_key, key);
101 
102 #ifdef SSTINDEX_ARRAY
103  if (reuse == 0) {
104  index.maybe_add_entry(key, store.get_pos());
105  }
106 #elif defined SSTINDEX_BINARY_CHOP
107  // For a binary chop index, the index point is before the key info - the
108  // index key must have the same N first bytes as the previous key, where
109  // N >= the keep length.
110  index.maybe_add_entry(key, store.get_pos());
111 #elif defined SSTINDEX_SKIPLIST
112  // Handled below.
113 #else
114 # error SSTINDEX type not specified
115 #endif
116 
117  store.write(static_cast<unsigned char>(reuse));
118  store.write(static_cast<unsigned char>(key.size() - reuse));
119  store.write(key.data() + reuse, key.size() - reuse);
120  ++num_entries;
121 
122 #ifdef SSTINDEX_SKIPLIST
123  // For a skiplist index, the index provides the full key, so the index
124  // point is after the key at the level below.
125  index.maybe_add_entry(key, store.get_pos());
126 #endif
127 
128  // Encode "compressed?" flag in bottom bit.
129  // FIXME: Don't do this if a table is uncompressed? That saves a byte
130  // for each item where the extra bit pushes the length up by a byte.
131  size_t val_size_enc = (val_size << 1) | size_t{compressed};
132  std::string val_len;
133  pack_uint(val_len, val_size_enc);
134  // FIXME: pass together so we can potentially writev() both?
135  store.write(val_len.data(), val_len.size());
136  store.write(val, val_size);
137  last_key = key;
138 }
139 
140 void
142 {
143  if (root < 0)
144  throw Xapian::InvalidOperationError("root not set");
145 
146  root_info->set_num_entries(num_entries);
147  // offset should already be set.
148  root_info->set_root(root);
149  // Not really meaningful.
150  // root_info->set_free_list(std::string());
151 
152  read_only = true;
153  store.rewind(offset);
154  last_key = string();
155 }
156 
157 bool
158 HoneyTable::read_key(std::string& key,
159  size_t& val_size,
160  bool& compressed) const
161 {
162 #ifdef DEBUGGING
163  {
164  string desc;
165  description_append(desc, key);
166  cerr << "HoneyTable::read_key(" << desc << ", ...) for path=" << path
167  << endl;
168  }
169 #endif
170  if (!read_only) {
171  return false;
172  }
173 
174  AssertRel(store.get_pos(), >=, offset);
175  if (store.get_pos() >= root) {
176  AssertEq(store.get_pos(), root);
177  return false;
178  }
179  int ch = store.read();
180  if (ch == EOF) return false;
181 
182  size_t reuse = ch;
183  if (reuse > last_key.size()) {
184  throw Xapian::DatabaseCorruptError("Reuse > previous key size");
185  }
186  ch = store.read();
187  if (ch == EOF) {
188  throw Xapian::DatabaseError("EOF/error while reading key length",
189  errno);
190  }
191  size_t key_size = ch;
192  char buf[256];
193  store.read(buf, key_size);
194  key.assign(last_key, 0, reuse);
195  key.append(buf, key_size);
196  last_key = key;
197 
198 #ifdef DEBUGGING
199  if (false) {
200  std::string esc;
201  description_append(esc, key);
202  std::cout << "K:" << esc << std::endl;
203  }
204 #endif
205 
206  int r;
207  {
208  // FIXME: rework to take advantage of buffering that's happening anyway?
209  char* p = buf;
210  for (int i = 0; i < 8; ++i) {
211  int ch2 = store.read();
212  if (ch2 == EOF) {
213  break;
214  }
215  *p++ = char(ch2);
216  if (ch2 < 128) break;
217  }
218  r = p - buf;
219  }
220  const char* p = buf;
221  const char* end = p + r;
222  if (!unpack_uint(&p, end, &val_size)) {
223  throw Xapian::DatabaseError("val_size unpack_uint invalid");
224  }
225  compressed = val_size & 1;
226  val_size >>= 1;
227  Assert(p == end);
228  return true;
229 }
230 
231 void
232 HoneyTable::read_val(std::string& val, size_t val_size) const
233 {
234  AssertRel(store.get_pos() + val_size, <=, size_t(root));
235  val.resize(val_size);
236  store.read(&(val[0]), val_size);
237 #ifdef DEBUGGING
238  if (false) {
239  std::string esc;
240  description_append(esc, val);
241  std::cout << "V:" << esc << std::endl;
242  }
243 #endif
244 }
245 
246 bool
247 HoneyTable::get_exact_entry(std::string_view key, std::string* tag) const
248 {
249  if (!read_only) std::abort();
250  if (rare(!store.is_open())) {
251  if (store.was_forced_closed())
253  return false;
254  }
255  store.rewind(root);
256  if (rare(key.empty()))
257  return false;
258  bool exact_match = false;
259  bool compressed = false;
260  size_t val_size = 0;
261  int index_type = store.read();
262  switch (index_type) {
263  case EOF:
264  return false;
265  case 0x00: {
266  unsigned char first =
267  static_cast<unsigned char>(key[0] - store.read());
268  unsigned char range = store.read();
269  if (first > range)
270  return false;
271  store.skip(first * 4); // FIXME: pointer width
272  off_t jump = store.read_uint4_be();
273  store.rewind(jump);
274  // The jump point will be an entirely new key (because it is the
275  // first key with that initial character), and we drop in as if
276  // this was the first key so set last_key to be empty.
277  last_key = string();
278  break;
279  }
280  case 0x01: {
281  size_t j = store.read_uint4_be();
282  if (j == 0)
283  return false;
284  off_t base = store.get_pos();
286  size_t kkey_len = 0;
287  size_t i = 0;
288  while (j - i > 1) {
289  size_t k = i + (j - i) / 2;
290  store.set_pos(base + k * SSTINDEX_BINARY_CHOP_ENTRY_SIZE);
291  store.read(kkey, SSTINDEX_BINARY_CHOP_KEY_SIZE);
292  kkey_len = 4;
293  while (kkey_len > 0 && kkey[kkey_len - 1] == '\0') --kkey_len;
294  int r = key.compare(0, SSTINDEX_BINARY_CHOP_KEY_SIZE,
295  kkey, kkey_len);
296  if (r < 0) {
297  j = k;
298  } else {
299  i = k;
300  if (r == 0) {
301  break;
302  }
303  }
304  }
305  store.set_pos(base + i * SSTINDEX_BINARY_CHOP_ENTRY_SIZE);
306  store.read(kkey, SSTINDEX_BINARY_CHOP_KEY_SIZE);
307  kkey_len = 4;
308  while (kkey_len > 0 && kkey[kkey_len - 1] == '\0') --kkey_len;
309  off_t jump = store.read_uint4_be();
310  store.rewind(jump);
311  // The jump point is to the first key with prefix kkey, so will
312  // work if we set last key to kkey. Unless we're jumping to the
313  // start of the table, in which case last_key needs to be empty.
314  last_key.assign(kkey, jump == 0 ? 0 : kkey_len);
315  break;
316  }
317  case 0x02: {
318  // FIXME: If "close" just seek forwards? Or consider seeking from
319  // current index pos?
320  // off_t pos = store.get_pos();
321  string index_key, prev_index_key;
322  make_unsigned_t<off_t> ptr = 0;
323  int cmp0 = 1;
324  while (true) {
325  int reuse = store.read();
326  if (reuse == EOF) break;
327  int len = store.read();
328  if (len == EOF) abort(); // FIXME
329  index_key.resize(reuse + len);
330  store.read(&index_key[reuse], len);
331 
332 #ifdef DEBUGGING
333  {
334  string desc;
335  description_append(desc, index_key);
336  cerr << "Index key: " << desc << endl;
337  }
338 #endif
339 
340  cmp0 = index_key.compare(key);
341  if (cmp0 > 0) {
342  index_key = prev_index_key;
343  break;
344  }
345  char buf[8];
346  char* e = buf;
347  while (true) {
348  int b = store.read();
349  *e++ = b;
350  if ((b & 0x80) == 0) break;
351  }
352  const char* p = buf;
353  if (!unpack_uint(&p, e, &ptr) || p != e) abort(); // FIXME
354 #ifdef DEBUGGING
355  {
356  cerr << " -> " << ptr << endl;
357  }
358 #endif
359  if (cmp0 == 0)
360  break;
361  prev_index_key = index_key;
362  }
363 #ifdef DEBUGGING
364  {
365  cerr << " cmp0 = " << cmp0 << ", going to " << ptr << endl;
366  }
367 #endif
368  store.set_pos(ptr);
369 
370  if (ptr != 0) {
371  last_key = index_key;
372  char buf[8];
373  int r;
374  {
375  // FIXME: rework to take advantage of buffering that's happening anyway?
376  char* p = buf;
377  for (int i = 0; i < 8; ++i) {
378  int ch2 = store.read();
379  if (ch2 == EOF) {
380  break;
381  }
382  *p++ = ch2;
383  if (ch2 < 128) break;
384  }
385  r = p - buf;
386  }
387  const char* p = buf;
388  const char* end = p + r;
389  if (!unpack_uint(&p, end, &val_size)) {
390  throw Xapian::DatabaseError("val_size unpack_uint invalid");
391  }
392  compressed = val_size & 1;
393  val_size >>= 1;
394  Assert(p == end);
395  } else {
396  last_key = string();
397  }
398 
399  if (cmp0 == 0) {
400  exact_match = true;
401  break;
402  }
403 
404 #ifdef DEBUGGING
405  {
406  string desc;
407  description_append(desc, last_key);
408  cerr << "Dropped to data layer on key: " << desc << endl;
409  }
410 #endif
411 
412  break;
413  }
414  default: {
415  string m = "HoneyTable: Unknown index type ";
416  m += str(index_type);
418  }
419  }
420 
421  std::string k;
422  int cmp;
423  if (!exact_match) {
424  do {
425  if (val_size) {
426  // Skip val data we've not looked at.
427  store.skip(val_size);
428  val_size = 0;
429  }
430  if (!read_key(k, val_size, compressed)) return false;
431  cmp = k.compare(key);
432  } while (cmp < 0);
433  if (cmp > 0) return false;
434  }
435  if (tag != NULL) {
436  if (compressed) {
437  std::string v;
438  read_val(v, val_size);
439  CompressionStream comp_stream;
440  comp_stream.decompress_start();
441  tag->resize(0);
442  if (!comp_stream.decompress_chunk(v.data(), v.size(), *tag)) {
443  // Decompression didn't complete.
444  abort();
445  }
446  } else {
447  read_val(*tag, val_size);
448  }
449  }
450  return true;
451 }
452 
455 {
456  if (rare(!store.is_open())) {
457  if (store.was_forced_closed())
459  return NULL;
460  }
461  return new HoneyCursor(this);
462 }
bool decompress_chunk(const char *p, int len, std::string &buf)
Returns true if this was the final chunk.
const char * compress(const char *buf, size_t *p_size)
HoneyCursor * cursor_get() const
Definition: honey_table.cc:454
bool get_exact_entry(std::string_view key, std::string *tag) const
Definition: honey_table.cc:247
void create_and_open(int flags_, const Honey::RootInfo &root_info)
Definition: honey_table.cc:43
void add(std::string_view key, const char *val, size_t val_size, bool compressed=false)
Definition: honey_table.cc:74
bool read_key(std::string &key, size_t &val_size, bool &compressed) const
Definition: honey_table.cc:158
void read_val(std::string &val, size_t val_size) const
Definition: honey_table.cc:232
void commit(honey_revision_number_t, Honey::RootInfo *root_info)
Definition: honey_table.cc:141
void open(int flags_, const Honey::RootInfo &root_info, honey_revision_number_t)
Definition: honey_table.cc:58
void set_num_entries(honey_tablesize_t n)
Definition: honey_version.h:61
uint4 get_compress_min() const
Definition: honey_version.h:58
honey_tablesize_t get_num_entries() const
Definition: honey_version.h:57
off_t get_offset() const
Definition: honey_version.h:55
void set_root(off_t root_)
Definition: honey_version.h:63
off_t get_root() const
Definition: honey_version.h:56
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
DatabaseError indicates some sort of database related error.
Definition: error.h:355
DatabaseOpeningError indicates failure to open a database.
Definition: error.h:569
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
#define rare(COND)
Definition: config.h:607
PositionList * p
Append a string to an object description, escaping invalid UTF-8.
HoneyCursor class.
#define HONEY_MAX_KEY_LENGTH
Maximum key length.
Definition: honey_defs.h:39
uint4 honey_revision_number_t
The revision number of a honey database.
Definition: honey_defs.h:104
HoneyTable class.
#define SSTINDEX_BINARY_CHOP_ENTRY_SIZE
Definition: honey_table.h:34
#define SSTINDEX_BINARY_CHOP_KEY_SIZE
Definition: honey_table.h:32
string str(int value)
Convert int to std::string.
Definition: str.cc:91
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
static void throw_database_closed()
Various handy string-related helpers.
std::string::size_type common_prefix_length(std::string_view a, std::string_view b)
Definition: stringutils.h:128
void description_append(std::string &desc, std::string_view s)
Definition: unittest.cc:105