xapian-core  2.0.0
honey_compact.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "xapian/compactor.h"
24 #include "xapian/constants.h"
25 #include "xapian/error.h"
26 #include "xapian/types.h"
27 
28 #include <algorithm>
29 #include <memory>
30 #include <queue>
31 #include <type_traits>
32 
33 #include <cerrno>
34 
35 #include "backends/flint_lock.h"
36 #include "compression_stream.h"
37 #include "honey_cursor.h"
38 #include "honey_database.h"
39 #include "honey_defs.h"
41 #include "honey_table.h"
42 #include "honey_values.h"
43 #include "honey_version.h"
44 #include "filetests.h"
45 #include "internaltypes.h"
46 #include "overflow.h"
47 #include "pack.h"
48 #include "stringutils.h"
49 #include "backends/valuestats.h"
50 #include "wordaccess.h"
51 
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
54 
55 #ifdef XAPIAN_HAS_GLASS_BACKEND
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
59 #endif
60 
61 using namespace std;
63 
64 [[noreturn]]
65 static void
66 throw_database_corrupt(const char* item, const char* pos)
67 {
68  string message;
69  if (pos != NULL) {
70  message = "Value overflow unpacking termlist: ";
71  } else {
72  message = "Out of data unpacking termlist: ";
73  }
74  message += item;
75  throw Xapian::DatabaseCorruptError(message);
76 }
77 
78 #ifdef XAPIAN_HAS_GLASS_BACKEND
namespace GlassCompact {

/// Test for a key of two or more bytes whose first byte is NUL and whose
/// second byte equals ch.
static inline bool
key_second_byte_is(const string& key, char ch)
{
    return key.size() > 1 && key[0] == '\0' && key[1] == ch;
}

/// True if key's first two bytes are "\0\xc0" (glass user metadata key).
static inline bool
is_user_metadata_key(const string& key)
{
    return key_second_byte_is(key, '\xc0');
}

/// True if key's first two bytes are "\0\xd0" (glass value stats key).
static inline bool
is_valuestats_key(const string& key)
{
    return key_second_byte_is(key, '\xd0');
}

/// True if key's first two bytes are "\0\xd8" (glass value chunk key).
static inline bool
is_valuechunk_key(const string& key)
{
    return key_second_byte_is(key, '\xd8');
}

/// True if key's first two bytes are "\0\xe0" (glass doclen chunk key).
static inline bool
is_doclenchunk_key(const string& key)
{
    return key_second_byte_is(key, '\xe0');
}

}
106 
107 static inline bool
108 termlist_key_is_values_used(const string& key)
109 {
110  const char* p = key.data();
111  const char* e = p + key.size();
112  Xapian::docid did;
113  if (unpack_uint_preserving_sort(&p, e, &did)) {
114  if (p == e)
115  return false;
116  if (e - p == 1 && *p == '\0')
117  return true;
118  }
119  throw Xapian::DatabaseCorruptError("termlist key format");
120 }
121 #endif
122 
123 // Put all the helpers in a namespace to avoid symbols colliding with those of
124 // the same name in other flint-derived backends.
125 namespace HoneyCompact {
126 
128 static inline int
129 key_type(const string& key)
130 {
131  if (key[0] != '\0')
133 
134  if (key.size() <= 1)
135  return -1;
136 
137  unsigned char ch = key[1];
139  return Honey::KEY_VALUE_STATS;
141  return Honey::KEY_VALUE_CHUNK;
144 
145  // Handle Honey::KEY_USER_METADATA, Honey::KEY_POSTING_CHUNK, and currently
146  // unused values.
147  return ch;
148 }
149 
150 template<typename T> class PostlistCursor;
151 
152 #ifdef XAPIAN_HAS_GLASS_BACKEND
153 // Convert glass to honey.
154 template<>
155 class PostlistCursor<const GlassTable&> : private GlassCursor {
156  class DoclenEncoder {
157  string data;
159 
160  size_t pos = 0;
161 
162  public:
163  bool in_progress() const { return pos != 0; }
164 
165  void initialise(Xapian::docid firstdid_,
166  string&& glass_data,
167  size_t data_start) {
// Begin converting one glass docdata (document length) chunk whose
// first docid is firstdid_: take the chunk's raw bytes, validate and
// skip the chunk header, and leave pos indexing the first doclen
// entry.  pos > 0 then doubles as the "conversion in progress" flag
// (see in_progress() above).
168  did = firstdid_;
// NOTE(review): glass_data is bound as an rvalue reference but is
// copy-assigned here rather than moved — presumably acceptable since
// callers hand over ownership anyway; confirm.
169  data = glass_data;
170  const char* d = data.data() + data_start;
171  const char* e = data.data() + data.size();
172 
173  // Skip the "last chunk" flag and increase_to_last.
// An empty remainder means the header is truncated, so the database
// being compacted is corrupt.
174  if (d == e)
175  throw Xapian::DatabaseCorruptError("No last chunk flag in "
176  "glass docdata chunk");
177  ++d;
// increase_to_last is decoded only to advance d past it; the value
// itself is recomputed later from the entries.
178  Xapian::docid increase_to_last;
179  if (!unpack_uint(&d, e, &increase_to_last))
180  throw Xapian::DatabaseCorruptError("Decoding last docid delta "
181  "in glass docdata chunk");
182 
// Record where the doclen entries start; the header is at least one
// byte so this is necessarily non-zero.
183  pos = d - data.data();
184  Assert(pos > 0);
185  }
186 
187  std::tuple<Xapian::docid, Xapian::docid> get_chunk(string& chunk) {
189  chunk.resize(1);
190  chunk[0] = char(32);
191 
192  Assert(pos > 0);
193  const char* d = data.data() + pos;
194  const char* e = data.data() + data.size();
195 
196  // We need to convert the doclen chunk to honey format. One
197  // notable difference between the formats is that glass stores
198  // a delta between each docid (so a large gap is cheap to
199  // represent) whereas honey stores an array of fixed size entries
200  // with unused docids represented by the all-bits-set value (so
201  // a large gap needs O(gap_size) bytes. In an extreme case, on
202  // a 32-bit system we may not be able to allocate enough memory
203  // to represent a chunk that in glass takes just a few bytes
204  // (in honey this case should be handled as two chunks).
205  //
206  // To keep things simple, here we split the data at every gap and
207  // emit a separate honey chunk for each contiguous run of docids
208  // from the glass chunk, then let the code in merge_postlists()
209  // below decide which to join together.
210 
211  Xapian::docid gap_size;
212 
213  Xapian::termcount doclen_max = 0;
214  while (true) {
215  Xapian::termcount doclen;
216  if (!unpack_uint(&d, e, &doclen))
217  throw Xapian::DatabaseCorruptError("Decoding doclen in "
218  "glass docdata chunk");
219 
220  if (doclen > doclen_max) {
221  if (doclen >= 0xffffffff) {
222  // FIXME: Handle these.
223  const char* m = "Document length values >= 0xffffffff "
224  "not currently handled";
226  }
227  doclen_max = doclen;
228  }
229 
230  auto s = chunk.size();
231  chunk.resize(s + 4);
232  unaligned_write4(reinterpret_cast<unsigned char*>(&chunk[s]),
233  doclen);
234 
235  if (d == e) {
236  data = string();
237  pos = 0;
238  break;
239  }
240  if (!unpack_uint(&d, e, &gap_size))
241  throw Xapian::DatabaseCorruptError("Decoding docid "
242  "gap_size in glass "
243  "docdata chunk");
244  if (gap_size) {
245  pos = d - data.data();
246  break;
247  }
248  }
249 
250  Xapian::docid new_chunk_firstdid = did;
251  AssertEq(chunk.size() % 4, 1);
252  // If the maximum possible docid is used then this will overflow to 0.
253  // If this happens, we must be on the final chunk, and the only further
254  // uses of did are in the lines which immediately follow.
255  UNSIGNED_OVERFLOW_OK(did += chunk.size() / 4);
256  // In the overflow case, this will overflow back again.
257  Xapian::docid new_chunk_lastdid = UNSIGNED_OVERFLOW_OK(did - 1);
258  did += gap_size;
259 
260  // Only encode document lengths using a whole number of bytes for
261  // now. We could allow arbitrary bit widths, but it complicates
262  // encoding and decoding so we should consider if the fairly small
263  // additional saving is worth it.
264  if (doclen_max >= 0xffff) {
265  if (doclen_max >= 0xffffff) {
266  // Already encoded for this case.
267  } else {
268  chunk[0] = char(24);
269  Assert(chunk.size() >= 5);
270  char* p = &chunk[1];
271  for (size_t i = 2; i < chunk.size(); i += 4) {
272  memcpy(p, &chunk[i], 3);
273  p += 3;
274  }
275  chunk.resize(p - &chunk[0]);
276  }
277  } else {
278  if (doclen_max >= 0xff) {
279  chunk[0] = char(16);
280  Assert(chunk.size() >= 5);
281  char* p = &chunk[1];
282  for (size_t i = 3; i < chunk.size(); i += 4) {
283  memcpy(p, &chunk[i], 2);
284  p += 2;
285  }
286  chunk.resize(p - &chunk[0]);
287  } else if (chunk.size() > 1) {
288  chunk[0] = char(8);
289  Assert(chunk.size() >= 5);
290  char* p = &chunk[1];
291  for (size_t i = 4; i < chunk.size(); i += 4) {
292  *p++ = chunk[i];
293  }
294  chunk.resize(p - &chunk[0]);
295  }
296  }
297 
298  return std::pair(new_chunk_firstdid, new_chunk_lastdid);
299  }
300  };
301 
303 
304  DoclenEncoder doclen_encoder;
305 
306  glass_tablesize_t value_stats_count = 0;
307 
308  glass_tablesize_t value_chunk_count = 0;
309 
311 
312  public:
313  string key, tag;
319  bool have_wdfs;
320 
322  : GlassCursor(in), offset(offset_), firstdid(0)
323  {
324  rewind();
325  }
326 
327  bool next() {
328  if (doclen_encoder.in_progress()) {
329  // Handle in-progress document length chunk.
330  std::tie(firstdid, chunk_lastdid) = doclen_encoder.get_chunk(tag);
331  return true;
332  }
333 
334  if (value_stats_count > 1) {
335  // Glass uses pack_uint_last() to encode the slot in value stats
336  // keys, which isn't a great choice as the key order doesn't in
337  // general match the slot number order. Honey fixes this, but
338  // that means we need to do more work here to process the entries
339  // in value slot number order
340 start_value_stats:
341  // Jump to the next value stats entry in slot number order.
342  key.assign("\0\xd0", 2);
343  while (true) {
344  key.resize(2);
345  pack_uint_last(key, UNSIGNED_OVERFLOW_OK(++slot));
346  if (find_exact(key)) break;
347  }
348  --value_stats_count;
349  // Adjust key.
350  key = Honey::make_valuestats_key(slot);
351  tag = current_tag;
352  return true;
353  } else if (value_stats_count == 1) {
354  // We've done all the value stats so move to just before the first
355  // entry after the value stats (so GlassCursor::next() below moves
356  // us to the next entry to do).
357  value_stats_count = 0;
358  find_entry_lt("\0\xd1"s);
359  }
360 
361  if (value_chunk_count > 1) {
362  // Glass uses pack_uint() to encode the slot in value chunk
363  // keys, which isn't a great choice as the key order doesn't in
364  // general match the slot number order. Honey fixes this, but
365  // that means we need to do more work here to process the entries
366  // in value slot number order
367 start_value_chunk:
368  // First check if there are more chunks for the slot we're working
369  // on.
370  key.assign("\0\xd8", 2);
371  pack_uint(key, slot);
373  !GlassCursor::next() ||
374  !startswith(current_key, key)) {
375  // No more chunks for the same slot, so jump to the next value
376  // chunk entry in slot number order.
377  while (true) {
378  key.resize(2);
379  pack_uint(key, UNSIGNED_OVERFLOW_OK(++slot));
380  find_entry_ge(key);
381  if (startswith(current_key, key)) break;
382  }
383  }
384  --value_chunk_count;
385 
386  // Adjust key.
387  const char* p = current_key.data();
388  const char* end = p + current_key.size();
389  p += key.size();
390  Xapian::docid first_did;
391  if (!unpack_uint_preserving_sort(&p, end, &first_did))
392  throw Xapian::DatabaseCorruptError("bad value key");
393  first_did += offset;
394 
395  read_tag();
396  tag = current_tag;
397  Glass::ValueChunkReader reader(tag.data(), tag.size(), first_did);
398  Xapian::docid last_did = first_did;
399  while (reader.next(), !reader.at_end()) {
400  last_did = reader.get_docid();
401  }
402 
403  key = Honey::make_valuechunk_key(slot, last_did);
404 
405  // Add the docid delta across the chunk to the start of the tag.
406  string newtag;
407  pack_uint(newtag, last_did - first_did);
408  tag.insert(0, newtag);
409  return true;
410  } else if (value_chunk_count == 1) {
411  // We've done all the value chunks so move to just before the first
412  // entry after the value chunks (so GlassCursor::next() below moves
413  // us to the next entry to do).
414  value_chunk_count = 0;
415  find_entry_lt("\0\xd9"s);
416  }
417 
418  if (!GlassCursor::next()) return false;
419 
420  if (GlassCompact::is_valuestats_key(current_key)) {
421  // Set value_stats_count to one more than the number of entries so
422  // we can easily reset after the final entry.
423  value_stats_count = 2;
424  while (GlassCursor::next() &&
425  GlassCompact::is_valuestats_key(current_key)) {
426  ++value_stats_count;
427  }
428  // Set slot so incrementing it gives zero.
429  slot = Xapian::valueno(-1);
430  goto start_value_stats;
431  }
432 
433  if (GlassCompact::is_valuechunk_key(current_key)) {
434  // Set value_chunk_count to one more than the number of entries so
435  // we can easily reset after the final entry.
436  value_chunk_count = 2;
437  while (GlassCursor::next() &&
438  GlassCompact::is_valuechunk_key(current_key)) {
439  ++value_chunk_count;
440  }
441  // Set slot so incrementing it gives zero.
442  slot = Xapian::valueno(-1);
443  goto start_value_chunk;
444  }
445 
446  key = current_key;
447 
448  // We put all chunks into the non-initial chunk form here, then fix up
449  // the first chunk for each term in the merged database as we merge.
450  read_tag();
451  tag = current_tag;
452  tf = cf = 0;
454  key[1] = Honey::KEY_USER_METADATA;
455  return true;
456  }
457 
459  const char* d = key.data();
460  const char* e = d + key.size();
461  d += 2;
462 
463  size_t data_start = 0;
464  if (d == e) {
465  // This is an initial chunk, so adjust tag header.
466  d = tag.data();
467  e = d + tag.size();
468  if (!unpack_uint(&d, e, &tf) ||
469  !unpack_uint(&d, e, &cf) ||
470  !unpack_uint(&d, e, &firstdid)) {
471  throw Xapian::DatabaseCorruptError("Bad postlist key");
472  }
473  ++firstdid;
474  data_start = d - tag.data();
475  } else {
476  // Not an initial chunk, just unpack firstdid.
477  if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
478  throw Xapian::DatabaseCorruptError("Bad postlist key");
479  }
480  firstdid += offset;
481 
482  // Set key to placeholder value which just indicates that this is a
483  // doclen chunk.
484  static const char doclen_key_prefix[2] = {
485  0, char(Honey::KEY_DOCLEN_CHUNK)
486  };
487  key.assign(doclen_key_prefix, 2);
488 
489  doclen_encoder.initialise(firstdid, std::move(tag), data_start);
490  std::tie(firstdid, chunk_lastdid) = doclen_encoder.get_chunk(tag);
491  return true;
492  }
493 
494  // Adjust key if this is *NOT* an initial chunk.
495  // key is: pack_string_preserving_sort(key, tname)
496  // plus optionally: pack_uint_preserving_sort(key, did)
497  const char* d = key.data();
498  const char* e = d + key.size();
499  string tname;
500  if (!unpack_string_preserving_sort(&d, e, tname))
501  throw Xapian::DatabaseCorruptError("Bad postlist key");
502 
503  if (d == e) {
504  // This is an initial chunk for a term, so adjust tag header.
505  d = tag.data();
506  e = d + tag.size();
507  if (!unpack_uint(&d, e, &tf) ||
508  !unpack_uint(&d, e, &cf) ||
509  !unpack_uint(&d, e, &firstdid)) {
510  throw Xapian::DatabaseCorruptError("Bad postlist key");
511  }
512  ++firstdid;
513  have_wdfs = (cf != 0);
514  tag.erase(0, d - tag.data());
515  wdf_max = 0;
516  } else {
517  // Not an initial chunk, so adjust key.
518  size_t tmp = d - key.data();
519  if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
520  throw Xapian::DatabaseCorruptError("Bad postlist key");
521  key.erase(tmp - 1);
522  }
523  firstdid += offset;
524 
525  d = tag.data();
526  e = d + tag.size();
527 
528  // Convert posting chunk to honey format, but without any header.
529  string newtag;
530 
531  // Skip the "last chunk" flag; decode increase_to_last.
532  if (d == e)
533  throw Xapian::DatabaseCorruptError("No last chunk flag in glass "
534  "posting chunk");
535  ++d;
536  Xapian::docid increase_to_last;
537  if (!unpack_uint(&d, e, &increase_to_last))
538  throw Xapian::DatabaseCorruptError("Decoding last docid delta in "
539  "glass posting chunk");
540  chunk_lastdid = firstdid + increase_to_last;
541  if (!unpack_uint(&d, e, &first_wdf))
542  throw Xapian::DatabaseCorruptError("Decoding first wdf in glass "
543  "posting chunk");
544  if ((first_wdf != 0) != have_wdfs) {
545  // FIXME: Also need to adjust document length, total document
546  // length.
547 
548  // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
549  // first_wdf = 1;
550  // ++cf;
551  throw Xapian::DatabaseError("Honey does not support a term having "
552  "both zero and non-zero wdf");
553  }
554  wdf_max = max(wdf_max, first_wdf);
555 
556  while (d != e) {
557  Xapian::docid delta;
558  if (!unpack_uint(&d, e, &delta))
559  throw Xapian::DatabaseCorruptError("Decoding docid delta in "
560  "glass posting chunk");
561  pack_uint(newtag, delta);
562  Xapian::termcount wdf;
563  if (!unpack_uint(&d, e, &wdf))
564  throw Xapian::DatabaseCorruptError("Decoding wdf in glass "
565  "posting chunk");
566  if ((wdf != 0) != have_wdfs) {
567  // FIXME: Also need to adjust document length, total document
568  // length.
569 
570  // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
571  // wdf = 1;
572  // ++cf;
573  throw Xapian::DatabaseError("Honey does not support a term "
574  "having both zero and non-zero "
575  "wdf");
576  }
577  if (have_wdfs) {
578  pack_uint(newtag, wdf);
579  wdf_max = max(wdf_max, wdf);
580  }
581  }
582 
583  swap(tag, newtag);
584 
585  return true;
586  }
587 };
588 #endif
589 
590 template<>
591 class PostlistCursor<const HoneyTable&> : private HoneyCursor {
593 
594  public:
595  string key, tag;
602  bool have_wdfs;
603 
605  : HoneyCursor(in), offset(offset_), firstdid(0)
606  {
607  rewind();
608  }
609 
610  bool next() {
611  if (!HoneyCursor::next()) return false;
612  // We put all chunks into the non-initial chunk form here, then fix up
613  // the first chunk for each term in the merged database as we merge.
614  read_tag();
615  key = current_key;
616  tag = current_tag;
617  switch (key_type(key)) {
620  return true;
621  case Honey::KEY_VALUE_CHUNK: {
622  const char* p = key.data();
623  const char* end = p + key.length();
624  p += 2;
625  Xapian::valueno slot;
626  if (p[-1] != char(Honey::KEY_VALUE_CHUNK_HI)) {
627  slot = p[-1] - Honey::KEY_VALUE_CHUNK;
628  } else {
629  if (!unpack_uint_preserving_sort(&p, end, &slot))
630  throw Xapian::DatabaseCorruptError("bad value key");
631  }
632  Xapian::docid did;
633  if (!unpack_uint_preserving_sort(&p, end, &did))
634  throw Xapian::DatabaseCorruptError("bad value key");
635  key = Honey::make_valuechunk_key(slot, did + offset);
636  return true;
637  }
640  if (did == 0)
641  throw Xapian::DatabaseCorruptError("Bad doclen key");
642  chunk_lastdid = UNSIGNED_OVERFLOW_OK(did + offset);
643  // Each doclen chunk has a one byte header giving the number of
644  // bits per entry (which currently can be 8, 16, 24 or 32. We
645  // want to subtract one less than the number of entries in the
646  // chunk to get the first did, so we subtract an extra one
647  // before the division and integer division will rounding down
648  // to give us the result we want.
649  firstdid = chunk_lastdid - (tag.size() - 2) / (tag[0] / 8);
650  // Normalise so all doclen chunk keys are the same.
651  key.assign(KEY_DOCLEN_PREFIX, 2);
652  return true;
653  }
655  break;
656  default:
657  throw Xapian::DatabaseCorruptError("Bad postlist table key "
658  "type");
659  }
660 
661  // Adjust key if this is *NOT* an initial chunk.
662  // key is: pack_string_preserving_sort(key, term)
663  // plus optionally: pack_uint_preserving_sort(key, did)
664  const char* d = key.data();
665  const char* e = d + key.size();
666  string term;
667  if (!unpack_string_preserving_sort(&d, e, term))
668  throw Xapian::DatabaseCorruptError("Bad postlist key");
669 
670  if (d == e) {
671  // This is an initial chunk for a term, so remove tag header.
672  d = tag.data();
673  e = d + tag.size();
674 
675  Xapian::docid lastdid;
676  if (!decode_initial_chunk_header(&d, e, tf, cf,
677  firstdid, lastdid, chunk_lastdid,
678  first_wdf, wdf_max)) {
679  throw Xapian::DatabaseCorruptError("Bad postlist initial "
680  "chunk header");
681  }
682  // Ignore lastdid - we'll need to recalculate it (at least when
683  // merging, and for simplicity we always do).
684  (void)lastdid;
685  tag.erase(0, d - tag.data());
686 
687  if (tf <= 2) {
688  have_wdfs = false;
689  Assert(tag.empty());
690  } else {
691  have_wdfs = (cf != 0) && (cf - first_wdf != tf - 1);
692  Xapian::termcount remaining_cf_for_flat_wdf;
693  if (have_wdfs &&
694  !mul_overflows(tf - 1, wdf_max,
695  remaining_cf_for_flat_wdf) &&
696  cf - first_wdf == remaining_cf_for_flat_wdf) {
697  // The wdf is flat for the second and subsequent entries
698  // so we don't need to store it.
699  have_wdfs = false;
700  }
701  }
702  } else {
703  if (cf > 0) {
704  // The cf we report should only be non-zero for initial chunks
705  // (of which there may be several for the same term from
706  // different instances of this class when merging databases)
707  // and gets summed to give the total cf for the term, so we
708  // need to zero it here, after potentially using it to
709  // calculate first_wdf.
710  //
711  // If cf is zero for a term, first_wdf will be zero for all
712  // chunks so doesn't need setting specially here.
713  if (!have_wdfs)
714  first_wdf = (cf - first_wdf) / (tf - 1);
715  cf = 0;
716  }
717  tf = 0;
718 
719  // Not an initial chunk, so adjust key and remove tag header.
720  size_t tmp = d - key.data();
721  if (!unpack_uint_preserving_sort(&d, e, &chunk_lastdid) ||
722  d != e) {
723  throw Xapian::DatabaseCorruptError("Bad postlist key");
724  }
725  // -1 to remove the terminating zero byte too.
726  key.erase(tmp - 1);
727 
728  d = tag.data();
729  e = d + tag.size();
730 
731  if (have_wdfs) {
732  if (!decode_delta_chunk_header(&d, e, chunk_lastdid, firstdid,
733  first_wdf)) {
734  throw Xapian::DatabaseCorruptError("Bad postlist delta "
735  "chunk header");
736  }
737  } else {
738  if (!decode_delta_chunk_header_no_wdf(&d, e, chunk_lastdid,
739  firstdid)) {
740  throw Xapian::DatabaseCorruptError("Bad postlist delta "
741  "chunk header");
742  }
743  }
744  tag.erase(0, d - tag.data());
745  }
746  UNSIGNED_OVERFLOW_OK(firstdid += offset);
747  UNSIGNED_OVERFLOW_OK(chunk_lastdid += offset);
748  return true;
749  }
750 };
751 
752 template<>
754  public:
756  : PostlistCursor<const HoneyTable&>(in, offset_) {}
757 };
758 
759 template<typename T>
761  public:
764  bool operator()(const T* a, const T* b) const {
765  if (a->key > b->key) return true;
766  if (a->key != b->key) return false;
767  return (a->firstdid > b->firstdid);
768  }
769 };
770 
771 // U : vector<HoneyTable*>::const_iterator
772 template<typename T, typename U> void
774  T* out, vector<Xapian::docid>::const_iterator offset,
775  U b, U e)
776 {
777  typedef decltype(**b) table_type; // E.g. HoneyTable
778  typedef PostlistCursor<table_type> cursor_type;
779  typedef PostlistCursorGt<cursor_type> gt_type;
780  priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
781  for ( ; b != e; ++b, ++offset) {
782  auto in = *b;
783  auto cursor = new cursor_type(in, *offset);
784  if (cursor->next()) {
785  pq.push(cursor);
786  } else {
787  // Skip empty tables.
788  delete cursor;
789  }
790  }
791 
792  string last_key;
793  {
794  // Merge user metadata.
795  vector<string> tags;
796  while (!pq.empty()) {
797  cursor_type* cur = pq.top();
798  const string& key = cur->key;
799  if (key_type(key) != Honey::KEY_USER_METADATA) break;
800 
801  if (key != last_key) {
802  if (!tags.empty()) {
803  if (tags.size() > 1 && compactor) {
804  Assert(!last_key.empty());
805  // FIXME: It would be better to merge all duplicates
806  // for a key in one call, but currently we don't in
807  // multipass mode.
808  const string& resolved_tag =
809  compactor->resolve_duplicate_metadata(last_key,
810  tags.size(),
811  &tags[0]);
812  if (!resolved_tag.empty())
813  out->add(last_key, resolved_tag);
814  } else {
815  Assert(!last_key.empty());
816  out->add(last_key, tags[0]);
817  }
818  tags.resize(0);
819  }
820  last_key = key;
821  }
822  tags.push_back(cur->tag);
823 
824  pq.pop();
825  if (cur->next()) {
826  pq.push(cur);
827  } else {
828  delete cur;
829  }
830  }
831  if (!tags.empty()) {
832  if (tags.size() > 1 && compactor) {
833  Assert(!last_key.empty());
834  const string& resolved_tag =
835  compactor->resolve_duplicate_metadata(last_key,
836  tags.size(),
837  &tags[0]);
838  if (!resolved_tag.empty())
839  out->add(last_key, resolved_tag);
840  } else {
841  Assert(!last_key.empty());
842  out->add(last_key, tags[0]);
843  }
844  }
845  }
846 
847  {
848  // Merge valuestats.
849  Xapian::doccount freq = 0;
850  string lbound, ubound;
851 
852  while (!pq.empty()) {
853  cursor_type* cur = pq.top();
854  const string& key = cur->key;
855  if (key_type(key) != Honey::KEY_VALUE_STATS) break;
856  if (key != last_key) {
857  // For the first valuestats key, last_key will be the previous
858  // key we wrote, which we don't want to overwrite. This is the
859  // only time that freq will be 0, so check that.
860  if (freq) {
861  out->add(last_key, encode_valuestats(freq, lbound, ubound));
862  freq = 0;
863  }
864  last_key = key;
865  }
866 
867  const string& tag = cur->tag;
868 
869  const char* pos = tag.data();
870  const char* end = pos + tag.size();
871 
873  string l, u;
874  if (!unpack_uint(&pos, end, &f)) {
875  if (*pos == 0)
876  throw Xapian::DatabaseCorruptError("Incomplete stats item "
877  "in value table");
878  throw Xapian::RangeError("Frequency statistic in value table "
879  "is too large");
880  }
881  if (!unpack_string(&pos, end, l)) {
882  if (*pos == 0)
883  throw Xapian::DatabaseCorruptError("Incomplete stats item "
884  "in value table");
885  throw Xapian::RangeError("Lower bound in value table is too "
886  "large");
887  }
888  size_t len = end - pos;
889  if (len == 0) {
890  u = l;
891  } else {
892  u.assign(pos, len);
893  }
894  if (freq == 0) {
895  freq = f;
896  lbound = l;
897  ubound = u;
898  } else {
899  freq += f;
900  if (l < lbound) lbound = l;
901  if (u > ubound) ubound = u;
902  }
903 
904  pq.pop();
905  if (cur->next()) {
906  pq.push(cur);
907  } else {
908  delete cur;
909  }
910  }
911 
912  if (freq) {
913  out->add(last_key, encode_valuestats(freq, lbound, ubound));
914  }
915  }
916 
917  // Merge valuestream chunks.
918  while (!pq.empty()) {
919  cursor_type* cur = pq.top();
920  const string& key = cur->key;
921  if (key_type(key) != Honey::KEY_VALUE_CHUNK) break;
922  out->add(key, cur->tag);
923  pq.pop();
924  if (cur->next()) {
925  pq.push(cur);
926  } else {
927  delete cur;
928  }
929  }
930 
931  // Merge doclen chunks.
932  while (!pq.empty()) {
933  cursor_type* cur = pq.top();
934  if (key_type(cur->key) != Honey::KEY_DOCLEN_CHUNK) break;
935  string tag = std::move(cur->tag);
936  auto chunk_lastdid = cur->chunk_lastdid;
937  pq.pop();
938  if (cur->next()) {
939  pq.push(cur);
940  } else {
941  delete cur;
942  }
943  while (!pq.empty()) {
944  cur = pq.top();
945  if (key_type(cur->key) != Honey::KEY_DOCLEN_CHUNK)
946  break;
947  if (tag[0] != cur->tag[0]) {
948  // Different width values in the two tags, so punt for now.
949  // FIXME: We would ideally optimise the total size here.
950  break;
951  }
952  size_t byte_width = tag[0] / 8;
953  auto new_size = tag.size();
954  Xapian::docid gap_size = cur->firstdid - chunk_lastdid - 1;
955  new_size += gap_size * byte_width;
956  if (new_size >= HONEY_DOCLEN_CHUNK_MAX) {
957  // The gap spans beyond HONEY_DOCLEN_CHUNK_MAX.
958  break;
959  }
960  new_size += cur->tag.size() - 1;
961  auto full_new_size = new_size;
962  if (new_size > HONEY_DOCLEN_CHUNK_MAX) {
963  if (byte_width > 1) {
964  // HONEY_DOCLEN_CHUNK_MAX should be one more than a
965  // multiple of 12 so for widths 1,2,3,4 we can fit the
966  // initial byte which indicates the width for the chunk
967  // plus an exact number of entries.
968  auto m = (new_size - HONEY_DOCLEN_CHUNK_MAX) % byte_width;
969  (void)m;
970  AssertEq(m, 0);
971  }
972  new_size = HONEY_DOCLEN_CHUNK_MAX;
973  }
974  tag.reserve(new_size);
975  tag.append(byte_width * gap_size, '\xff');
976  if (new_size != full_new_size) {
977  // Partial copy.
978  auto copy_size = new_size - tag.size();
979  tag.append(cur->tag, 1, copy_size);
980  cur->tag.erase(1, copy_size);
981  copy_size /= byte_width;
982  cur->firstdid += copy_size;
983  chunk_lastdid += gap_size;
984  chunk_lastdid += copy_size;
985  break;
986  }
987 
988  tag.append(cur->tag, 1, string::npos);
989  chunk_lastdid = cur->chunk_lastdid;
990 
991  pq.pop();
992  if (cur->next()) {
993  pq.push(cur);
994  } else {
995  delete cur;
996  }
997  }
998  out->add(Honey::make_doclenchunk_key(chunk_lastdid), tag);
999  }
1000 
1001  struct HoneyPostListChunk {
1002  Xapian::docid first, last;
1003  Xapian::termcount first_wdf;
1004  Xapian::termcount wdf_max;
1005 
1006  private:
1007  Xapian::doccount tf;
1008  Xapian::termcount cf;
1009  bool have_wdfs;
1010  string data;
1011 
1012  public:
1013  HoneyPostListChunk(Xapian::docid first_,
1014  Xapian::docid last_,
1015  Xapian::termcount first_wdf_,
1016  Xapian::termcount wdf_max_,
1017  Xapian::doccount tf_,
1018  Xapian::termcount cf_,
1019  bool have_wdfs_,
1020  string&& data_)
1021  : first(first_),
1022  last(last_),
1023  first_wdf(first_wdf_),
1024  wdf_max(wdf_max_),
1025  tf(tf_),
1026  cf(cf_),
1027  have_wdfs(have_wdfs_),
1028  data(data_) {}
1029 
1034  size_t data_size(bool want_wdfs) const {
1035  if (data.empty()) {
1036  // Allow one byte for each docid delta, and another byte for
1037  // each wdf (if explicitly stored).
1038  return tf * (1u + size_t(want_wdfs));
1039  }
1040 
1041  if (have_wdfs == want_wdfs) {
1042  // Allow 1 for the explicit wdf once spliced.
1043  return data.size() + size_t(want_wdfs);
1044  }
1045 
1046  if (have_wdfs) {
1047  // Assume stripping wdfs will halve the size.
1048  return (data.size() + 1u) / 2u;
1049  }
1050 
1051  // Assume adding wdfs will double the size.
1052  return data.size() * 2u;
1053  }
1054 
1056  void append_postings_to(string& tag, bool want_wdfs) {
1057  if (data.empty()) {
1058  if (tf == 1) {
1059  return;
1060  }
1061  AssertEq(tf, 2);
1062  pack_uint(tag, last - first - 1);
1063  if (want_wdfs) {
1064  pack_uint(tag, cf - first_wdf);
1065  }
1066  return;
1067  }
1068 
1069  if (have_wdfs == want_wdfs) {
1070  // We can just copy the encoded data.
1071  tag += data;
1072  } else if (want_wdfs) {
1073  // Need to add wdfs which were implicit.
1074  auto wdf = (cf - first_wdf) / (tf - 1);
1075  const char* pos = data.data();
1076  const char* pos_end = pos + data.size();
1077  while (pos != pos_end) {
1078  Xapian::docid delta;
1079  if (!unpack_uint(&pos, pos_end, &delta))
1080  throw_database_corrupt("Decoding docid delta", pos);
1081  pack_uint(tag, delta);
1082  pack_uint(tag, wdf);
1083  }
1084  } else {
1085  // Need to remove wdfs which are now implicit.
1086  const char* pos = data.data();
1087  const char* pos_end = pos + data.size();
1088  while (pos != pos_end) {
1089  Xapian::docid delta;
1090  if (!unpack_uint(&pos, pos_end, &delta))
1091  throw_database_corrupt("Decoding docid delta", pos);
1092  pack_uint(tag, delta);
1093  Xapian::termcount wdf;
1094  if (!unpack_uint(&pos, pos_end, &wdf))
1095  throw_database_corrupt("Decoding wdf", pos);
1096  (void)wdf;
1097  }
1098  }
1099  }
1100 
1102  void append_postings_to(string& tag, bool want_wdfs,
1103  Xapian::docid splice_did) {
1104  pack_uint(tag, first - splice_did - 1);
1105  if (want_wdfs) {
1106  pack_uint(tag, first_wdf);
1107  }
1108  append_postings_to(tag, want_wdfs);
1109  }
1110  };
1111  vector<HoneyPostListChunk> tags;
1112 
1113  Xapian::termcount tf = 0, cf = 0; // Initialise to avoid warnings.
1114 
1115  while (true) {
1116  cursor_type* cur = NULL;
1117  if (!pq.empty()) {
1118  cur = pq.top();
1119  pq.pop();
1120  }
1121  if (cur) {
1123  }
1124  if (cur == NULL || cur->key != last_key) {
1125  if (!tags.empty()) {
1126  Xapian::termcount first_wdf = tags[0].first_wdf;
1127  Xapian::docid chunk_lastdid = tags[0].last;
1128  Xapian::docid last_did = tags.back().last;
1129  Xapian::termcount wdf_max =
1130  max_element(tags.begin(), tags.end(),
1131  [](const HoneyPostListChunk& x,
1132  const HoneyPostListChunk& y) {
1133  return x.wdf_max < y.wdf_max;
1134  })->wdf_max;
1135 
1136  bool have_wdfs = true;
1137  if (cf == 0) {
1138  // wdf must always be zero.
1139  have_wdfs = false;
1140  } else if (tf <= 2) {
1141  // We only need to store cf and first_wdf to know all wdfs.
1142  have_wdfs = false;
1143  } else if (cf == tf - 1 + first_wdf) {
1144  // wdf must be 1 for second and subsequent entries.
1145  have_wdfs = false;
1146  } else {
1147  Xapian::termcount remaining_cf_for_flat_wdf;
1148  if (!mul_overflows(tf - 1, wdf_max,
1149  remaining_cf_for_flat_wdf) &&
1150  cf - first_wdf == remaining_cf_for_flat_wdf) {
1151  // The wdf is flat for the second and subsequent entries
1152  // so we don't need to store it.
1153  have_wdfs = false;
1154  }
1155  }
1156 
1157  // Decide whether to concatenate chunks, and if so how many.
1158  // We need to know the final docid in the chunk in order to
1159  // encode the header. We merge [0,j) here.
1160  size_t j = 1;
1161  if (tags.size() > 1) {
1162  if (tf <= HONEY_POSTLIST_CHUNK_MAX / (2 * (64 / 7 + 1))) {
1163  // The worst-case size if we merged all the chunks is
1164  // small enough to just have one chunk.
1165  //
1166  // Each posting needs a docid delta and wdf value, both
1167  // of which might be 64 bit and the encoding used needs
1168  // a byte for every 7 bits up to the msb set bit.
1169  j = tags.size();
1170  } else {
1171  size_t est = tags[0].data_size(have_wdfs);
1172  while (j < tags.size()) {
1173  est += tags[j].data_size(have_wdfs);
1174  if (est > HONEY_POSTLIST_CHUNK_MAX) break;
1175  ++j;
1176  }
1177  }
1178  }
1179 
1180  chunk_lastdid = tags[j - 1].last;
1181 
1182  string first_tag;
1183  encode_initial_chunk_header(tf, cf, tags[0].first, last_did,
1184  chunk_lastdid,
1185  first_wdf, wdf_max, first_tag);
1186 
1187  if (tf > 2) {
1188  // If tf <= 2 there's no explicit posting data.
1189  tags[0].append_postings_to(first_tag, have_wdfs);
1190  for (size_t chunk = 1; chunk != j; ++chunk) {
1191  tags[chunk].append_postings_to(first_tag, have_wdfs,
1192  tags[chunk - 1].last);
1193  }
1194  }
1195  out->add(last_key, first_tag);
1196 
1197  if (j != tags.size()) {
1198  // Output continuation chunk(s).
1199  string term;
1200  const char* p = last_key.data();
1201  const char* end = p + last_key.size();
1202  if (!unpack_string_preserving_sort(&p, end, term) ||
1203  p != end) {
1204  throw Xapian::DatabaseCorruptError("Bad postlist "
1205  "chunk key");
1206  }
1207 
1208  while (j < tags.size()) {
1209  // Here we merge tags [i,j) to a continuation chunk.
1210  size_t i = j;
1211  size_t est = tags[j].data_size(have_wdfs);
1212  while (++j < tags.size()) {
1213  est += tags[j].data_size(have_wdfs);
1214  if (est > HONEY_POSTLIST_CHUNK_MAX) break;
1215  }
1216 
1217  last_did = tags[j - 1].last;
1218  string tag;
1219  if (have_wdfs) {
1221  last_did,
1222  tags[i].first_wdf,
1223  tag);
1224  } else {
1226  last_did,
1227  tag);
1228  }
1229 
1230  tags[i].append_postings_to(tag, have_wdfs);
1231  while (++i != j) {
1232  tags[i].append_postings_to(tag, have_wdfs,
1233  tags[i - 1].last);
1234  }
1235 
1236  out->add(pack_honey_postlist_key(term, last_did), tag);
1237  }
1238  }
1239  }
1240  tags.clear();
1241  if (cur == NULL) break;
1242  tf = cf = 0;
1243  last_key = cur->key;
1244  }
1245 
1246  if (tf && cur->tf && (cf == 0) != (cur->cf == 0)) {
1247  // FIXME: Also need to adjust document length, total document
1248  // length.
1249  // Convert wdf=0 to 1 when the term has non-zero wdf elsewhere.
1250  throw Xapian::DatabaseError("Honey does not support a term having "
1251  "both zero and non-zero wdf");
1252  }
1253  tf += cur->tf;
1254  cf += cur->cf;
1255 
1256  tags.push_back(HoneyPostListChunk(cur->firstdid,
1257  cur->chunk_lastdid,
1258  cur->first_wdf,
1259  cur->wdf_max,
1260  cur->tf,
1261  cur->cf,
1262  cur->have_wdfs,
1263  std::move(cur->tag)));
1264  if (cur->next()) {
1265  pq.push(cur);
1266  } else {
1267  delete cur;
1268  }
1269  }
1270 }
1271 
// Primary template is only declared - just the specialisations below are
// ever instantiated.
template<typename T> struct MergeCursor;

#ifdef XAPIAN_HAS_GLASS_BACKEND
/// Cursor over a GlassTable, positioned at the first entry on construction.
template<>
struct MergeCursor<const GlassTable&> : public GlassCursor {
    explicit MergeCursor(const GlassTable* in) : GlassCursor(in) {
	rewind();
    }
};
#endif

/// Cursor over a HoneyTable, positioned at the first entry on construction.
template<>
struct MergeCursor<const HoneyTable&> : public HoneyCursor {
    explicit MergeCursor(const HoneyTable* in) : HoneyCursor(in) {
	rewind();
    }
};
1289 
template<typename T>
struct CursorGt {
    /** Compare two cursors by their current key.
     *
     *  Returns true iff lhs should sort after rhs.  A cursor which is
     *  already past the end sorts after every cursor which isn't, so
     *  exhausted cursors sink to the bottom of a priority queue using
     *  this comparator.
     */
    bool operator()(const T* lhs, const T* rhs) const {
	// Nothing sorts after an exhausted rhs.
	if (rhs->after_end()) return false;
	// An exhausted lhs sorts after any live rhs; otherwise order by key.
	return lhs->after_end() || lhs->current_key > rhs->current_key;
    }
};
1299 
1300 #ifdef XAPIAN_HAS_GLASS_BACKEND
1301 // Convert glass to honey.
1302 static void
1304  vector<const GlassTable*>::const_iterator b,
1305  vector<const GlassTable*>::const_iterator e)
1306 {
1307  typedef MergeCursor<const GlassTable&> cursor_type;
1308  typedef CursorGt<cursor_type> gt_type;
1309  priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1310  for ( ; b != e; ++b) {
1311  auto in = *b;
1312  auto cursor = new cursor_type(in);
1313  if (cursor->next()) {
1314  pq.push(cursor);
1315  } else {
1316  // Skip empty tables.
1317  delete cursor;
1318  }
1319  }
1320 
1321  while (!pq.empty()) {
1322  cursor_type* cur = pq.top();
1323  pq.pop();
1324 
1325  // Glass vs honey spelling key prefix map:
1326  //
1327  // Type Glass Honey
1328  //
1329  // bookend Bbd \x00bd
1330  // head Hhe \x01he
1331  // middle Mddl \x02ddl
1332  // tail Til \x03il
1333  // word Wword word
1334  //
1335  // In honey, if the word's first byte is <= '\x04', we add a prefix
1336  // of '\x04' (which is unlikely in real world use but ensures that
1337  // we can handle arbitrary strings).
1338  //
1339  // The prefixes for honey are chosen such that we save a byte for the
1340  // key of all real-world words, and so that more first bytes are used
1341  // so that a top-level array index is more effective, especially for
1342  // random-access lookup of word entries (which we do to check the
1343  // frequency of potential spelling candidates).
1344  //
1345  // The other prefixes are chosen such that we can do look up in key
1346  // order at search time, which is more efficient for a cursor which can
1347  // only efficiently move forwards.
1348  //
1349  // Note that the key ordering is the same for glass and honey, which
1350  // makes translating during compaction simpler.
1351  string key = cur->current_key;
1352  switch (key[0]) {
1353  case 'B':
1354  key[0] = Honey::KEY_PREFIX_BOOKEND;
1355  break;
1356  case 'H':
1357  key[0] = Honey::KEY_PREFIX_HEAD;
1358  break;
1359  case 'M':
1360  key[0] = Honey::KEY_PREFIX_MIDDLE;
1361  break;
1362  case 'T':
1363  key[0] = Honey::KEY_PREFIX_TAIL;
1364  break;
1365  case 'W':
1366  if (static_cast<unsigned char>(key[1]) > Honey::KEY_PREFIX_WORD)
1367  key.erase(0, 1);
1368  else
1369  key[0] = Honey::KEY_PREFIX_WORD;
1370  break;
1371  default: {
1372  string m = "Bad spelling key prefix: ";
1373  m += static_cast<unsigned char>(key[0]);
1375  }
1376  }
1377 
1378  if (pq.empty() || pq.top()->current_key > key) {
1379  // No merging to do for this key so just copy the tag value,
1380  // adjusting if necessary. If we don't need to adjust it, just
1381  // copy the compressed value.
1382  bool compressed;
1383  switch (key[0]) {
1384  case Honey::KEY_PREFIX_HEAD: {
1385  // Honey omits the first two bytes from the first entry
1386  // since we know what those most be from the key
1387  // (subsequent entries won't include those anyway, since
1388  // they'll be part of the reused prefix).
1389  compressed = cur->read_tag(false);
1390  unsigned char len = cur->current_tag[0] ^ MAGIC_XOR_VALUE;
1391  cur->current_tag[0] = (len - 2) ^ MAGIC_XOR_VALUE;
1392  AssertEq(cur->current_tag[1], key[1]);
1393  AssertEq(cur->current_tag[2], key[2]);
1394  cur->current_tag.erase(1, 2);
1395  break;
1396  }
1399  // Need to repack because honey omits the last 2 or 1 bytes
1400  // from each entry (2 for tail, 1 for bookend) since we
1401  // know what those must be from the key.
1402  compressed = cur->read_tag(false);
1403  PrefixCompressedStringItor spell_in(cur->current_tag);
1404  string new_tag;
1405  PrefixCompressedStringWriter spell_out(new_tag, key);
1406  while (!spell_in.at_end()) {
1407  spell_out.append(*spell_in);
1408  ++spell_in;
1409  }
1410  cur->current_tag = std::move(new_tag);
1411  break;
1412  }
1413  default:
1414  compressed = cur->read_tag(true);
1415  break;
1416  }
1417  out->add(key, cur->current_tag, compressed);
1418  if (cur->next()) {
1419  pq.push(cur);
1420  } else {
1421  delete cur;
1422  }
1423  continue;
1424  }
1425 
1426  // Merge tag values with the same key:
1427  string tag;
1428  if (static_cast<unsigned char>(key[0]) < Honey::KEY_PREFIX_WORD) {
1429  // We just want the union of words, so copy over the first instance
1430  // and skip any identical ones.
1431  priority_queue<PrefixCompressedStringItor*,
1432  vector<PrefixCompressedStringItor*>,
1434  // Stick all the cursor_type pointers in a vector because their
1435  // current_tag members must remain valid while we're merging their
1436  // tags, but we need to call next() on them all afterwards.
1437  vector<cursor_type*> vec;
1438  vec.reserve(pq.size());
1439 
1440  while (true) {
1441  cur->read_tag();
1442  pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
1443  vec.push_back(cur);
1444  if (pq.empty() || pq.top()->current_key != key) break;
1445  cur = pq.top();
1446  pq.pop();
1447  }
1448 
1449  PrefixCompressedStringWriter wr(tag, key);
1450  string lastword;
1451  while (!pqtag.empty()) {
1452  PrefixCompressedStringItor* it = pqtag.top();
1453  pqtag.pop();
1454  string word = **it;
1455  if (word != lastword) {
1456  lastword = word;
1457  wr.append(lastword);
1458  }
1459  ++*it;
1460  if (!it->at_end()) {
1461  pqtag.push(it);
1462  } else {
1463  delete it;
1464  }
1465  }
1466 
1467  for (auto i : vec) {
1468  cur = i;
1469  if (cur->next()) {
1470  pq.push(cur);
1471  } else {
1472  delete cur;
1473  }
1474  }
1475  } else {
1476  // We want to sum the frequencies from tags for the same key.
1477  Xapian::termcount tot_freq = 0;
1478  while (true) {
1479  cur->read_tag();
1480  Xapian::termcount freq;
1481  const char* p = cur->current_tag.data();
1482  const char* end = p + cur->current_tag.size();
1483  if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
1484  throw_database_corrupt("Bad spelling word freq", p);
1485  }
1486  tot_freq += freq;
1487  if (cur->next()) {
1488  pq.push(cur);
1489  } else {
1490  delete cur;
1491  }
1492  if (pq.empty() || pq.top()->current_key != key) break;
1493  cur = pq.top();
1494  pq.pop();
1495  }
1496  tag.resize(0);
1497  pack_uint_last(tag, tot_freq);
1498  }
1499  out->add(key, tag);
1500  }
1501 }
1502 #endif
1503 
1504 static void
1506  vector<const HoneyTable*>::const_iterator b,
1507  vector<const HoneyTable*>::const_iterator e)
1508 {
1509  typedef MergeCursor<const HoneyTable&> cursor_type;
1510  typedef CursorGt<cursor_type> gt_type;
1511  priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1512  for ( ; b != e; ++b) {
1513  auto in = *b;
1514  auto cursor = new cursor_type(in);
1515  if (cursor->next()) {
1516  pq.push(cursor);
1517  } else {
1518  // Skip empty tables.
1519  delete cursor;
1520  }
1521  }
1522 
1523  while (!pq.empty()) {
1524  cursor_type* cur = pq.top();
1525  pq.pop();
1526 
1527  string key = cur->current_key;
1528  if (pq.empty() || pq.top()->current_key > key) {
1529  // No need to merge the tags, just copy the (possibly compressed)
1530  // tag value.
1531  bool compressed = cur->read_tag(true);
1532  out->add(key, cur->current_tag, compressed);
1533  if (cur->next()) {
1534  pq.push(cur);
1535  } else {
1536  delete cur;
1537  }
1538  continue;
1539  }
1540 
1541  // Merge tag values with the same key:
1542  string tag;
1543  if (static_cast<unsigned char>(key[0]) < Honey::KEY_PREFIX_WORD) {
1544  // We just want the union of words, so copy over the first instance
1545  // and skip any identical ones.
1546  priority_queue<PrefixCompressedStringItor*,
1547  vector<PrefixCompressedStringItor*>,
1549  // Stick all the cursor_type pointers in a vector because their
1550  // current_tag members must remain valid while we're merging their
1551  // tags, but we need to call next() on them all afterwards.
1552  vector<cursor_type*> vec;
1553  vec.reserve(pq.size());
1554 
1555  while (true) {
1556  cur->read_tag();
1557  pqtag.push(new PrefixCompressedStringItor(cur->current_tag,
1558  key));
1559  vec.push_back(cur);
1560  if (pq.empty() || pq.top()->current_key != key) break;
1561  cur = pq.top();
1562  pq.pop();
1563  }
1564 
1565  PrefixCompressedStringWriter wr(tag, key);
1566  string lastword;
1567  while (!pqtag.empty()) {
1568  PrefixCompressedStringItor* it = pqtag.top();
1569  pqtag.pop();
1570  string word = **it;
1571  if (word != lastword) {
1572  lastword = word;
1573  wr.append(lastword);
1574  }
1575  ++*it;
1576  if (!it->at_end()) {
1577  pqtag.push(it);
1578  } else {
1579  delete it;
1580  }
1581  }
1582 
1583  for (auto i : vec) {
1584  cur = i;
1585  if (cur->next()) {
1586  pq.push(cur);
1587  } else {
1588  delete cur;
1589  }
1590  }
1591  } else {
1592  // We want to sum the frequencies from tags for the same key.
1593  Xapian::termcount tot_freq = 0;
1594  while (true) {
1595  cur->read_tag();
1596  Xapian::termcount freq;
1597  const char* p = cur->current_tag.data();
1598  const char* end = p + cur->current_tag.size();
1599  if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
1600  throw_database_corrupt("Bad spelling word freq", p);
1601  }
1602  tot_freq += freq;
1603  if (cur->next()) {
1604  pq.push(cur);
1605  } else {
1606  delete cur;
1607  }
1608  if (pq.empty() || pq.top()->current_key != key) break;
1609  cur = pq.top();
1610  pq.pop();
1611  }
1612  tag.resize(0);
1613  pack_uint_last(tag, tot_freq);
1614  }
1615  out->add(key, tag);
1616  }
1617 }
1618 
1619 // U : vector<HoneyTable*>::const_iterator
1620 template<typename T, typename U> void
1621 merge_synonyms(T* out, U b, U e)
1622 {
1623  typedef decltype(**b) table_type; // E.g. HoneyTable
1624  typedef MergeCursor<table_type> cursor_type;
1625  typedef CursorGt<cursor_type> gt_type;
1626  priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1627  for ( ; b != e; ++b) {
1628  auto in = *b;
1629  auto cursor = new cursor_type(in);
1630  if (cursor->next()) {
1631  pq.push(cursor);
1632  } else {
1633  // Skip empty tables.
1634  delete cursor;
1635  }
1636  }
1637 
1638  while (!pq.empty()) {
1639  cursor_type* cur = pq.top();
1640  pq.pop();
1641 
1642  string key = cur->current_key;
1643  if (pq.empty() || pq.top()->current_key > key) {
1644  // No need to merge the tags, just copy the (possibly compressed)
1645  // tag value.
1646  bool compressed = cur->read_tag(true);
1647  out->add(key, cur->current_tag, compressed);
1648  if (cur->next()) {
1649  pq.push(cur);
1650  } else {
1651  delete cur;
1652  }
1653  continue;
1654  }
1655 
1656  // Merge tag values with the same key:
1657  string tag;
1658 
1659  // We just want the union of words, so copy over the first instance
1660  // and skip any identical ones.
1661  priority_queue<ByteLengthPrefixedStringItor*,
1662  vector<ByteLengthPrefixedStringItor*>,
1664  vector<cursor_type*> vec;
1665 
1666  while (true) {
1667  cur->read_tag();
1668  pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
1669  vec.push_back(cur);
1670  if (pq.empty() || pq.top()->current_key != key) break;
1671  cur = pq.top();
1672  pq.pop();
1673  }
1674 
1675  string_view lastword;
1676  while (!pqtag.empty()) {
1677  ByteLengthPrefixedStringItor* it = pqtag.top();
1678  pqtag.pop();
1679  string_view word = **it;
1680  if (word != lastword) {
1681  tag += uint8_t(word.size() ^ MAGIC_XOR_VALUE);
1682  tag += word;
1683  lastword = word;
1684  }
1685  ++*it;
1686  if (!it->at_end()) {
1687  pqtag.push(it);
1688  } else {
1689  delete it;
1690  }
1691  }
1692 
1693  for (auto i : vec) {
1694  cur = i;
1695  if (cur->next()) {
1696  pq.push(cur);
1697  } else {
1698  delete cur;
1699  }
1700  }
1701 
1702  out->add(key, tag);
1703  }
1704 }
1705 
1706 template<typename T, typename U> void
1708  T* out, const char* tmpdir,
1709  const vector<U*>& in,
1710  vector<Xapian::docid> off)
1711 {
1712  if (in.size() <= 3) {
1713  merge_postlists(compactor, out, off.begin(), in.begin(), in.end());
1714  return;
1715  }
1716  unsigned int c = 0;
1717  vector<HoneyTable*> tmp;
1718  tmp.reserve(in.size() / 2);
1719  {
1720  vector<Xapian::docid> newoff;
1721  newoff.resize(in.size() / 2);
1722  for (unsigned int i = 0, j; i < in.size(); i = j) {
1723  j = i + 2;
1724  if (j == in.size() - 1) ++j;
1725 
1726  string dest = tmpdir;
1727  dest += "/tmp";
1728  dest += str(c);
1729  dest += '_';
1730  dest += str(i / 2);
1731  dest += '.';
1732 
1733  HoneyTable* tmptab = new HoneyTable("postlist", dest, false);
1734 
1735  // Don't compress entries in temporary tables, even if the final
1736  // table would do so. Any already compressed entries will get
1737  // copied in compressed form.
1738  Honey::RootInfo root_info;
1739  root_info.init(0);
1740  const int flags = Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
1741  tmptab->create_and_open(flags, root_info);
1742 
1743  merge_postlists(compactor, tmptab, off.begin() + i,
1744  in.begin() + i, in.begin() + j);
1745  tmp.push_back(tmptab);
1746  tmptab->flush_db();
1747  tmptab->commit(1, &root_info);
1748  }
1749  swap(off, newoff);
1750  ++c;
1751  }
1752 
1753  while (tmp.size() > 3) {
1754  vector<HoneyTable*> tmpout;
1755  tmpout.reserve(tmp.size() / 2);
1756  vector<Xapian::docid> newoff;
1757  newoff.resize(tmp.size() / 2);
1758  for (unsigned int i = 0, j; i < tmp.size(); i = j) {
1759  j = i + 2;
1760  if (j == tmp.size() - 1) ++j;
1761 
1762  string dest = tmpdir;
1763  dest += "/tmp";
1764  dest += str(c);
1765  dest += '_';
1766  dest += str(i / 2);
1767  dest += '.';
1768 
1769  HoneyTable* tmptab = new HoneyTable("postlist", dest, false);
1770 
1771  // Don't compress entries in temporary tables, even if the final
1772  // table would do so. Any already compressed entries will get
1773  // copied in compressed form.
1774  Honey::RootInfo root_info;
1775  root_info.init(0);
1776  const int flags = Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
1777  tmptab->create_and_open(flags, root_info);
1778 
1779  merge_postlists(compactor, tmptab, off.begin() + i,
1780  tmp.begin() + i, tmp.begin() + j);
1781  if (c > 0) {
1782  for (unsigned int k = i; k < j; ++k) {
1783  // FIXME: unlink(tmp[k]->get_path().c_str());
1784  delete tmp[k];
1785  tmp[k] = NULL;
1786  }
1787  }
1788  tmpout.push_back(tmptab);
1789  tmptab->flush_db();
1790  tmptab->commit(1, &root_info);
1791  }
1792  swap(tmp, tmpout);
1793  swap(off, newoff);
1794  ++c;
1795  }
1796  merge_postlists(compactor, out, off.begin(), tmp.begin(), tmp.end());
1797  if (c > 0) {
1798  for (size_t k = 0; k < tmp.size(); ++k) {
1799  // FIXME: unlink(tmp[k]->get_path().c_str());
1800  delete tmp[k];
1801  tmp[k] = NULL;
1802  }
1803  }
1804 }
1805 
1806 template<typename T> class PositionCursor;
1807 
1808 #ifdef XAPIAN_HAS_GLASS_BACKEND
1809 template<>
1810 class PositionCursor<const GlassTable&> : private GlassCursor {
1812 
1813  public:
1814  string key;
1816 
1818  : GlassCursor(in), offset(offset_), firstdid(0) {
1819  rewind();
1820  }
1821 
1822  bool next() {
1823  if (!GlassCursor::next()) return false;
1824  read_tag();
1825  const char* d = current_key.data();
1826  const char* e = d + current_key.size();
1827  string term;
1828  Xapian::docid did;
1829  if (!unpack_string_preserving_sort(&d, e, term) ||
1830  !unpack_uint_preserving_sort(&d, e, &did) ||
1831  d != e) {
1832  throw Xapian::DatabaseCorruptError("Bad position key");
1833  }
1834 
1835  key.resize(0);
1837  pack_uint_preserving_sort(key, did + offset);
1838  return true;
1839  }
1840 
1841  const string& get_tag() const {
1842  return current_tag;
1843  }
1844 };
1845 #endif
1846 
1847 template<>
1848 class PositionCursor<const HoneyTable&> : private HoneyCursor {
1850 
1851  public:
1852  string key;
1854 
1856  : HoneyCursor(in), offset(offset_), firstdid(0) {
1857  rewind();
1858  }
1859 
1860  bool next() {
1861  if (!HoneyCursor::next()) return false;
1862  read_tag();
1863  const char* d = current_key.data();
1864  const char* e = d + current_key.size();
1865  string term;
1866  Xapian::docid did;
1867  if (!unpack_string_preserving_sort(&d, e, term) ||
1868  !unpack_uint_preserving_sort(&d, e, &did) ||
1869  d != e) {
1870  throw Xapian::DatabaseCorruptError("Bad position key");
1871  }
1872 
1873  key.resize(0);
1875  pack_uint_preserving_sort(key, did + offset);
1876  return true;
1877  }
1878 
1879  const string& get_tag() const {
1880  return current_tag;
1881  }
1882 };
1883 
/// Compare PositionCursor objects by key for use in a priority queue.
template<typename T>
class PositionCursorGt {
  public:
    /// Return true iff a's key sorts strictly after b's key.
    bool operator()(const T* a, const T* b) const {
	return a->key > b->key;
    }
};
1893 
1894 template<typename T, typename U> void
1895 merge_positions(T* out, const vector<U*>& inputs,
1896  const vector<Xapian::docid>& offset)
1897 {
1898  typedef decltype(*inputs[0]) table_type; // E.g. HoneyTable
1899  typedef PositionCursor<table_type> cursor_type;
1900  typedef PositionCursorGt<cursor_type> gt_type;
1901  priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1902  for (size_t i = 0; i < inputs.size(); ++i) {
1903  auto in = inputs[i];
1904  auto cursor = new cursor_type(in, offset[i]);
1905  if (cursor->next()) {
1906  pq.push(cursor);
1907  } else {
1908  // Skip empty tables.
1909  delete cursor;
1910  }
1911  }
1912 
1913  while (!pq.empty()) {
1914  cursor_type* cur = pq.top();
1915  pq.pop();
1916  out->add(cur->key, cur->get_tag());
1917  if (cur->next()) {
1918  pq.push(cur);
1919  } else {
1920  delete cur;
1921  }
1922  }
1923 }
1924 
1925 template<typename T, typename U> void
1926 merge_docid_keyed(T* out, const vector<U*>& inputs,
1927  const vector<Xapian::docid>& offset,
1928  int = 0)
1929 {
1930  for (size_t i = 0; i < inputs.size(); ++i) {
1931  Xapian::docid off = offset[i];
1932 
1933  auto in = inputs[i];
1934  HoneyCursor cur(in);
1935  cur.rewind();
1936 
1937  string key;
1938  while (cur.next()) {
1939  // Adjust the key if this isn't the first database.
1940  if (off) {
1941  Xapian::docid did;
1942  const char* d = cur.current_key.data();
1943  const char* e = d + cur.current_key.size();
1944  if (!unpack_uint_preserving_sort(&d, e, &did)) {
1945  string msg = "Bad key in ";
1946  msg += inputs[i]->get_path();
1947  throw Xapian::DatabaseCorruptError(msg);
1948  }
1949  UNSIGNED_OVERFLOW_OK(did += off);
1950  key.resize(0);
1951  pack_uint_preserving_sort(key, did);
1952  if (d != e) {
1953  // Copy over anything extra in the key (e.g. the zero byte
1954  // at the end of "used value slots" in the termlist table).
1955  key.append(d, e - d);
1956  }
1957  } else {
1958  key = cur.current_key;
1959  }
1960  bool compressed = cur.read_tag(true);
1961  out->add(key, cur.current_tag, compressed);
1962  }
1963  }
1964 }
1965 
1966 #ifdef XAPIAN_HAS_GLASS_BACKEND
1967 template<typename T> void
1968 merge_docid_keyed(T* out, const vector<const GlassTable*>& inputs,
1969  const vector<Xapian::docid>& offset,
1970  Xapian::termcount& ut_lb, Xapian::termcount& ut_ub,
1971  int table_type = 0)
1972 {
1973  for (size_t i = 0; i < inputs.size(); ++i) {
1974  Xapian::docid off = offset[i];
1975 
1976  auto in = inputs[i];
1977  if (in->empty()) continue;
1978 
1979  GlassCursor cur(in);
1980  cur.rewind();
1981 
1982  string key;
1983  while (cur.next()) {
1984 next_without_next:
1985  // Adjust the key if this isn't the first database.
1986  if (off) {
1987  Xapian::docid did;
1988  const char* d = cur.current_key.data();
1989  const char* e = d + cur.current_key.size();
1990  if (!unpack_uint_preserving_sort(&d, e, &did)) {
1991  string msg = "Bad key in ";
1992  msg += inputs[i]->get_path();
1993  throw Xapian::DatabaseCorruptError(msg);
1994  }
1995  did += off;
1996  key.resize(0);
1997  pack_uint_preserving_sort(key, did);
1998  if (d != e) {
1999  // Copy over anything extra in the key (e.g. the zero byte
2000  // at the end of "used value slots" in the termlist table).
2001  key.append(d, e - d);
2002  }
2003  } else {
2004  key = cur.current_key;
2005  }
2006  if (table_type == Honey::TERMLIST) {
2007  if (termlist_key_is_values_used(key)) {
2008  throw Xapian::DatabaseCorruptError("value slots used "
2009  "entry without a "
2010  "corresponding "
2011  "termlist entry");
2012  }
2013  cur.read_tag();
2014  string tag = cur.current_tag;
2015 
2016  bool next_result = cur.next();
2017  bool next_already_done = true;
2018  unsigned bitmap_slots_used = 0;
2019  string encoded_slots_used;
2020  if (next_result &&
2022  next_already_done = false;
2023  cur.read_tag();
2024  const string& valtag = cur.current_tag;
2025 
2026  const char* p = valtag.data();
2027  const char* end = p + valtag.size();
2028 
2030 
2031  Xapian::valueno first_slot;
2032  if (!unpack_uint(&p, end, &first_slot)) {
2033  throw_database_corrupt("Value slot encoding corrupt",
2034  p);
2035  }
2036  slots.push_back(first_slot);
2037  Xapian::valueno last_slot = first_slot;
2038  while (p != end) {
2039  Xapian::valueno slot;
2040  if (!unpack_uint(&p, end, &slot)) {
2041  throw_database_corrupt("Value slot encoding "
2042  "corrupt", p);
2043  }
2044  slot += last_slot + 1;
2045  last_slot = slot;
2046 
2047  slots.push_back(slot);
2048  }
2049 
2050  if (slots.back() <= 6) {
2051  // Encode as a bitmap if only slots in the range 0-6
2052  // are used.
2053  for (auto slot : slots) {
2054  bitmap_slots_used |= 1 << slot;
2055  }
2056  } else {
2057  string enc;
2058  pack_uint(enc, last_slot);
2059  if (slots.size() > 1) {
2060  BitWriter slots_used(enc);
2061  slots_used.encode(first_slot, last_slot);
2062  slots_used.encode(slots.size() - 2,
2063  last_slot - first_slot);
2064  slots_used.encode_interpolative(slots, 0,
2065  slots.size() - 1);
2066  encoded_slots_used = slots_used.freeze();
2067  } else {
2068  encoded_slots_used = std::move(enc);
2069  }
2070  }
2071  }
2072 
2073  const char* pos = tag.data();
2074  const char* end = pos + tag.size();
2075 
2076  string newtag;
2077  if (encoded_slots_used.empty()) {
2078  newtag += char(bitmap_slots_used);
2079  } else {
2080  auto size = encoded_slots_used.size();
2081  if (size < 0x80) {
2082  newtag += char(0x80 | size);
2083  } else {
2084  newtag += '\x80';
2085  pack_uint(newtag, size);
2086  }
2087  newtag += encoded_slots_used;
2088  }
2089 
2090  if (pos != end) {
2091  Xapian::termcount doclen;
2092  if (!unpack_uint(&pos, end, &doclen)) {
2093  throw_database_corrupt("doclen", pos);
2094  }
2095  Xapian::termcount termlist_size;
2096  if (!unpack_uint(&pos, end, &termlist_size)) {
2097  throw_database_corrupt("termlist length", pos);
2098  }
2099 
2100  auto uniq_terms = min(termlist_size, doclen);
2101  if (uniq_terms &&
2102  (ut_lb == 0 || uniq_terms < ut_lb)) {
2103  ut_lb = uniq_terms;
2104  }
2105  if (uniq_terms > ut_ub)
2106  ut_ub = uniq_terms;
2107 
2108  pack_uint(newtag, termlist_size - 1);
2109  pack_uint(newtag, doclen);
2110 
2111  string current_term;
2112  while (pos != end) {
2113  Xapian::termcount current_wdf = 0;
2114 
2115  if (!current_term.empty()) {
2116  size_t reuse = static_cast<unsigned char>(*pos++);
2117  newtag += char(reuse);
2118 
2119  if (reuse > current_term.size()) {
2120  current_wdf = reuse / (current_term.size() + 1);
2121  reuse = reuse % (current_term.size() + 1);
2122  }
2123  current_term.resize(reuse);
2124 
2125  if (pos == end)
2126  throw_database_corrupt("term", NULL);
2127  }
2128 
2129  size_t append = static_cast<unsigned char>(*pos++);
2130  if (size_t(end - pos) < append)
2131  throw_database_corrupt("term", NULL);
2132 
2133  current_term.append(pos, append);
2134  pos += append;
2135 
2136  if (current_wdf) {
2137  --current_wdf;
2138  } else {
2139  if (!unpack_uint(&pos, end, &current_wdf)) {
2140  throw_database_corrupt("wdf", pos);
2141  }
2142  pack_uint(newtag, current_wdf);
2143  }
2144 
2145  newtag += char(append);
2146  newtag.append(current_term.end() - append,
2147  current_term.end());
2148  }
2149  }
2150  if (!newtag.empty())
2151  out->add(key, newtag);
2152  if (!next_result) break;
2153  if (next_already_done) goto next_without_next;
2154  } else {
2155  bool compressed = cur.read_tag(true);
2156  out->add(key, cur.current_tag, compressed);
2157  }
2158  }
2159  }
2160 }
2161 #endif
2162 
2163 }
2164 
2165 using namespace HoneyCompact;
2166 
2167 void
2169  const char* destdir,
2170  int fd,
2171  int source_backend,
2172  const vector<const Xapian::Database::Internal*>& sources,
2173  const vector<Xapian::docid>& offset,
2175  unsigned flags,
2176  Xapian::docid last_docid)
2177 {
2178  // Currently unused for honey.
2179  (void)compaction;
2180 
2181  struct table_list {
2182  // The "base name" of the table.
2183  char name[9];
2184  // The type.
2185  Honey::table_type type;
2186  // Create tables after position lazily.
2187  bool lazy;
2188  };
2189 
2190  static const table_list tables[] = {
2191  // name type lazy
2192  { "postlist", Honey::POSTLIST, false },
2193  { "docdata", Honey::DOCDATA, true },
2194  { "termlist", Honey::TERMLIST, false },
2195  { "position", Honey::POSITION, true },
2196  { "spelling", Honey::SPELLING, true },
2197  { "synonym", Honey::SYNONYM, true }
2198  };
2199 
2200  const int FLAGS = Xapian::DB_DANGEROUS;
2201 
2202  bool single_file = (flags & Xapian::DBCOMPACT_SINGLE_FILE);
2203  bool multipass = (flags & Xapian::DBCOMPACT_MULTIPASS);
2204  if (single_file) {
2205  // FIXME: Support this combination - we need to put temporary files
2206  // somewhere.
2207  multipass = false;
2208  }
2209 
2210  if (single_file) {
2211  for (size_t i = 0; i != sources.size(); ++i) {
2212  bool has_uncommitted_changes;
2213  if (source_backend == Xapian::DB_BACKEND_GLASS) {
2214 #ifdef XAPIAN_HAS_GLASS_BACKEND
2215  auto db = static_cast<const GlassDatabase*>(sources[i]);
2216  has_uncommitted_changes = db->has_uncommitted_changes();
2217 #else
2218  throw Xapian::FeatureUnavailableError("Glass backend disabled");
2219 #endif
2220  } else {
2221  auto db = static_cast<const HoneyDatabase*>(sources[i]);
2222  has_uncommitted_changes = db->has_uncommitted_changes();
2223  }
2224  if (has_uncommitted_changes) {
2225  const char* m =
2226  "Can't compact from a WritableDatabase with uncommitted "
2227  "changes - either call commit() first, or create a new "
2228  "Database object from the filename on disk";
2230  }
2231  }
2232  }
2233 
2234  FlintLock lock(destdir ? destdir : "");
2235  if (!single_file) {
2236  string explanation;
2237  FlintLock::reason why = lock.lock(true, false, explanation);
2238  if (why != FlintLock::SUCCESS) {
2239  lock.throw_databaselockerror(why, destdir, explanation);
2240  }
2241  }
2242 
2243  unique_ptr<HoneyVersion> version_file_out;
2244  if (single_file) {
2245  if (destdir) {
2246  fd = open(destdir, O_RDWR|O_CREAT|O_TRUNC|O_BINARY|O_CLOEXEC, 0666);
2247  if (fd < 0) {
2248  throw Xapian::DatabaseCreateError("open() failed", errno);
2249  }
2250  }
2251  version_file_out.reset(new HoneyVersion(fd));
2252  } else {
2253  fd = -1;
2254  version_file_out.reset(new HoneyVersion(destdir));
2255  }
2256 
2257  // Set to true if stat() failed (which can happen if the files are > 2GB
2258  // and off_t is 32 bit) or one of the totals overflowed.
2259  bool bad_totals = false;
2260  file_size_type in_total = 0, out_total = 0;
2261 
2262  version_file_out->create();
2263  for (size_t i = 0; i != sources.size(); ++i) {
2264  bool source_single_file = false;
2265  if (source_backend == Xapian::DB_BACKEND_GLASS) {
2266 #ifdef XAPIAN_HAS_GLASS_BACKEND
2267  auto db = static_cast<const GlassDatabase*>(sources[i]);
2268  auto& v_in = db->version_file;
2269  auto& v_out = version_file_out;
2270  // Glass backend doesn't track unique term bounds, hence setting
2271  // them to 0. We calculate the unique term bounds as we convert the
2272  // termlist table and fill in the correct values before we write out
2273  // version_file_out.
2274  v_out->merge_stats(v_in.get_doccount(),
2275  v_in.get_doclength_lower_bound(),
2276  v_in.get_doclength_upper_bound(),
2277  v_in.get_wdf_upper_bound(),
2278  v_in.get_total_doclen(),
2279  v_in.get_spelling_wordfreq_upper_bound(),
2280  0,
2281  0);
2282  source_single_file = db->single_file();
2283 #else
2284  Assert(false);
2285 #endif
2286  } else {
2287  auto db = static_cast<const HoneyDatabase*>(sources[i]);
2288  version_file_out->merge_stats(db->version_file);
2289  source_single_file = db->single_file();
2290  }
2291  if (source_single_file) {
2292  // Add single file input DB sizes to in_total here. For other
2293  // databases, we sum per-table below.
2294  string path;
2295  sources[i]->get_backend_info(&path);
2296  auto db_size = file_size(path);
2297  if (errno == 0) {
2298  if (add_overflows(in_total, db_size, in_total)) {
2299  bad_totals = true;
2300  }
2301  } else {
2302  bad_totals = true;
2303  }
2304  }
2305  }
2306 
2307  string fl_serialised;
2308 #if 0
2309  if (single_file) {
2310  HoneyFreeList fl;
2311  fl.set_first_unused_block(1); // FIXME: Assumption?
2312  fl.pack(fl_serialised);
2313  }
2314 #endif
2315 
2316  // FIXME: sort out indentation.
2317 if (source_backend == Xapian::DB_BACKEND_GLASS) {
2318 #ifndef XAPIAN_HAS_GLASS_BACKEND
2319  throw Xapian::FeatureUnavailableError("Glass backend disabled");
2320 #else
2321  vector<HoneyTable*> tabs;
2322  tabs.reserve(std::end(tables) - std::begin(tables));
2323  file_size_type prev_size = 0;
2324  for (const auto& t : tables) {
2325  // The postlist table requires an N-way merge, adjusting the
2326  // headers of various blocks. The spelling and synonym tables also
2327  // need special handling. The other tables have keys sorted in
2328  // docid order, so we can merge them by simply copying all the keys
2329  // from each source table in turn.
2330  if (compactor)
2331  compactor->set_status(t.name, string());
2332 
2333  string dest;
2334  if (!single_file) {
2335  dest = destdir;
2336  dest += '/';
2337  dest += t.name;
2338  dest += '.';
2339  }
2340 
2341  bool output_will_exist = !t.lazy;
2342 
2343  // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2344  // on certain systems).
2345  bool bad_stat = false;
2346 
2347  // We can't currently report input sizes if there's a single file DB
2348  // amongst the inputs.
2349  bool single_file_in = false;
2350 
2351  file_size_type in_size = 0;
2352 
2353  vector<const GlassTable*> inputs;
2354  inputs.reserve(sources.size());
2355  size_t inputs_present = 0;
2356  for (auto src : sources) {
2357  auto db = static_cast<const GlassDatabase*>(src);
2358  const GlassTable* table;
2359  switch (t.type) {
2360  case Honey::POSTLIST:
2361  table = &(db->postlist_table);
2362  break;
2363  case Honey::DOCDATA:
2364  table = &(db->docdata_table);
2365  break;
2366  case Honey::TERMLIST:
2367  table = &(db->termlist_table);
2368  break;
2369  case Honey::POSITION:
2370  table = &(db->position_table);
2371  break;
2372  case Honey::SPELLING:
2373  table = &(db->spelling_table);
2374  break;
2375  case Honey::SYNONYM:
2376  table = &(db->synonym_table);
2377  break;
2378  default:
2379  Assert(false);
2380  return;
2381  }
2382 
2383  if (db->single_file()) {
2384  if (t.lazy && !GlassCursor(table).next()) {
2385  // Lazy table with no entries, so essentially doesn't
2386  // exist.
2387  } else {
2388  // FIXME: Find actual size somehow?
2389  // in_size += table->size() / 1024;
2390  single_file_in = true;
2391  output_will_exist = true;
2392  ++inputs_present;
2393  }
2394  } else {
2395  auto db_size = file_size(table->get_path());
2396  if (errno == 0) {
2397  if (add_overflows(in_total, db_size, in_total)) {
2398  bad_totals = true;
2399  }
2400  in_size += db_size / 1024;
2401  output_will_exist = true;
2402  ++inputs_present;
2403  } else if (errno != ENOENT) {
2404  // We get ENOENT for an optional table.
2405  bad_totals = bad_stat = true;
2406  output_will_exist = true;
2407  ++inputs_present;
2408  }
2409  }
2410  inputs.push_back(table);
2411  }
2412 
2413  // If any inputs lack a termlist table, suppress it in the output.
2414  if (t.type == Honey::TERMLIST && inputs_present != sources.size()) {
2415  if (inputs_present != 0) {
2416  if (compactor) {
2417  string m = str(inputs_present);
2418  m += " of ";
2419  m += str(sources.size());
2420  m += " inputs present, so suppressing output";
2421  compactor->set_status(t.name, m);
2422  }
2423  continue;
2424  }
2425  output_will_exist = false;
2426  }
2427 
2428  if (!output_will_exist) {
2429  if (compactor)
2430  compactor->set_status(t.name, "doesn't exist");
2431  continue;
2432  }
2433 
2434  HoneyTable* out;
2435  off_t table_start_offset = -1;
2436  if (single_file) {
2437  if (&t == tables) {
2438  // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2439  // space for version file. It's tricky to exactly know the
2440  // size of the version file beforehand.
2441  table_start_offset = lseek(fd, HONEY_VERSION_MAX_SIZE, SEEK_CUR);
2442  if (table_start_offset < 0)
2443  throw Xapian::DatabaseError("lseek() failed", errno);
2444  } else {
2445  table_start_offset = lseek(fd, 0, SEEK_CUR);
2446  }
2447  out = new HoneyTable(t.name, fd, version_file_out->get_offset(),
2448  false, false);
2449  } else {
2450  out = new HoneyTable(t.name, dest, false, t.lazy);
2451  }
2452  tabs.push_back(out);
2453  Honey::RootInfo* root_info = version_file_out->root_to_set(t.type);
2454  if (single_file) {
2455  root_info->set_free_list(fl_serialised);
2456  root_info->set_offset(table_start_offset);
2457  out->open(FLAGS,
2458  version_file_out->get_root(t.type),
2459  version_file_out->get_revision());
2460  } else {
2461  out->create_and_open(FLAGS, *root_info);
2462  }
2463 
2464  switch (t.type) {
2465  case Honey::POSTLIST: {
2466  if (multipass && inputs.size() > 3) {
2467  multimerge_postlists(compactor, out, destdir,
2468  inputs, offset);
2469  } else {
2470  merge_postlists(compactor, out, offset.begin(),
2471  inputs.begin(), inputs.end());
2472  }
2473  break;
2474  }
2475  case Honey::SPELLING:
2476  merge_spellings(out, inputs.cbegin(), inputs.cend());
2477  break;
2478  case Honey::SYNONYM:
2479  merge_synonyms(out, inputs.begin(), inputs.end());
2480  break;
2481  case Honey::POSITION:
2482  merge_positions(out, inputs, offset);
2483  break;
2484  default: {
2485  // DocData, Termlist
2486  auto& v_out = version_file_out;
2487  auto ut_lb = v_out->get_unique_terms_lower_bound();
2488  auto ut_ub = v_out->get_unique_terms_upper_bound();
2489  merge_docid_keyed(out, inputs, offset, ut_lb, ut_ub, t.type);
2490  version_file_out->set_unique_terms_lower_bound(ut_lb);
2491  version_file_out->set_unique_terms_upper_bound(ut_ub);
2492  break;
2493  }
2494  }
2495 
2496  // Commit as revision 1.
2497  out->flush_db();
2498  out->commit(1, root_info);
2499  out->sync();
2500  if (single_file) fl_serialised = root_info->get_free_list();
2501 
2502  file_size_type out_size = 0;
2503  if (!bad_stat && !single_file_in) {
2504  file_size_type db_size;
2505  if (single_file) {
2506  db_size = file_size(fd);
2507  } else {
2508  db_size = file_size(dest + HONEY_TABLE_EXTENSION);
2509  }
2510  if (errno == 0) {
2511  if (single_file) {
2512  auto old_prev_size = prev_size;
2513  prev_size = db_size;
2514  db_size -= old_prev_size;
2515  }
2516  if (add_overflows(out_total, db_size, out_total)) {
2517  bad_totals = true;
2518  }
2519  out_size = db_size / 1024;
2520  } else if (errno != ENOENT) {
2521  bad_totals = bad_stat = true;
2522  }
2523  }
2524  if (bad_stat) {
2525  if (compactor)
2526  compactor->set_status(t.name,
2527  "Done (couldn't stat all the DB files)");
2528  } else if (single_file_in) {
2529  if (compactor)
2530  compactor->set_status(t.name,
2531  "Done (table sizes unknown for single "
2532  "file DB input)");
2533  } else {
2534  string status;
2535  if (out_size == in_size) {
2536  status = "Size unchanged (";
2537  } else {
2538  file_size_type delta;
2539  if (out_size < in_size) {
2540  delta = in_size - out_size;
2541  status = "Reduced by ";
2542  } else {
2543  delta = out_size - in_size;
2544  status = "INCREASED by ";
2545  }
2546  if (in_size) {
2547  status += str(100 * delta / in_size);
2548  status += "% ";
2549  }
2550  status += str(delta);
2551  status += "K (";
2552  status += str(in_size);
2553  status += "K -> ";
2554  }
2555  status += str(out_size);
2556  status += "K)";
2557  if (compactor)
2558  compactor->set_status(t.name, status);
2559  }
2560  }
2561 
2562  // If compacting to a single file output and all the tables are empty, pad
2563  // the output so that it isn't mistaken for a stub database when we try to
2564  // open it. For this it needs to at least HONEY_MIN_DB_SIZE in size.
2565  if (single_file && prev_size < HONEY_MIN_DB_SIZE) {
2566  out_total = HONEY_MIN_DB_SIZE;
2567 #ifdef HAVE_FTRUNCATE
2568  if (ftruncate(fd, HONEY_MIN_DB_SIZE) < 0) {
2569  throw Xapian::DatabaseError("Failed to set size of output "
2570  "database", errno);
2571  }
2572 #else
2573  const off_t off = HONEY_MIN_DB_SIZE - 1;
2574  if (lseek(fd, off, SEEK_SET) != off || write(fd, "", 1) != 1) {
2575  throw Xapian::DatabaseError("Failed to set size of output "
2576  "database", errno);
2577  }
2578 #endif
2579  }
2580 
2581  if (single_file) {
2582  if (lseek(fd, version_file_out->get_offset(), SEEK_SET) == -1) {
2583  throw Xapian::DatabaseError("lseek() failed", errno);
2584  }
2585  }
2586  version_file_out->set_last_docid(last_docid);
2587  string tmpfile = version_file_out->write(1, FLAGS);
2588  if (single_file) {
2589  off_t version_file_size = lseek(fd, 0, SEEK_CUR);
2590  if (version_file_size < 0) {
2591  throw Xapian::DatabaseError("lseek() failed", errno);
2592  }
2593  if (version_file_size > HONEY_VERSION_MAX_SIZE) {
2594  throw Xapian::DatabaseError("Didn't allow enough space for "
2595  "version file data");
2596  }
2597  }
2598  for (unsigned j = 0; j != tabs.size(); ++j) {
2599  tabs[j]->sync();
2600  }
2601  // Commit with revision 1.
2602  version_file_out->sync(tmpfile, 1, FLAGS);
2603  for (unsigned j = 0; j != tabs.size(); ++j) {
2604  delete tabs[j];
2605  }
2606 #endif
2607 } else {
2608  vector<HoneyTable*> tabs;
2609  tabs.reserve(std::end(tables) - std::begin(tables));
2610  file_size_type prev_size = HONEY_MIN_DB_SIZE;
2611  for (const auto& t : tables) {
2612  // The postlist table requires an N-way merge, adjusting the
2613  // headers of various blocks. The spelling and synonym tables also
2614  // need special handling. The other tables have keys sorted in
2615  // docid order, so we can merge them by simply copying all the keys
2616  // from each source table in turn.
2617  if (compactor)
2618  compactor->set_status(t.name, string());
2619 
2620  string dest;
2621  if (!single_file) {
2622  dest = destdir;
2623  dest += '/';
2624  dest += t.name;
2625  dest += '.';
2626  }
2627 
2628  bool output_will_exist = !t.lazy;
2629 
2630  // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
2631  // on certain systems).
2632  bool bad_stat = false;
2633 
2634  // We can't currently report input sizes if there's a single file DB
2635  // amongst the inputs.
2636  bool single_file_in = false;
2637 
2638  file_size_type in_size = 0;
2639 
2640  vector<const HoneyTable*> inputs;
2641  inputs.reserve(sources.size());
2642  size_t inputs_present = 0;
2643  for (auto src : sources) {
2644  auto db = static_cast<const HoneyDatabase*>(src);
2645  const HoneyTable* table;
2646  switch (t.type) {
2647  case Honey::POSTLIST:
2648  table = &(db->postlist_table);
2649  break;
2650  case Honey::DOCDATA:
2651  table = &(db->docdata_table);
2652  break;
2653  case Honey::TERMLIST:
2654  table = &(db->termlist_table);
2655  break;
2656  case Honey::POSITION:
2657  table = &(db->position_table);
2658  break;
2659  case Honey::SPELLING:
2660  table = &(db->spelling_table);
2661  break;
2662  case Honey::SYNONYM:
2663  table = &(db->synonym_table);
2664  break;
2665  default:
2666  Assert(false);
2667  return;
2668  }
2669 
2670  if (db->single_file()) {
2671  if (t.lazy && !HoneyCursor(table).next()) {
2672  // Lazy table with no entries, so essentially doesn't
2673  // exist.
2674  } else {
2675  // FIXME: Find actual size somehow?
2676  // in_size += table->size() / 1024;
2677  single_file_in = true;
2678  output_will_exist = true;
2679  ++inputs_present;
2680  }
2681  } else {
2682  auto db_size = file_size(table->get_path());
2683  if (errno == 0) {
2684  if (add_overflows(in_total, db_size, in_total)) {
2685  bad_totals = true;
2686  }
2687  in_size += db_size / 1024;
2688  output_will_exist = true;
2689  ++inputs_present;
2690  } else if (errno != ENOENT) {
2691  // We get ENOENT for an optional table.
2692  bad_totals = bad_stat = true;
2693  output_will_exist = true;
2694  ++inputs_present;
2695  }
2696  }
2697  inputs.push_back(table);
2698  }
2699 
2700  // If any inputs lack a termlist table, suppress it in the output.
2701  if (t.type == Honey::TERMLIST && inputs_present != sources.size()) {
2702  if (inputs_present != 0) {
2703  if (compactor) {
2704  string m = str(inputs_present);
2705  m += " of ";
2706  m += str(sources.size());
2707  m += " inputs present, so suppressing output";
2708  compactor->set_status(t.name, m);
2709  }
2710  continue;
2711  }
2712  output_will_exist = false;
2713  }
2714 
2715  if (!output_will_exist) {
2716  if (compactor)
2717  compactor->set_status(t.name, "doesn't exist");
2718  continue;
2719  }
2720 
2721  HoneyTable* out;
2722  off_t table_start_offset = -1;
2723  if (single_file) {
2724  if (&t == tables) {
2725  // Start first table HONEY_VERSION_MAX_SIZE bytes in to allow
2726  // space for version file. It's tricky to exactly know the
2727  // size of the version file beforehand.
2728  table_start_offset = lseek(fd, HONEY_VERSION_MAX_SIZE, SEEK_CUR);
2729  if (table_start_offset < 0)
2730  throw Xapian::DatabaseError("lseek() failed", errno);
2731  } else {
2732  table_start_offset = lseek(fd, 0, SEEK_CUR);
2733  }
2734  out = new HoneyTable(t.name, fd, version_file_out->get_offset(),
2735  false, false);
2736  } else {
2737  out = new HoneyTable(t.name, dest, false, t.lazy);
2738  }
2739  tabs.push_back(out);
2740  Honey::RootInfo* root_info = version_file_out->root_to_set(t.type);
2741  if (single_file) {
2742  root_info->set_free_list(fl_serialised);
2743  root_info->set_offset(table_start_offset);
2744  out->open(FLAGS,
2745  version_file_out->get_root(t.type),
2746  version_file_out->get_revision());
2747  } else {
2748  out->create_and_open(FLAGS, *root_info);
2749  }
2750 
2751  switch (t.type) {
2752  case Honey::POSTLIST: {
2753  if (multipass && inputs.size() > 3) {
2754  multimerge_postlists(compactor, out, destdir,
2755  inputs, offset);
2756  } else {
2757  merge_postlists(compactor, out, offset.begin(),
2758  inputs.begin(), inputs.end());
2759  }
2760  break;
2761  }
2762  case Honey::SPELLING:
2763  merge_spellings(out, inputs.begin(), inputs.end());
2764  break;
2765  case Honey::SYNONYM:
2766  merge_synonyms(out, inputs.begin(), inputs.end());
2767  break;
2768  case Honey::POSITION:
2769  merge_positions(out, inputs, offset);
2770  break;
2771  default:
2772  // DocData, Termlist
2773  merge_docid_keyed(out, inputs, offset);
2774  break;
2775  }
2776 
2777  // Commit as revision 1.
2778  out->flush_db();
2779  out->commit(1, root_info);
2780  out->sync();
2781  if (single_file) fl_serialised = root_info->get_free_list();
2782 
2783  file_size_type out_size = 0;
2784  if (!bad_stat && !single_file_in) {
2785  file_size_type db_size;
2786  if (single_file) {
2787  db_size = file_size(fd);
2788  } else {
2789  db_size = file_size(dest + HONEY_TABLE_EXTENSION);
2790  }
2791  if (errno == 0) {
2792  if (single_file) {
2793  auto old_prev_size = prev_size;
2794  prev_size = db_size;
2795  db_size -= old_prev_size;
2796  }
2797  if (add_overflows(out_total, db_size, out_total)) {
2798  bad_totals = true;
2799  }
2800  out_size = db_size / 1024;
2801  } else if (errno != ENOENT) {
2802  bad_totals = bad_stat = true;
2803  }
2804  }
2805  if (bad_stat) {
2806  if (compactor)
2807  compactor->set_status(t.name,
2808  "Done (couldn't stat all the DB files)");
2809  } else if (single_file_in) {
2810  if (compactor)
2811  compactor->set_status(t.name,
2812  "Done (table sizes unknown for single "
2813  "file DB input)");
2814  } else {
2815  string status;
2816  if (out_size == in_size) {
2817  status = "Size unchanged (";
2818  } else {
2819  file_size_type delta;
2820  if (out_size < in_size) {
2821  delta = in_size - out_size;
2822  status = "Reduced by ";
2823  } else {
2824  delta = out_size - in_size;
2825  status = "INCREASED by ";
2826  }
2827  if (in_size) {
2828  status += str(100 * delta / in_size);
2829  status += "% ";
2830  }
2831  status += str(delta);
2832  status += "K (";
2833  status += str(in_size);
2834  status += "K -> ";
2835  }
2836  status += str(out_size);
2837  status += "K)";
2838  if (compactor)
2839  compactor->set_status(t.name, status);
2840  }
2841  }
2842 
2843  // If compacting to a single file output and all the tables are empty, pad
2844  // the output so that it isn't mistaken for a stub database when we try to
2845  // open it. For this it needs to at least HONEY_MIN_DB_SIZE in size.
2846  if (single_file && prev_size < HONEY_MIN_DB_SIZE) {
2847  out_total = HONEY_MIN_DB_SIZE;
2848 #ifdef HAVE_FTRUNCATE
2849  if (ftruncate(fd, HONEY_MIN_DB_SIZE) < 0) {
2850  throw Xapian::DatabaseError("Failed to set size of output "
2851  "database", errno);
2852  }
2853 #else
2854  const off_t off = HONEY_MIN_DB_SIZE - 1;
2855  if (lseek(fd, off, SEEK_SET) != off || write(fd, "", 1) != 1) {
2856  throw Xapian::DatabaseError("Failed to set size of output "
2857  "database", errno);
2858  }
2859 #endif
2860  }
2861 
2862  if (single_file) {
2863  if (lseek(fd, version_file_out->get_offset(), SEEK_SET) < 0) {
2864  throw Xapian::DatabaseError("lseek() failed", errno);
2865  }
2866  }
2867  version_file_out->set_last_docid(last_docid);
2868  string tmpfile = version_file_out->write(1, FLAGS);
2869  for (unsigned j = 0; j != tabs.size(); ++j) {
2870  tabs[j]->sync();
2871  }
2872  // Commit with revision 1.
2873  version_file_out->sync(tmpfile, 1, FLAGS);
2874  for (unsigned j = 0; j != tabs.size(); ++j) {
2875  delete tabs[j];
2876  }
2877 }
2878 
2879  if (!single_file) lock.release();
2880 
2881  if (!bad_totals && compactor) {
2882  string status;
2883  in_total /= 1024;
2884  out_total /= 1024;
2885  if (out_total == in_total) {
2886  status = "Size unchanged (";
2887  } else {
2888  off_t delta;
2889  if (out_total < in_total) {
2890  delta = in_total - out_total;
2891  status = "Reduced by ";
2892  } else {
2893  delta = out_total - in_total;
2894  status = "INCREASED by ";
2895  }
2896  if (in_total) {
2897  status += str(100 * delta / in_total);
2898  status += "% ";
2899  }
2900  status += str(delta);
2901  status += "K (";
2902  status += str(in_total);
2903  status += "K -> ";
2904  }
2905  status += str(out_total);
2906  status += "K)";
2907  compactor->set_status("Total", status);
2908  }
2909 }
#define MAGIC_XOR_VALUE
void release()
Release the lock.
Definition: flint_lock.cc:458
reason lock(bool exclusive, bool wait, std::string &explanation)
Attempt to obtain the lock.
Definition: flint_lock.cc:124
void throw_databaselockerror(FlintLock::reason why, const std::string &db_dir, const std::string &explanation) const
Throw Xapian::DatabaseLockError.
Definition: flint_lock.cc:494
A cursor pointing to a position in a Btree table, for reading several entries in order,...
Definition: glass_cursor.h:148
string current_key
Current key pointed to by cursor.
Definition: glass_cursor.h:239
bool read_tag(bool keep_compressed=false)
Read the tag from the table and store it in current_tag.
bool next()
Advance to the next key.
void rewind()
Position cursor on the dummy empty key.
Definition: glass_cursor.h:250
string current_tag
Current tag pointed to by cursor.
Definition: glass_cursor.h:244
A backend designed for efficient indexing and retrieval, using compressed posting lists and a btree s...
GlassVersion version_file
The file describing the Glass database.
virtual bool has_uncommitted_changes() const
Return true if there are uncommitted changes.
Class managing a Btree table in a Glass database.
Definition: glass_table.h:432
string get_path() const
Definition: glass_table.h:738
void merge_stats(const GlassVersion &o)
Merge the database stats.
Xapian::docid get_docid() const
Definition: glass_values.h:200
bool operator()(const T *a, const T *b) const
Return true if and only if a's key is strictly greater than b's key.
PositionCursor(const GlassTable *in, Xapian::docid offset_)
PositionCursor(const HoneyTable *in, Xapian::docid offset_)
bool operator()(const T *a, const T *b) const
Return true if and only if a's key is strictly greater than b's key.
PostlistCursor(HoneyTable *in, Xapian::docid offset_)
void initialise(Xapian::docid firstdid_, string &&glass_data, size_t data_start)
std::tuple< Xapian::docid, Xapian::docid > get_chunk(string &chunk)
PostlistCursor(const GlassTable *in, Xapian::docid offset_)
PostlistCursor(const HoneyTable *in, Xapian::docid offset_)
bool read_tag(bool keep_compressed=false)
void rewind()
Position cursor on the dummy empty key.
Definition: honey_cursor.h:85
std::string current_tag
Definition: honey_cursor.h:43
bool next()
Definition: honey_cursor.h:96
std::string current_key
Definition: honey_cursor.h:43
Database using honey backend.
bool single_file() const
bool has_uncommitted_changes() const
static void compact(Xapian::Compactor *compactor, const char *destdir, int fd, int source_backend, const std::vector< const Xapian::Database::Internal * > &sources, const std::vector< Xapian::docid > &offset, Xapian::Compactor::compaction_level compaction, unsigned flags, Xapian::docid last_docid)
void pack(std::string &buf)
void set_first_unused_block(uint4 base)
void create_and_open(int flags_, const Honey::RootInfo &root_info)
Definition: honey_table.cc:43
void add(std::string_view key, const char *val, size_t val_size, bool compressed=false)
Definition: honey_table.cc:74
const std::string & get_path() const
Definition: honey_table.h:630
void commit(honey_revision_number_t, Honey::RootInfo *root_info)
Definition: honey_table.cc:141
void flush_db()
Definition: honey_table.h:643
bool sync()
Definition: honey_table.h:654
void open(int flags_, const Honey::RootInfo &root_info, honey_revision_number_t)
Definition: honey_table.cc:58
The HoneyVersion class manages the revision files.
Definition: honey_version.h:79
void init(uint4 compress_min_)
const std::string & get_free_list() const
Definition: honey_version.h:59
void set_free_list(const std::string &s)
Definition: honey_version.h:64
void set_offset(off_t offset_)
Definition: honey_version.h:62
void append(const std::string &word)
Create a stream to which non-byte-aligned values can be written.
Definition: bitstream.h:34
void encode(Xapian::termpos value, Xapian::termpos outof)
Encode value, known to be less than outof.
Definition: bitstream.cc:92
std::string & freeze()
Finish encoding and return the encoded data as a std::string.
Definition: bitstream.h:51
void encode_interpolative(const Xapian::VecCOW< Xapian::termpos > &pos, int j, int k)
Perform interpolative encoding of pos elements between j and k.
Definition: bitstream.cc:158
Compact a database, or merge and compact several.
Definition: compactor.h:39
virtual void set_status(const std::string &table, const std::string &status)
Update progress.
Definition: compactor.cc:87
compaction_level
Compaction level.
Definition: compactor.h:42
virtual std::string resolve_duplicate_metadata(const std::string &key, size_t num_tags, const std::string tags[])
Resolve multiple user metadata entries with the same key.
Definition: compactor.cc:94
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
DatabaseCreateError indicates a failure to create a database.
Definition: error.h:439
DatabaseError indicates some sort of database related error.
Definition: error.h:355
Indicates an attempt to use a feature which is unavailable.
Definition: error.h:707
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
RangeError indicates an attempt to access outside the bounds of a container.
Definition: error.h:959
Suitable for "simple" type T.
Definition: smallvector.h:62
const T & back() const
Definition: smallvector.h:337
void push_back(T elt)
Definition: smallvector.h:190
size_type size() const
Definition: smallvector.h:135
Compact a database, or merge and compact several.
class wrapper around zlib
#define UNSIGNED_OVERFLOW_OK(X)
Definition: config.h:626
Constants in the Xapian namespace.
string term
PositionList * p
Xapian::termpos pos
Hierarchy of classes which Xapian can throw as exceptions.
Utility functions for testing files.
std::make_unsigned_t< off_t > file_size_type
Unsigned return type of file_size() function.
Definition: filetests.h:57
file_size_type file_size(const char *path)
Returns the size of a file.
Definition: filetests.h:76
Flint-compatible database locking.
static bool is_user_metadata_key(const string &key)
unsigned long long glass_tablesize_t
How many entries there are in a table.
Definition: glass_defs.h:71
enc
Definition: header.h:76
static void throw_database_corrupt(const char *item, const char *pos)
static bool termlist_key_is_values_used(const string &key)
HoneyCursor class.
Database using honey backend.
Definitions, types, etc for use inside honey.
#define HONEY_DOCLEN_CHUNK_MAX
Definition: honey_defs.h:50
#define HONEY_POSTLIST_CHUNK_MAX
Maximum size of a postlist chunk in bytes.
Definition: honey_defs.h:47
#define KEY_DOCLEN_PREFIX
Definition: honey_defs.h:93
#define HONEY_TABLE_EXTENSION
Honey table extension.
Definition: honey_defs.h:29
#define HONEY_MIN_DB_SIZE
Minimum size to pad a honey table to.
Definition: honey_defs.h:36
Encoding and decoding functions for honey postlists.
bool decode_delta_chunk_header(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first, Xapian::termcount &chunk_first_wdf)
void encode_delta_chunk_header(Xapian::docid chunk_first, Xapian::docid chunk_last, Xapian::termcount chunk_first_wdf, std::string &out)
void encode_delta_chunk_header_no_wdf(Xapian::docid chunk_first, Xapian::docid chunk_last, std::string &out)
bool decode_initial_chunk_header(const char **p, const char *end, Xapian::doccount &termfreq, Xapian::termcount &collfreq, Xapian::docid &first, Xapian::docid &last, Xapian::docid &chunk_last, Xapian::termcount &first_wdf, Xapian::termcount &wdf_max)
bool decode_delta_chunk_header_no_wdf(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first)
void encode_initial_chunk_header(Xapian::doccount termfreq, Xapian::termcount collfreq, Xapian::docid first, Xapian::docid last, Xapian::docid chunk_last, Xapian::termcount first_wdf, Xapian::termcount wdf_max, std::string &out)
HoneyTable class.
HoneyValueManager class.
HoneyVersion class.
#define HONEY_VERSION_MAX_SIZE
Maximum size to allow for honey version file data in single file DB.
Definition: honey_version.h:70
Types used internally.
static string encode_valuestats(Xapian::doccount freq, const string &lbound, const string &ubound)
static bool is_user_metadata_key(const string &key)
static bool is_doclenchunk_key(const string &key)
static bool is_valuestats_key(const string &key)
static bool is_valuechunk_key(const string &key)
table_type
Definition: glass_defs.h:53
void multimerge_postlists(Xapian::Compactor *compactor, T *out, const char *tmpdir, const vector< U * > &in, vector< Xapian::docid > off)
void merge_docid_keyed(T *out, const vector< const GlassTable * > &inputs, const vector< Xapian::docid > &offset, Xapian::termcount &ut_lb, Xapian::termcount &ut_ub, int table_type=0)
static int key_type(const string &key)
Return a Honey::KEY_* constant, or a different value for an invalid key.
void merge_synonyms(T *out, U b, U e)
static void merge_spellings(HoneyTable *out, vector< const HoneyTable * >::const_iterator b, vector< const HoneyTable * >::const_iterator e)
void merge_postlists(Xapian::Compactor *compactor, T *out, vector< Xapian::docid >::const_iterator offset, U b, U e)
void merge_positions(T *out, const vector< U * > &inputs, const vector< Xapian::docid > &offset)
static std::string encode_valuestats(Xapian::doccount freq, const std::string &lbound, const std::string &ubound)
Definition: honey_values.h:96
const unsigned KEY_PREFIX_WORD
std::string make_doclenchunk_key(Xapian::docid last_did)
Generate a key for a doclen chunk.
table_type
Definition: honey_defs.h:68
@ TERMLIST
Definition: honey_defs.h:71
@ DOCDATA
Definition: honey_defs.h:70
@ SYNONYM
Definition: honey_defs.h:74
@ POSITION
Definition: honey_defs.h:72
@ SPELLING
Definition: honey_defs.h:73
@ POSTLIST
Definition: honey_defs.h:69
std::string make_valuechunk_key(Xapian::valueno slot, Xapian::docid last_did)
Generate a key for a value stream chunk.
Definition: honey_values.h:39
const unsigned KEY_PREFIX_MIDDLE
const unsigned KEY_PREFIX_TAIL
const unsigned KEY_PREFIX_BOOKEND
Xapian::docid docid_from_key(const std::string &key)
std::string make_valuestats_key(Xapian::valueno slot)
Definition: honey_values.h:83
const unsigned KEY_PREFIX_HEAD
@ KEY_VALUE_CHUNK_HI
Definition: honey_defs.h:84
@ KEY_VALUE_CHUNK
Definition: honey_defs.h:83
@ KEY_DOCLEN_CHUNK
Definition: honey_defs.h:88
@ KEY_USER_METADATA
Definition: honey_defs.h:80
@ KEY_DOCLEN_CHUNK_HI
Definition: honey_defs.h:89
@ KEY_POSTING_CHUNK
Definition: honey_defs.h:90
@ KEY_VALUE_STATS_HI
Definition: honey_defs.h:82
@ KEY_VALUE_STATS
Definition: honey_defs.h:81
string str(int value)
Convert int to std::string.
Definition: str.cc:91
Database open(std::string_view host, unsigned int port, unsigned timeout=10000, unsigned connect_timeout=10000)
Construct a Database object for read-only access to a remote database accessed via a TCP connection.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
const int DB_NO_SYNC
Don't attempt to ensure changes have hit disk.
Definition: constants.h:65
const int DBCOMPACT_MULTIPASS
If merging more than 3 databases, merge the postlists in multiple passes.
Definition: constants.h:257
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
const int DB_BACKEND_GLASS
Use the glass backend.
Definition: constants.h:157
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
const int DBCOMPACT_SINGLE_FILE
Produce a single-file database.
Definition: constants.h:263
const int DB_DANGEROUS
Update the database in-place.
Definition: constants.h:102
#define AssertEq(A, B)
Definition: omassert.h:124
#define Assert(COND)
Definition: omassert.h:122
Arithmetic operations with overflow checks.
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > add_overflows(T1 a, T2 b, R &res)
Addition with overflow checking.
Definition: overflow.h:58
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > mul_overflows(T1 a, T2 b, R &res)
Multiplication with overflow checking.
Definition: overflow.h:188
Pack types into strings and unpack them again.
std::string pack_honey_postlist_key(std::string_view term)
Definition: pack.h:602
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:551
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:100
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_string_preserving_sort(std::string &s, std::string_view value, bool last=false)
Append an encoded std::string to a string, preserving the sort order.
Definition: pack.h:528
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:251
void pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
Definition: pack.h:204
#define O_BINARY
Definition: safefcntl.h:80
#define O_CLOEXEC
Definition: safefcntl.h:89
Various handy string-related helpers.
bool startswith(std::string_view s, char pfx)
Definition: stringutils.h:56
bool operator()(const T *a, const T *b) const
Return true if and only if a's key is strictly greater than b's key.
Definition: header.h:215
typedefs for Xapian
Statistics about values.
functions for reading and writing different width words
void unaligned_write4(unsigned char *ptr, T value)
Definition: wordaccess.h:173
static bool tags