31 #include <type_traits>
52 #include "../byte_length_strings.h"
53 #include "../prefix_compressed_strings.h"
55 #ifdef XAPIAN_HAS_GLASS_BACKEND
56 # include "../glass/glass_database.h"
57 # include "../glass/glass_table.h"
58 # include "../glass/glass_values.h"
70 message =
"Value overflow unpacking termlist: ";
72 message =
"Out of data unpacking termlist: ";
78 #ifdef XAPIAN_HAS_GLASS_BACKEND
84 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xc0';
90 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xd0';
96 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xd8';
102 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xe0';
110 const char*
p = key.data();
111 const char* e =
p + key.size();
116 if (e -
p == 1 && *
p ==
'\0')
137 unsigned char ch = key[1];
152 #ifdef XAPIAN_HAS_GLASS_BACKEND
156 class DoclenEncoder {
170 const char* d = data.data() + data_start;
171 const char* e = data.data() + data.size();
176 "glass docdata chunk");
181 "in glass docdata chunk");
183 pos = d - data.data();
187 std::tuple<Xapian::docid, Xapian::docid>
get_chunk(
string& chunk) {
193 const char* d = data.data() +
pos;
194 const char* e = data.data() + data.size();
218 "glass docdata chunk");
220 if (doclen > doclen_max) {
221 if (doclen >= 0xffffffff) {
223 const char* m =
"Document length values >= 0xffffffff "
224 "not currently handled";
230 auto s = chunk.size();
245 pos = d - data.data();
264 if (doclen_max >= 0xffff) {
265 if (doclen_max >= 0xffffff) {
269 Assert(chunk.size() >= 5);
271 for (
size_t i = 2; i < chunk.size(); i += 4) {
272 memcpy(
p, &chunk[i], 3);
275 chunk.resize(
p - &chunk[0]);
278 if (doclen_max >= 0xff) {
280 Assert(chunk.size() >= 5);
282 for (
size_t i = 3; i < chunk.size(); i += 4) {
283 memcpy(
p, &chunk[i], 2);
286 chunk.resize(
p - &chunk[0]);
287 }
else if (chunk.size() > 1) {
289 Assert(chunk.size() >= 5);
291 for (
size_t i = 4; i < chunk.size(); i += 4) {
294 chunk.resize(
p - &chunk[0]);
298 return std::pair(new_chunk_firstdid, new_chunk_lastdid);
328 if (doclen_encoder.in_progress()) {
330 std::tie(firstdid, chunk_lastdid) = doclen_encoder.get_chunk(tag);
334 if (value_stats_count > 1) {
342 key.assign(
"\0\xd0", 2);
346 if (find_exact(key))
break;
353 }
else if (value_stats_count == 1) {
357 value_stats_count = 0;
358 find_entry_lt(
"\0\xd1"s);
361 if (value_chunk_count > 1) {
370 key.assign(
"\0\xd8", 2);
387 const char*
p = current_key.data();
388 const char* end =
p + current_key.size();
408 tag.insert(0, newtag);
410 }
else if (value_chunk_count == 1) {
414 value_chunk_count = 0;
415 find_entry_lt(
"\0\xd9"s);
423 value_stats_count = 2;
430 goto start_value_stats;
436 value_chunk_count = 2;
443 goto start_value_chunk;
459 const char* d = key.data();
460 const char* e = d + key.size();
463 size_t data_start = 0;
474 data_start = d - tag.data();
484 static const char doclen_key_prefix[2] = {
487 key.assign(doclen_key_prefix, 2);
489 doclen_encoder.initialise(firstdid, std::move(tag), data_start);
490 std::tie(firstdid, chunk_lastdid) = doclen_encoder.get_chunk(tag);
497 const char* d = key.data();
498 const char* e = d + key.size();
513 have_wdfs = (cf != 0);
514 tag.erase(0, d - tag.data());
518 size_t tmp = d - key.data();
539 "glass posting chunk");
540 chunk_lastdid = firstdid + increase_to_last;
544 if ((first_wdf != 0) != have_wdfs) {
552 "both zero and non-zero wdf");
554 wdf_max = max(wdf_max, first_wdf);
560 "glass posting chunk");
566 if ((wdf != 0) != have_wdfs) {
574 "having both zero and non-zero "
579 wdf_max = max(wdf_max, wdf);
622 const char*
p = key.data();
623 const char* end =
p + key.length();
649 firstdid = chunk_lastdid - (tag.size() - 2) / (tag[0] / 8);
664 const char* d = key.data();
665 const char* e = d + key.size();
677 firstdid, lastdid, chunk_lastdid,
678 first_wdf, wdf_max)) {
685 tag.erase(0, d - tag.data());
691 have_wdfs = (cf != 0) && (cf - first_wdf != tf - 1);
695 remaining_cf_for_flat_wdf) &&
696 cf - first_wdf == remaining_cf_for_flat_wdf) {
714 first_wdf = (cf - first_wdf) / (tf - 1);
720 size_t tmp = d - key.data();
744 tag.erase(0, d - tag.data());
765 if (a->key > b->key)
return true;
766 if (a->key != b->key)
return false;
767 return (a->firstdid > b->firstdid);
772 template<
typename T,
typename U>
void
774 T* out, vector<Xapian::docid>::const_iterator offset,
780 priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
781 for ( ; b != e; ++b, ++offset) {
783 auto cursor =
new cursor_type(in, *offset);
784 if (cursor->next()) {
796 while (!pq.empty()) {
797 cursor_type* cur = pq.top();
798 const string& key = cur->key;
801 if (key != last_key) {
803 if (
tags.size() > 1 && compactor) {
804 Assert(!last_key.empty());
808 const string& resolved_tag =
812 if (!resolved_tag.empty())
813 out->add(last_key, resolved_tag);
815 Assert(!last_key.empty());
816 out->add(last_key,
tags[0]);
822 tags.push_back(cur->tag);
832 if (
tags.size() > 1 && compactor) {
833 Assert(!last_key.empty());
834 const string& resolved_tag =
838 if (!resolved_tag.empty())
839 out->add(last_key, resolved_tag);
841 Assert(!last_key.empty());
842 out->add(last_key,
tags[0]);
850 string lbound, ubound;
852 while (!pq.empty()) {
853 cursor_type* cur = pq.top();
854 const string& key = cur->key;
856 if (key != last_key) {
867 const string& tag = cur->tag;
869 const char*
pos = tag.data();
870 const char* end =
pos + tag.size();
888 size_t len = end -
pos;
900 if (l < lbound) lbound = l;
901 if (u > ubound) ubound = u;
918 while (!pq.empty()) {
919 cursor_type* cur = pq.top();
920 const string& key = cur->key;
922 out->add(key, cur->tag);
932 while (!pq.empty()) {
933 cursor_type* cur = pq.top();
935 string tag = std::move(cur->tag);
936 auto chunk_lastdid = cur->chunk_lastdid;
943 while (!pq.empty()) {
947 if (tag[0] != cur->tag[0]) {
952 size_t byte_width = tag[0] / 8;
953 auto new_size = tag.size();
955 new_size += gap_size * byte_width;
960 new_size += cur->tag.size() - 1;
961 auto full_new_size = new_size;
963 if (byte_width > 1) {
974 tag.reserve(new_size);
975 tag.append(byte_width * gap_size,
'\xff');
976 if (new_size != full_new_size) {
978 auto copy_size = new_size - tag.size();
979 tag.append(cur->tag, 1, copy_size);
980 cur->tag.erase(1, copy_size);
981 copy_size /= byte_width;
982 cur->firstdid += copy_size;
983 chunk_lastdid += gap_size;
984 chunk_lastdid += copy_size;
988 tag.append(cur->tag, 1, string::npos);
989 chunk_lastdid = cur->chunk_lastdid;
1001 struct HoneyPostListChunk {
1023 first_wdf(first_wdf_),
1027 have_wdfs(have_wdfs_),
1034 size_t data_size(
bool want_wdfs)
const {
1038 return tf * (1u + size_t(want_wdfs));
1041 if (have_wdfs == want_wdfs) {
1043 return data.size() + size_t(want_wdfs);
1048 return (data.size() + 1u) / 2u;
1052 return data.size() * 2u;
1056 void append_postings_to(
string& tag,
bool want_wdfs) {
1069 if (have_wdfs == want_wdfs) {
1072 }
else if (want_wdfs) {
1074 auto wdf = (cf - first_wdf) / (tf - 1);
1075 const char*
pos = data.data();
1076 const char* pos_end =
pos + data.size();
1077 while (
pos != pos_end) {
1086 const char*
pos = data.data();
1087 const char* pos_end =
pos + data.size();
1088 while (
pos != pos_end) {
1102 void append_postings_to(
string& tag,
bool want_wdfs,
1108 append_postings_to(tag, want_wdfs);
1111 vector<HoneyPostListChunk>
tags;
1116 cursor_type* cur = NULL;
1124 if (cur == NULL || cur->key != last_key) {
1125 if (!
tags.empty()) {
1130 max_element(
tags.begin(),
tags.end(),
1131 [](
const HoneyPostListChunk& x,
1132 const HoneyPostListChunk& y) {
1133 return x.wdf_max < y.wdf_max;
1136 bool have_wdfs =
true;
1140 }
else if (tf <= 2) {
1143 }
else if (cf == tf - 1 + first_wdf) {
1149 remaining_cf_for_flat_wdf) &&
1150 cf - first_wdf == remaining_cf_for_flat_wdf) {
1161 if (
tags.size() > 1) {
1171 size_t est =
tags[0].data_size(have_wdfs);
1172 while (j <
tags.size()) {
1173 est +=
tags[j].data_size(have_wdfs);
1180 chunk_lastdid =
tags[j - 1].last;
1185 first_wdf, wdf_max, first_tag);
1189 tags[0].append_postings_to(first_tag, have_wdfs);
1190 for (
size_t chunk = 1; chunk != j; ++chunk) {
1191 tags[chunk].append_postings_to(first_tag, have_wdfs,
1192 tags[chunk - 1].last);
1195 out->add(last_key, first_tag);
1197 if (j !=
tags.size()) {
1200 const char*
p = last_key.data();
1201 const char* end =
p + last_key.size();
1208 while (j <
tags.size()) {
1211 size_t est =
tags[j].data_size(have_wdfs);
1212 while (++j <
tags.size()) {
1213 est +=
tags[j].data_size(have_wdfs);
1217 last_did =
tags[j - 1].last;
1230 tags[i].append_postings_to(tag, have_wdfs);
1232 tags[i].append_postings_to(tag, have_wdfs,
1241 if (cur == NULL)
break;
1243 last_key = cur->key;
1246 if (tf && cur->tf && (cf == 0) != (cur->cf == 0)) {
1251 "both zero and non-zero wdf");
1256 tags.push_back(HoneyPostListChunk(cur->firstdid,
1263 std::move(cur->tag)));
1274 #ifdef XAPIAN_HAS_GLASS_BACKEND
1290 template<
typename T>
1294 if (b->after_end())
return false;
1295 if (a->after_end())
return true;
1296 return (a->current_key > b->current_key);
1300 #ifdef XAPIAN_HAS_GLASS_BACKEND
1304 vector<const GlassTable*>::const_iterator b,
1305 vector<const GlassTable*>::const_iterator e)
1309 priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1310 for ( ; b != e; ++b) {
1312 auto cursor =
new cursor_type(in);
1313 if (cursor->next()) {
1321 while (!pq.empty()) {
1322 cursor_type* cur = pq.top();
1351 string key = cur->current_key;
1372 string m =
"Bad spelling key prefix: ";
1373 m +=
static_cast<unsigned char>(key[0]);
1378 if (pq.empty() || pq.top()->current_key > key) {
1389 compressed = cur->read_tag(
false);
1392 AssertEq(cur->current_tag[1], key[1]);
1393 AssertEq(cur->current_tag[2], key[2]);
1394 cur->current_tag.erase(1, 2);
1402 compressed = cur->read_tag(
false);
1406 while (!spell_in.
at_end()) {
1407 spell_out.
append(*spell_in);
1410 cur->current_tag = std::move(new_tag);
1414 compressed = cur->read_tag(
true);
1417 out->
add(key, cur->current_tag, compressed);
1432 vector<PrefixCompressedStringItor*>,
1437 vector<cursor_type*> vec;
1438 vec.reserve(pq.size());
1444 if (pq.empty() || pq.top()->current_key != key)
break;
1451 while (!pqtag.empty()) {
1455 if (word != lastword) {
1467 for (
auto i : vec) {
1481 const char*
p = cur->current_tag.data();
1482 const char* end =
p + cur->current_tag.size();
1492 if (pq.empty() || pq.top()->current_key != key)
break;
1506 vector<const HoneyTable*>::const_iterator b,
1507 vector<const HoneyTable*>::const_iterator e)
1511 priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1512 for ( ; b != e; ++b) {
1514 auto cursor =
new cursor_type(in);
1515 if (cursor->next()) {
1523 while (!pq.empty()) {
1524 cursor_type* cur = pq.top();
1527 string key = cur->current_key;
1528 if (pq.empty() || pq.top()->current_key > key) {
1531 bool compressed = cur->read_tag(
true);
1532 out->
add(key, cur->current_tag, compressed);
1547 vector<PrefixCompressedStringItor*>,
1552 vector<cursor_type*> vec;
1553 vec.reserve(pq.size());
1560 if (pq.empty() || pq.top()->current_key != key)
break;
1567 while (!pqtag.empty()) {
1571 if (word != lastword) {
1583 for (
auto i : vec) {
1597 const char*
p = cur->current_tag.data();
1598 const char* end =
p + cur->current_tag.size();
1608 if (pq.empty() || pq.top()->current_key != key)
break;
1620 template<
typename T,
typename U>
void
1626 priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1627 for ( ; b != e; ++b) {
1629 auto cursor =
new cursor_type(in);
1630 if (cursor->next()) {
1638 while (!pq.empty()) {
1639 cursor_type* cur = pq.top();
1642 string key = cur->current_key;
1643 if (pq.empty() || pq.top()->current_key > key) {
1646 bool compressed = cur->read_tag(
true);
1647 out->add(key, cur->current_tag, compressed);
1662 vector<ByteLengthPrefixedStringItor*>,
1664 vector<cursor_type*> vec;
1670 if (pq.empty() || pq.top()->current_key != key)
break;
1675 string_view lastword;
1676 while (!pqtag.empty()) {
1679 string_view word = **it;
1680 if (word != lastword) {
1693 for (
auto i : vec) {
1706 template<
typename T,
typename U>
void
1708 T* out,
const char* tmpdir,
1709 const vector<U*>& in,
1710 vector<Xapian::docid> off)
1712 if (in.size() <= 3) {
1717 vector<HoneyTable*> tmp;
1718 tmp.reserve(in.size() / 2);
1720 vector<Xapian::docid> newoff;
1721 newoff.resize(in.size() / 2);
1722 for (
unsigned int i = 0, j; i < in.size(); i = j) {
1724 if (j == in.size() - 1) ++j;
1726 string dest = tmpdir;
1744 in.begin() + i, in.begin() + j);
1745 tmp.push_back(tmptab);
1747 tmptab->
commit(1, &root_info);
1753 while (tmp.size() > 3) {
1754 vector<HoneyTable*> tmpout;
1755 tmpout.reserve(tmp.size() / 2);
1756 vector<Xapian::docid> newoff;
1757 newoff.resize(tmp.size() / 2);
1758 for (
unsigned int i = 0, j; i < tmp.size(); i = j) {
1760 if (j == tmp.size() - 1) ++j;
1762 string dest = tmpdir;
1780 tmp.begin() + i, tmp.begin() + j);
1782 for (
unsigned int k = i; k < j; ++k) {
1788 tmpout.push_back(tmptab);
1790 tmptab->
commit(1, &root_info);
1798 for (
size_t k = 0; k < tmp.size(); ++k) {
1808 #ifdef XAPIAN_HAS_GLASS_BACKEND
1825 const char* d = current_key.data();
1826 const char* e = d + current_key.size();
1863 const char* d = current_key.data();
1864 const char* e = d + current_key.size();
1884 template<
typename T>
1890 return a->key > b->key;
1894 template<
typename T,
typename U>
void
1896 const vector<Xapian::docid>& offset)
1901 priority_queue<cursor_type*, vector<cursor_type*>, gt_type> pq;
1902 for (
size_t i = 0; i < inputs.size(); ++i) {
1903 auto in = inputs[i];
1904 auto cursor =
new cursor_type(in, offset[i]);
1905 if (cursor->next()) {
1913 while (!pq.empty()) {
1914 cursor_type* cur = pq.top();
1916 out->add(cur->key, cur->get_tag());
1925 template<
typename T,
typename U>
void
1927 const vector<Xapian::docid>& offset,
1930 for (
size_t i = 0; i < inputs.size(); ++i) {
1933 auto in = inputs[i];
1938 while (cur.
next()) {
1945 string msg =
"Bad key in ";
1946 msg += inputs[i]->get_path();
1955 key.append(d, e - d);
1960 bool compressed = cur.
read_tag(
true);
1966 #ifdef XAPIAN_HAS_GLASS_BACKEND
1967 template<
typename T>
void
1969 const vector<Xapian::docid>& offset,
1973 for (
size_t i = 0; i < inputs.size(); ++i) {
1976 auto in = inputs[i];
1977 if (in->empty())
continue;
1983 while (cur.
next()) {
1991 string msg =
"Bad key in ";
1992 msg += inputs[i]->get_path();
2001 key.append(d, e - d);
2016 bool next_result = cur.
next();
2017 bool next_already_done =
true;
2018 unsigned bitmap_slots_used = 0;
2019 string encoded_slots_used;
2022 next_already_done =
false;
2026 const char*
p = valtag.data();
2027 const char* end =
p + valtag.size();
2044 slot += last_slot + 1;
2050 if (slots.
back() <= 6) {
2053 for (
auto slot : slots) {
2054 bitmap_slots_used |= 1 << slot;
2059 if (slots.
size() > 1) {
2061 slots_used.
encode(first_slot, last_slot);
2063 last_slot - first_slot);
2066 encoded_slots_used = slots_used.
freeze();
2068 encoded_slots_used = std::move(
enc);
2073 const char*
pos = tag.data();
2074 const char* end =
pos + tag.size();
2077 if (encoded_slots_used.empty()) {
2078 newtag += char(bitmap_slots_used);
2080 auto size = encoded_slots_used.size();
2082 newtag += char(0x80 | size);
2087 newtag += encoded_slots_used;
2100 auto uniq_terms = min(termlist_size, doclen);
2102 (ut_lb == 0 || uniq_terms < ut_lb)) {
2105 if (uniq_terms > ut_ub)
2111 string current_term;
2112 while (
pos != end) {
2115 if (!current_term.empty()) {
2116 size_t reuse =
static_cast<unsigned char>(*
pos++);
2117 newtag += char(reuse);
2119 if (reuse > current_term.size()) {
2120 current_wdf = reuse / (current_term.size() + 1);
2121 reuse = reuse % (current_term.size() + 1);
2123 current_term.resize(reuse);
2129 size_t append =
static_cast<unsigned char>(*
pos++);
2130 if (
size_t(end -
pos) < append)
2133 current_term.append(
pos, append);
2145 newtag += char(append);
2146 newtag.append(current_term.end() - append,
2147 current_term.end());
2150 if (!newtag.empty())
2151 out->add(key, newtag);
2152 if (!next_result)
break;
2153 if (next_already_done)
goto next_without_next;
2155 bool compressed = cur.
read_tag(
true);
2169 const char* destdir,
2172 const vector<const Xapian::Database::Internal*>& sources,
2173 const vector<Xapian::docid>& offset,
2190 static const table_list tables[] = {
2211 for (
size_t i = 0; i != sources.size(); ++i) {
2212 bool has_uncommitted_changes;
2214 #ifdef XAPIAN_HAS_GLASS_BACKEND
2224 if (has_uncommitted_changes) {
2226 "Can't compact from a WritableDatabase with uncommitted "
2227 "changes - either call commit() first, or create a new "
2228 "Database object from the filename on disk";
2243 unique_ptr<HoneyVersion> version_file_out;
2259 bool bad_totals =
false;
2262 version_file_out->create();
2263 for (
size_t i = 0; i != sources.size(); ++i) {
2264 bool source_single_file =
false;
2266 #ifdef XAPIAN_HAS_GLASS_BACKEND
2269 auto& v_out = version_file_out;
2275 v_in.get_doclength_lower_bound(),
2276 v_in.get_doclength_upper_bound(),
2277 v_in.get_wdf_upper_bound(),
2278 v_in.get_total_doclen(),
2279 v_in.get_spelling_wordfreq_upper_bound(),
2282 source_single_file = db->single_file();
2288 version_file_out->merge_stats(db->version_file);
2291 if (source_single_file) {
2295 sources[i]->get_backend_info(&path);
2307 string fl_serialised;
2312 fl.
pack(fl_serialised);
2318 #ifndef XAPIAN_HAS_GLASS_BACKEND
2321 vector<HoneyTable*> tabs;
2322 tabs.reserve(std::end(tables) - std::begin(tables));
2324 for (
const auto& t : tables) {
2341 bool output_will_exist = !t.lazy;
2345 bool bad_stat =
false;
2349 bool single_file_in =
false;
2353 vector<const GlassTable*> inputs;
2354 inputs.reserve(sources.size());
2355 size_t inputs_present = 0;
2356 for (
auto src : sources) {
2361 table = &(db->postlist_table);
2364 table = &(db->docdata_table);
2367 table = &(db->termlist_table);
2370 table = &(db->position_table);
2373 table = &(db->spelling_table);
2376 table = &(db->synonym_table);
2383 if (db->single_file()) {
2390 single_file_in =
true;
2391 output_will_exist =
true;
2400 in_size += db_size / 1024;
2401 output_will_exist =
true;
2403 }
else if (errno != ENOENT) {
2405 bad_totals = bad_stat =
true;
2406 output_will_exist =
true;
2410 inputs.push_back(table);
2415 if (inputs_present != 0) {
2417 string m =
str(inputs_present);
2419 m +=
str(sources.size());
2420 m +=
" inputs present, so suppressing output";
2425 output_will_exist =
false;
2428 if (!output_will_exist) {
2430 compactor->
set_status(t.name,
"doesn't exist");
2435 off_t table_start_offset = -1;
2442 if (table_start_offset < 0)
2445 table_start_offset = lseek(fd, 0, SEEK_CUR);
2447 out =
new HoneyTable(t.name, fd, version_file_out->get_offset(),
2450 out =
new HoneyTable(t.name, dest,
false, t.lazy);
2452 tabs.push_back(out);
2458 version_file_out->get_root(t.type),
2459 version_file_out->get_revision());
2466 if (multipass && inputs.size() > 3) {
2471 inputs.begin(), inputs.end());
2486 auto& v_out = version_file_out;
2487 auto ut_lb = v_out->get_unique_terms_lower_bound();
2488 auto ut_ub = v_out->get_unique_terms_upper_bound();
2490 version_file_out->set_unique_terms_lower_bound(ut_lb);
2491 version_file_out->set_unique_terms_upper_bound(ut_ub);
2498 out->
commit(1, root_info);
2500 if (single_file) fl_serialised = root_info->
get_free_list();
2503 if (!bad_stat && !single_file_in) {
2512 auto old_prev_size = prev_size;
2513 prev_size = db_size;
2514 db_size -= old_prev_size;
2519 out_size = db_size / 1024;
2520 }
else if (errno != ENOENT) {
2521 bad_totals = bad_stat =
true;
2527 "Done (couldn't stat all the DB files)");
2528 }
else if (single_file_in) {
2531 "Done (table sizes unknown for single "
2535 if (out_size == in_size) {
2536 status =
"Size unchanged (";
2539 if (out_size < in_size) {
2540 delta = in_size - out_size;
2541 status =
"Reduced by ";
2543 delta = out_size - in_size;
2544 status =
"INCREASED by ";
2547 status +=
str(100 * delta / in_size);
2550 status +=
str(delta);
2552 status +=
str(in_size);
2555 status +=
str(out_size);
2567 #ifdef HAVE_FTRUNCATE
2574 if (lseek(fd, off, SEEK_SET) != off || write(fd,
"", 1) != 1) {
2582 if (lseek(fd, version_file_out->get_offset(), SEEK_SET) == -1) {
2586 version_file_out->set_last_docid(last_docid);
2587 string tmpfile = version_file_out->write(1, FLAGS);
2589 off_t version_file_size = lseek(fd, 0, SEEK_CUR);
2590 if (version_file_size < 0) {
2595 "version file data");
2598 for (
unsigned j = 0; j != tabs.size(); ++j) {
2602 version_file_out->sync(tmpfile, 1, FLAGS);
2603 for (
unsigned j = 0; j != tabs.size(); ++j) {
2608 vector<HoneyTable*> tabs;
2609 tabs.reserve(std::end(tables) - std::begin(tables));
2611 for (
const auto& t : tables) {
2628 bool output_will_exist = !t.lazy;
2632 bool bad_stat =
false;
2636 bool single_file_in =
false;
2640 vector<const HoneyTable*> inputs;
2641 inputs.reserve(sources.size());
2642 size_t inputs_present = 0;
2643 for (
auto src : sources) {
2648 table = &(db->postlist_table);
2651 table = &(db->docdata_table);
2654 table = &(db->termlist_table);
2657 table = &(db->position_table);
2660 table = &(db->spelling_table);
2663 table = &(db->synonym_table);
2670 if (db->single_file()) {
2677 single_file_in =
true;
2678 output_will_exist =
true;
2687 in_size += db_size / 1024;
2688 output_will_exist =
true;
2690 }
else if (errno != ENOENT) {
2692 bad_totals = bad_stat =
true;
2693 output_will_exist =
true;
2697 inputs.push_back(table);
2702 if (inputs_present != 0) {
2704 string m =
str(inputs_present);
2706 m +=
str(sources.size());
2707 m +=
" inputs present, so suppressing output";
2712 output_will_exist =
false;
2715 if (!output_will_exist) {
2717 compactor->
set_status(t.name,
"doesn't exist");
2722 off_t table_start_offset = -1;
2729 if (table_start_offset < 0)
2732 table_start_offset = lseek(fd, 0, SEEK_CUR);
2734 out =
new HoneyTable(t.name, fd, version_file_out->get_offset(),
2737 out =
new HoneyTable(t.name, dest,
false, t.lazy);
2739 tabs.push_back(out);
2745 version_file_out->get_root(t.type),
2746 version_file_out->get_revision());
2753 if (multipass && inputs.size() > 3) {
2758 inputs.begin(), inputs.end());
2779 out->
commit(1, root_info);
2781 if (single_file) fl_serialised = root_info->
get_free_list();
2784 if (!bad_stat && !single_file_in) {
2793 auto old_prev_size = prev_size;
2794 prev_size = db_size;
2795 db_size -= old_prev_size;
2800 out_size = db_size / 1024;
2801 }
else if (errno != ENOENT) {
2802 bad_totals = bad_stat =
true;
2808 "Done (couldn't stat all the DB files)");
2809 }
else if (single_file_in) {
2812 "Done (table sizes unknown for single "
2816 if (out_size == in_size) {
2817 status =
"Size unchanged (";
2820 if (out_size < in_size) {
2821 delta = in_size - out_size;
2822 status =
"Reduced by ";
2824 delta = out_size - in_size;
2825 status =
"INCREASED by ";
2828 status +=
str(100 * delta / in_size);
2831 status +=
str(delta);
2833 status +=
str(in_size);
2836 status +=
str(out_size);
2848 #ifdef HAVE_FTRUNCATE
2855 if (lseek(fd, off, SEEK_SET) != off || write(fd,
"", 1) != 1) {
2863 if (lseek(fd, version_file_out->get_offset(), SEEK_SET) < 0) {
2867 version_file_out->set_last_docid(last_docid);
2868 string tmpfile = version_file_out->write(1, FLAGS);
2869 for (
unsigned j = 0; j != tabs.size(); ++j) {
2873 version_file_out->sync(tmpfile, 1, FLAGS);
2874 for (
unsigned j = 0; j != tabs.size(); ++j) {
2879 if (!single_file) lock.
release();
2881 if (!bad_totals && compactor) {
2885 if (out_total == in_total) {
2886 status =
"Size unchanged (";
2889 if (out_total < in_total) {
2890 delta = in_total - out_total;
2891 status =
"Reduced by ";
2893 delta = out_total - in_total;
2894 status =
"INCREASED by ";
2897 status +=
str(100 * delta / in_total);
2900 status +=
str(delta);
2902 status +=
str(in_total);
2905 status +=
str(out_total);
void release()
Release the lock.
reason lock(bool exclusive, bool wait, std::string &explanation)
Attempt to obtain the lock.
void throw_databaselockerror(FlintLock::reason why, const std::string &db_dir, const std::string &explanation) const
Throw Xapian::DatabaseLockError.
A cursor pointing to a position in a Btree table, for reading several entries in order,...
string current_key
Current key pointed to by cursor.
bool read_tag(bool keep_compressed=false)
Read the tag from the table and store it in current_tag.
bool next()
Advance to the next key.
void rewind()
Position cursor on the dummy empty key.
string current_tag
Current tag pointed to by cursor.
A backend designed for efficient indexing and retrieval, using compressed posting lists and a btree s...
GlassVersion version_file
The file describing the Glass database.
virtual bool has_uncommitted_changes() const
Return true if there are uncommitted changes.
Class managing a Btree table in a Glass database.
void merge_stats(const GlassVersion &o)
Merge the database stats.
Xapian::docid get_docid() const
bool operator()(const T *a, const T *b) const
Return true if and only if a's key is strictly greater than b's key.
const string & get_tag() const
PositionCursor(const GlassTable *in, Xapian::docid offset_)
const string & get_tag() const
PositionCursor(const HoneyTable *in, Xapian::docid offset_)
bool operator()(const T *a, const T *b) const
Return true if and only if a's key is strictly greater than b's key.
PostlistCursor(HoneyTable *in, Xapian::docid offset_)
void initialise(Xapian::docid firstdid_, string &&glass_data, size_t data_start)
std::tuple< Xapian::docid, Xapian::docid > get_chunk(string &chunk)
Xapian::docid chunk_lastdid
Xapian::termcount first_wdf
Xapian::termcount wdf_max
PostlistCursor(const GlassTable *in, Xapian::docid offset_)
DoclenEncoder doclen_encoder
Xapian::termcount wdf_max
Xapian::termcount first_wdf
Xapian::docid chunk_lastdid
PostlistCursor(const HoneyTable *in, Xapian::docid offset_)
bool read_tag(bool keep_compressed=false)
void rewind()
Position cursor on the dummy empty key.
Database using honey backend.
bool has_uncommitted_changes() const
static void compact(Xapian::Compactor *compactor, const char *destdir, int fd, int source_backend, const std::vector< const Xapian::Database::Internal * > &sources, const std::vector< Xapian::docid > &offset, Xapian::Compactor::compaction_level compaction, unsigned flags, Xapian::docid last_docid)
void pack(std::string &buf)
void set_first_unused_block(uint4 base)
void create_and_open(int flags_, const Honey::RootInfo &root_info)
void add(std::string_view key, const char *val, size_t val_size, bool compressed=false)
const std::string & get_path() const
void commit(honey_revision_number_t, Honey::RootInfo *root_info)
void open(int flags_, const Honey::RootInfo &root_info, honey_revision_number_t)
The HoneyVersion class manages the revision files.
void init(uint4 compress_min_)
const std::string & get_free_list() const
void set_free_list(const std::string &s)
void set_offset(off_t offset_)
void append(const std::string &word)
Create a stream to which non-byte-aligned values can be written.
void encode(Xapian::termpos value, Xapian::termpos outof)
Encode value, known to be less than outof.
std::string & freeze()
Finish encoding and return the encoded data as a std::string.
void encode_interpolative(const Xapian::VecCOW< Xapian::termpos > &pos, int j, int k)
Perform interpolative encoding of pos elements between j and k.
Compact a database, or merge and compact several.
virtual void set_status(const std::string &table, const std::string &status)
Update progress.
compaction_level
Compaction level.
virtual std::string resolve_duplicate_metadata(const std::string &key, size_t num_tags, const std::string tags[])
Resolve multiple user metadata entries with the same key.
DatabaseCorruptError indicates database corruption was detected.
DatabaseCreateError indicates a failure to create a database.
DatabaseError indicates some sort of database related error.
Indicates an attempt to use a feature which is unavailable.
InvalidOperationError indicates the API was used in an invalid way.
RangeError indicates an attempt to access outside the bounds of a container.
Suitable for "simple" type T.
Compact a database, or merge and compact several.
Class wrapper around zlib.
#define UNSIGNED_OVERFLOW_OK(X)
Constants in the Xapian namespace.
Hierarchy of classes which Xapian can throw as exceptions.
Utility functions for testing files.
std::make_unsigned_t< off_t > file_size_type
Unsigned return type of file_size() function.
file_size_type file_size(const char *path)
Returns the size of a file.
Flint-compatible database locking.
static bool is_user_metadata_key(const string &key)
unsigned long long glass_tablesize_t
How many entries there are in a table.
static void throw_database_corrupt(const char *item, const char *pos)
static bool termlist_key_is_values_used(const string &key)
Database using honey backend.
Definitions, types, etc for use inside honey.
#define HONEY_DOCLEN_CHUNK_MAX
#define HONEY_POSTLIST_CHUNK_MAX
Maximum size of a postlist chunk in bytes.
#define KEY_DOCLEN_PREFIX
#define HONEY_TABLE_EXTENSION
Honey table extension.
#define HONEY_MIN_DB_SIZE
Minimum size to pad a honey table to.
Encoding and decoding functions for honey postlists.
bool decode_delta_chunk_header(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first, Xapian::termcount &chunk_first_wdf)
void encode_delta_chunk_header(Xapian::docid chunk_first, Xapian::docid chunk_last, Xapian::termcount chunk_first_wdf, std::string &out)
void encode_delta_chunk_header_no_wdf(Xapian::docid chunk_first, Xapian::docid chunk_last, std::string &out)
bool decode_initial_chunk_header(const char **p, const char *end, Xapian::doccount &termfreq, Xapian::termcount &collfreq, Xapian::docid &first, Xapian::docid &last, Xapian::docid &chunk_last, Xapian::termcount &first_wdf, Xapian::termcount &wdf_max)
bool decode_delta_chunk_header_no_wdf(const char **p, const char *end, Xapian::docid chunk_last, Xapian::docid &chunk_first)
void encode_initial_chunk_header(Xapian::doccount termfreq, Xapian::termcount collfreq, Xapian::docid first, Xapian::docid last, Xapian::docid chunk_last, Xapian::termcount first_wdf, Xapian::termcount wdf_max, std::string &out)
#define HONEY_VERSION_MAX_SIZE
Maximum size to allow for honey version file data in single file DB.
static string encode_valuestats(Xapian::doccount freq, const string &lbound, const string &ubound)
static bool is_user_metadata_key(const string &key)
static bool is_doclenchunk_key(const string &key)
static bool is_valuestats_key(const string &key)
static bool is_valuechunk_key(const string &key)
void multimerge_postlists(Xapian::Compactor *compactor, T *out, const char *tmpdir, const vector< U * > &in, vector< Xapian::docid > off)
void merge_docid_keyed(T *out, const vector< const GlassTable * > &inputs, const vector< Xapian::docid > &offset, Xapian::termcount &ut_lb, Xapian::termcount &ut_ub, int table_type=0)
static int key_type(const string &key)
Return a Honey::KEY_* constant, or a different value for an invalid key.
void merge_synonyms(T *out, U b, U e)
static void merge_spellings(HoneyTable *out, vector< const HoneyTable * >::const_iterator b, vector< const HoneyTable * >::const_iterator e)
void merge_postlists(Xapian::Compactor *compactor, T *out, vector< Xapian::docid >::const_iterator offset, U b, U e)
void merge_positions(T *out, const vector< U * > &inputs, const vector< Xapian::docid > &offset)
static std::string encode_valuestats(Xapian::doccount freq, const std::string &lbound, const std::string &ubound)
const unsigned KEY_PREFIX_WORD
std::string make_doclenchunk_key(Xapian::docid last_did)
Generate a key for a doclen chunk.
std::string make_valuechunk_key(Xapian::valueno slot, Xapian::docid last_did)
Generate a key for a value stream chunk.
const unsigned KEY_PREFIX_MIDDLE
const unsigned KEY_PREFIX_TAIL
const unsigned KEY_PREFIX_BOOKEND
Xapian::docid docid_from_key(const std::string &key)
std::string make_valuestats_key(Xapian::valueno slot)
const unsigned KEY_PREFIX_HEAD
string str(int value)
Convert int to std::string.
Database open(std::string_view host, unsigned int port, unsigned timeout=10000, unsigned connect_timeout=10000)
Construct a Database object for read-only access to a remote database accessed via a TCP connection.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
const int DB_NO_SYNC
Don't attempt to ensure changes have hit disk.
const int DBCOMPACT_MULTIPASS
If merging more than 3 databases, merge the postlists in multiple passes.
unsigned valueno
The number for a value slot in a document.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
const int DB_BACKEND_GLASS
Use the glass backend.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
const int DBCOMPACT_SINGLE_FILE
Produce a single-file database.
const int DB_DANGEROUS
Update the database in-place.
Arithmetic operations with overflow checks.
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > add_overflows(T1 a, T2 b, R &res)
Addition with overflow checking.
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > mul_overflows(T1 a, T2 b, R &res)
Multiplication with overflow checking.
Pack types into strings and unpack them again.
std::string pack_honey_postlist_key(std::string_view term)
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
void pack_string_preserving_sort(std::string &s, std::string_view value, bool last=false)
Append an encoded std::string to a string, preserving the sort order.
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
void pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
Various handy string-related helpers.
bool startswith(std::string_view s, char pfx)
bool operator()(const T *a, const T *b) const
Return true if and only if a's key is strictly greater than b's key.
MergeCursor(const GlassTable *in)
MergeCursor(const HoneyTable *in)
Functions for reading and writing words of different widths.
void unaligned_write4(unsigned char *ptr, T value)