45 #include "../byte_length_strings.h" 46 #include "../prefix_compressed_strings.h" 57 return key.size() == 1 && key[0] ==
'\0';
63 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xc0';
69 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xd0';
75 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xd8';
81 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xe0';
111 const char * p = key.data();
112 const char * end = p + key.length();
122 key.assign(
"\0\xd8", 2);
131 const char * d = key.data();
132 const char * e = d + key.size();
151 tag.erase(0, d - tag.data());
154 size_t tmp = d - key.data();
173 if (a->
key > b->
key)
return true;
174 if (a->
key != b->
key)
return false;
181 const string & lbound,
const string & ubound)
189 if (lbound != ubound) value += ubound;
195 ChertTable * out, vector<Xapian::docid>::const_iterator offset,
196 vector<ChertTable*>::const_iterator b,
197 vector<ChertTable*>::const_iterator e,
204 priority_queue<PostlistCursor *, vector<PostlistCursor *>,
PostlistCursorGt> pq;
205 for ( ; b != e; ++b, ++offset) {
218 const char * data = cur->
tag.data();
219 const char * end = data + cur->
tag.size();
226 if (!
unpack_uint(&data, end, &doclen_lbound_tmp)) {
229 doclen_lbound = min(doclen_lbound, doclen_lbound_tmp);
235 wdf_ubound = max(wdf_ubound, wdf_ubound_tmp);
238 if (!
unpack_uint(&data, end, &doclen_ubound_tmp)) {
241 doclen_ubound_tmp += wdf_ubound_tmp;
242 doclen_ubound = max(doclen_ubound, doclen_ubound_tmp);
248 tot_totlen += totlen;
249 if (tot_totlen < totlen) {
250 throw "totlen wrapped!";
264 if (doclen_lbound > doclen_ubound)
265 doclen_lbound = doclen_ubound;
270 pack_uint(tag, doclen_ubound - wdf_ubound);
272 out->
add(
string(1,
'\0'), tag);
279 while (!pq.empty()) {
281 const string& key = cur->
key;
284 if (key != last_key) {
286 if (tags.size() > 1 && compactor) {
287 Assert(!last_key.empty());
291 const string & resolved_tag =
295 if (!resolved_tag.empty())
296 out->
add(last_key, resolved_tag);
298 Assert(!last_key.empty());
299 out->
add(last_key, tags[0]);
305 tags.push_back(cur->
tag);
315 if (tags.size() > 1 && compactor) {
316 Assert(!last_key.empty());
317 const string & resolved_tag =
321 if (!resolved_tag.empty())
322 out->
add(last_key, resolved_tag);
324 Assert(!last_key.empty());
325 out->
add(last_key, tags[0]);
333 string lbound, ubound;
335 while (!pq.empty()) {
337 const string& key = cur->
key;
339 if (key != last_key) {
350 const string & tag = cur->
tag;
352 const char * pos = tag.data();
353 const char * end = pos + tag.size();
365 size_t len = end - pos;
377 if (l < lbound) lbound = l;
378 if (u > ubound) ubound = u;
395 while (!pq.empty()) {
397 const string & key = cur->
key;
410 vector<pair<Xapian::docid, string> >
tags;
418 if (cur == NULL || cur->
key != last_key) {
424 string tag = tags[0].second;
425 tag[0] = (tags.size() == 1) ?
'1' :
'0';
427 out->
add(last_key, first_tag);
431 const char * p = last_key.data();
432 const char * end = p + last_key.size();
437 vector<pair<Xapian::docid, string> >::const_iterator i;
439 while (++i != tags.end()) {
441 tag[0] = (i + 1 == tags.end()) ?
'1' :
'0';
446 if (cur == NULL)
break;
452 tags.push_back(make_pair(cur->
firstdid, cur->
tag));
463 find_entry(
string());
479 vector<ChertTable*>::const_iterator b,
480 vector<ChertTable*>::const_iterator e)
482 priority_queue<MergeCursor *, vector<MergeCursor *>,
CursorGt> pq;
483 for ( ; b != e; ++b) {
490 while (!pq.empty()) {
495 if (pq.empty() || pq.top()->current_key > key) {
498 bool compressed = cur->
read_tag(
true);
514 vector<PrefixCompressedStringItor *>,
519 vector<MergeCursor *> vec;
520 vec.reserve(pq.size());
524 pqtag.push(
new PrefixCompressedStringItor(cur->
current_tag));
526 if (pq.empty() || pq.top()->current_key != key)
break;
533 while (!pqtag.empty()) {
534 PrefixCompressedStringItor * it = pqtag.top();
537 if (word != lastword) {
549 vector<MergeCursor *>::const_iterator i;
550 for (i = vec.begin(); i != vec.end(); ++i) {
575 if (pq.empty() || pq.top()->current_key != key)
break;
588 vector<ChertTable*>::const_iterator b,
589 vector<ChertTable*>::const_iterator e)
591 priority_queue<MergeCursor *, vector<MergeCursor *>,
CursorGt> pq;
592 for ( ; b != e; ++b) {
599 while (!pq.empty()) {
604 if (pq.empty() || pq.top()->current_key > key) {
607 bool compressed = cur->
read_tag(
true);
623 vector<ByteLengthPrefixedStringItor *>,
625 vector<MergeCursor *> vec;
629 pqtag.push(
new ByteLengthPrefixedStringItor(cur->
current_tag));
631 if (pq.empty() || pq.top()->current_key != key)
break;
637 while (!pqtag.empty()) {
638 ByteLengthPrefixedStringItor * it = pqtag.top();
640 if (**it != lastword) {
653 vector<MergeCursor *>::const_iterator i;
654 for (i = vec.begin(); i != vec.end(); ++i) {
670 vector<ChertTable *> tmp,
671 vector<Xapian::docid> off,
675 while (tmp.size() > 3) {
676 vector<ChertTable *> tmpout;
677 tmpout.reserve(tmp.size() / 2);
678 vector<Xapian::docid> newoff;
679 newoff.resize(tmp.size() / 2);
680 for (
unsigned int i = 0, j; i < tmp.size(); i = j) {
682 if (j == tmp.size() - 1) ++j;
684 string dest = tmpdir;
698 tmp.begin() + i, tmp.begin() + j,
701 for (
unsigned int k = i; k < j; ++k) {
702 unlink((tmp[k]->get_path() +
"DB").c_str());
703 unlink((tmp[k]->get_path() +
"baseA").c_str());
704 unlink((tmp[k]->get_path() +
"baseB").c_str());
709 tmpout.push_back(tmptab);
720 for (
size_t k = 0; k < tmp.size(); ++k) {
721 unlink((tmp[k]->get_path() +
"DB").c_str());
722 unlink((tmp[k]->get_path() +
"baseA").c_str());
723 unlink((tmp[k]->get_path() +
"baseB").c_str());
732 const vector<Xapian::docid> & offset)
734 for (
size_t i = 0; i < inputs.size(); ++i) {
738 if (in->
empty())
continue;
751 string msg =
"Bad key in ";
752 msg += inputs[i]->get_path();
761 key.append(d, e - d);
766 bool compressed = cur.
read_tag(
true);
778 const char * destdir,
779 const vector<Xapian::Database::Internal*> & sources,
780 const vector<Xapian::docid> & offset,
795 int compress_strategy;
800 static const table_list tables[] = {
803 {
"record", RECORD, Z_DEFAULT_STRATEGY,
false },
804 {
"termlist",
TERMLIST, Z_DEFAULT_STRATEGY,
false },
806 {
"spelling",
SPELLING, Z_DEFAULT_STRATEGY,
true },
807 {
"synonym",
SYNONYM, Z_DEFAULT_STRATEGY,
true }
809 const table_list * tables_end = tables +
810 (
sizeof(tables) /
sizeof(tables[0]));
818 for (
size_t i = 0; i != sources.size(); ++i) {
822 "Can't compact from a WritableDatabase with uncommitted " 823 "changes - either call commit() first, or create a new " 824 "Database object from the filename on disk";
829 if (block_size < 2048 || block_size > 65536 ||
830 (block_size & (block_size - 1)) != 0) {
843 vector<ChertTable *> tabs;
844 tabs.reserve(tables_end - tables);
845 for (
const table_list * t = tables; t < tables_end; ++t) {
854 string dest = destdir;
859 bool output_will_exist = !t->lazy;
863 bool bad_stat =
false;
867 vector<ChertTable*> inputs;
868 inputs.reserve(sources.size());
869 size_t inputs_present = 0;
870 for (
auto src : sources) {
899 in_size += db_size / 1024;
900 output_will_exist =
true;
902 }
else if (errno != ENOENT) {
905 output_will_exist =
true;
908 inputs.push_back(table);
912 if (t->type ==
TERMLIST && inputs_present != sources.size()) {
913 if (inputs_present != 0) {
915 string m =
str(inputs_present);
917 m +=
str(sources.size());
918 m +=
" inputs present, so suppressing output";
923 output_will_exist =
false;
926 if (!output_will_exist) {
928 compactor->
set_status(t->name,
"doesn't exist");
932 ChertTable out(t->name, dest,
false, t->compress_strategy, t->lazy);
945 if (multipass && inputs.size() > 3) {
950 inputs.begin(), inputs.end(),
974 out_size = db_size / 1024;
976 bad_stat = (errno != ENOENT);
981 compactor->
set_status(t->name,
"Done (couldn't stat all the DB files)");
984 if (out_size == in_size) {
985 status =
"Size unchanged (";
988 if (out_size < in_size) {
989 delta = in_size - out_size;
990 status =
"Reduced by ";
992 delta = out_size - in_size;
993 status =
"INCREASED by ";
996 status +=
str(100 * delta / in_size);
999 status +=
str(delta);
1001 status +=
str(in_size);
1004 status +=
str(out_size);
static void compact(Xapian::Compactor *compactor, const char *destdir, const std::vector< Xapian::Database::Internal *> &sources, const std::vector< Xapian::docid > &offset, size_t block_size, Xapian::Compactor::compaction_level compaction, unsigned flags, Xapian::docid last_docid)
void throw_databaselockerror(FlintLock::reason why, const std::string &db_dir, const std::string &explanation) const
Throw Xapian::DatabaseLockError.
A cursor pointing to a position in a Btree table, for reading several entries in order, or finding approximate matches.
static bool is_metainfo_key(const string &key)
virtual bool has_uncommitted_changes() const
Return true if there are uncommitted changes.
Allow oversize items to save more space (not recommended if you ever plan to update the compacted database).
bool next()
Advance to the next key.
InvalidOperationError indicates the API was used in an invalid way.
ChertRecordTable record_table
Table storing records.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
static void merge_synonyms(ChertTable *out, vector< ChertTable *>::const_iterator b, vector< ChertTable *>::const_iterator e)
void set_full_compaction(bool parity)
Constants in the Xapian namespace.
static bool is_valuestats_key(const string &key)
Class managing a Btree table in a Chert database.
ChertSpellingTable spelling_table
Table storing spelling correction data.
Don't split items unnecessarily.
Compact a database, or merge and compact several.
ChertPositionListTable position_table
Table storing position lists.
Utility functions for testing files.
ChertPostListTable postlist_table
Table storing posting lists.
static void multimerge_postlists(Xapian::Compactor *compactor, ChertTable *out, const char *tmpdir, vector< ChertTable *> tmp, vector< Xapian::docid > off, Xapian::docid last_docid)
ChertSynonymTable synonym_table
Table storing synonym data.
bool operator()(const ChertCursor *a, const ChertCursor *b) const
Return true if and only if a's key is strictly greater than b's key.
void create_and_open(unsigned int blocksize)
Create a new empty btree structure on disk and open it at the initial revision.
virtual void set_status(const std::string &table, const std::string &status)
Update progress.
static bool is_user_metadata_key(const string &key)
PostlistCursor(ChertTable *in, Xapian::docid offset_)
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
RangeError indicates an attempt to access outside the bounds of a container.
string current_tag
Current tag pointed to by cursor.
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
static string encode_valuestats(Xapian::doccount freq, const string &lbound, const string &ubound)
static bool is_doclenchunk_key(const string &key)
void C_pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
bool C_unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Interface to Btree cursors.
virtual std::string resolve_duplicate_metadata(const std::string &key, size_t num_tags, const std::string tags[])
Resolve multiple user metadata entries with the same key.
Compact a database, or merge and compact several.
static bool is_valuechunk_key(const string &key)
static void merge_postlists(Xapian::Compactor *compactor, ChertTable *out, vector< Xapian::docid >::const_iterator offset, vector< ChertTable *>::const_iterator b, vector< ChertTable *>::const_iterator e, Xapian::docid last_docid)
string str(int value)
Convert int to std::string.
void erase()
Erase this table from disk.
bool empty() const
Return true if there are no entries in the table.
void set_block_size(unsigned int block_size_)
Set the block size.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
void set_max_item_size(size_t block_capacity)
Set the maximum item size given the block capacity.
C++ class definition for chert database.
MergeCursor(ChertTable *in)
void commit(chert_revision_number_t revision, int changes_fd=-1, const std::string *changes_tail=NULL)
Commit any outstanding changes to the table.
bool operator()(const PostlistCursor *a, const PostlistCursor *b) const
Return true if and only if a's key is strictly greater than b's key.
DatabaseCorruptError indicates database corruption was detected.
std::string pack_chert_postlist_key(const std::string &term)
void add(const std::string &key, std::string tag, bool already_compressed=false)
Add a key/tag pair to the table, replacing any existing pair with the same key.
static void merge_docid_keyed(ChertTable *out, const vector< ChertTable *> &inputs, const vector< Xapian::docid > &offset)
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
#define CHERT_DEFAULT_BLOCK_SIZE
The default block size to use in a B-tree table.
bool after_end() const
Determine whether cursor is off the end of table.
ChertTermListTable termlist_table
Table storing term lists.
string current_key
Current key pointed to by cursor.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
void pack_string(std::string &s, const std::string &value)
Append an encoded std::string to a string.
A backend designed for efficient indexing and retrieval, using compressed posting lists and a btree storage scheme.
compaction_level
Compaction level.
bool read_tag(bool keep_compressed=false)
Read the tag from the table and store it in current_tag.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
const int DBCOMPACT_MULTIPASS
If merging more than 3 databases, merge the postlists in multiple passes.
<unistd.h>, but with compat.
void flush_db()
Flush any outstanding changes to the DB file of the table.
reason lock(bool exclusive, bool wait, std::string &explanation)
Attempt to obtain the lock.
off_t file_size(const char *path)
Returns the size of a file.
void append(const std::string &word)
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
const int DBCOMPACT_SINGLE_FILE
Produce a single-file database.
bool find_entry(const string &key)
Position the cursor on the highest entry with key <= key.
static void merge_spellings(ChertTable *out, vector< ChertTable *>::const_iterator b, vector< ChertTable *>::const_iterator e)