45 #include "../byte_length_strings.h"
46 #include "../prefix_compressed_strings.h"
57 return key.size() == 1 && key[0] ==
'\0';
63 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xc0';
69 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xd0';
75 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xd8';
81 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xe0';
111 const char * p = key.data();
112 const char * end = p + key.length();
122 key.assign(
"\0\xd8", 2);
131 const char * d = key.data();
132 const char * e = d + key.size();
151 tag.erase(0, d - tag.data());
154 size_t tmp = d - key.data();
173 if (a->
key > b->
key)
return true;
174 if (a->
key != b->
key)
return false;
181 const string & lbound,
const string & ubound)
189 if (lbound != ubound) value += ubound;
195 ChertTable * out, vector<Xapian::docid>::const_iterator offset,
196 vector<ChertTable*>::const_iterator b,
197 vector<ChertTable*>::const_iterator e,
204 priority_queue<PostlistCursor *, vector<PostlistCursor *>,
PostlistCursorGt> pq;
205 for ( ; b != e; ++b, ++offset) {
218 const char * data = cur->
tag.data();
219 const char * end = data + cur->
tag.size();
226 if (!
unpack_uint(&data, end, &doclen_lbound_tmp)) {
229 doclen_lbound = min(doclen_lbound, doclen_lbound_tmp);
235 wdf_ubound = max(wdf_ubound, wdf_ubound_tmp);
238 if (!
unpack_uint(&data, end, &doclen_ubound_tmp)) {
241 doclen_ubound_tmp += wdf_ubound_tmp;
242 doclen_ubound = max(doclen_ubound, doclen_ubound_tmp);
248 tot_totlen += totlen;
249 if (tot_totlen < totlen) {
250 throw "totlen wrapped!";
264 if (doclen_lbound > doclen_ubound)
265 doclen_lbound = doclen_ubound;
270 pack_uint(tag, doclen_ubound - wdf_ubound);
272 out->
add(
string(1,
'\0'), tag);
279 while (!pq.empty()) {
281 const string& key = cur->
key;
284 if (key != last_key) {
286 if (
tags.size() > 1 && compactor) {
287 Assert(!last_key.empty());
291 const string & resolved_tag =
295 if (!resolved_tag.empty())
296 out->
add(last_key, resolved_tag);
298 Assert(!last_key.empty());
315 if (
tags.size() > 1 && compactor) {
316 Assert(!last_key.empty());
317 const string & resolved_tag =
321 if (!resolved_tag.empty())
322 out->
add(last_key, resolved_tag);
324 Assert(!last_key.empty());
333 string lbound, ubound;
335 while (!pq.empty()) {
337 const string& key = cur->
key;
339 if (key != last_key) {
350 const string & tag = cur->
tag;
352 const char * pos = tag.data();
353 const char * end = pos + tag.size();
365 size_t len = end - pos;
377 if (l < lbound) lbound = l;
378 if (u > ubound) ubound = u;
395 while (!pq.empty()) {
397 const string & key = cur->
key;
410 vector<pair<Xapian::docid, string> >
tags;
418 if (cur == NULL || cur->
key != last_key) {
424 string tag =
tags[0].second;
425 tag[0] = (
tags.size() == 1) ?
'1' :
'0';
427 out->
add(last_key, first_tag);
431 const char * p = last_key.data();
432 const char * end = p + last_key.size();
437 vector<pair<Xapian::docid, string> >::const_iterator i;
439 while (++i !=
tags.end()) {
441 tag[0] = (i + 1 ==
tags.end()) ?
'1' :
'0';
446 if (cur == NULL)
break;
463 find_entry(
string());
479 vector<ChertTable*>::const_iterator b,
480 vector<ChertTable*>::const_iterator e)
482 priority_queue<MergeCursor *, vector<MergeCursor *>,
CursorGt> pq;
483 for ( ; b != e; ++b) {
490 while (!pq.empty()) {
495 if (pq.empty() || pq.top()->current_key > key) {
498 bool compressed = cur->
read_tag(
true);
514 vector<PrefixCompressedStringItor *>,
519 vector<MergeCursor *> vec;
520 vec.reserve(pq.size());
526 if (pq.empty() || pq.top()->current_key != key)
break;
533 while (!pqtag.empty()) {
537 if (word != lastword) {
549 vector<MergeCursor *>::const_iterator i;
550 for (i = vec.begin(); i != vec.end(); ++i) {
575 if (pq.empty() || pq.top()->current_key != key)
break;
588 vector<ChertTable*>::const_iterator b,
589 vector<ChertTable*>::const_iterator e)
591 priority_queue<MergeCursor *, vector<MergeCursor *>,
CursorGt> pq;
592 for ( ; b != e; ++b) {
599 while (!pq.empty()) {
604 if (pq.empty() || pq.top()->current_key > key) {
607 bool compressed = cur->
read_tag(
true);
623 vector<ByteLengthPrefixedStringItor *>,
625 vector<MergeCursor *> vec;
631 if (pq.empty() || pq.top()->current_key != key)
break;
637 while (!pqtag.empty()) {
640 if (**it != lastword) {
653 vector<MergeCursor *>::const_iterator i;
654 for (i = vec.begin(); i != vec.end(); ++i) {
670 vector<ChertTable *> tmp,
671 vector<Xapian::docid> off,
675 while (tmp.size() > 3) {
676 vector<ChertTable *> tmpout;
677 tmpout.reserve(tmp.size() / 2);
678 vector<Xapian::docid> newoff;
679 newoff.resize(tmp.size() / 2);
680 for (
unsigned int i = 0, j; i < tmp.size(); i = j) {
682 if (j == tmp.size() - 1) ++j;
684 string dest = tmpdir;
698 tmp.begin() + i, tmp.begin() + j,
701 for (
unsigned int k = i; k < j; ++k) {
702 unlink((tmp[k]->get_path() +
"DB").c_str());
703 unlink((tmp[k]->get_path() +
"baseA").c_str());
704 unlink((tmp[k]->get_path() +
"baseB").c_str());
709 tmpout.push_back(tmptab);
720 for (
size_t k = 0; k < tmp.size(); ++k) {
721 unlink((tmp[k]->get_path() +
"DB").c_str());
722 unlink((tmp[k]->get_path() +
"baseA").c_str());
723 unlink((tmp[k]->get_path() +
"baseB").c_str());
732 const vector<Xapian::docid> & offset)
734 for (
size_t i = 0; i < inputs.size(); ++i) {
738 if (in->
empty())
continue;
751 string msg =
"Bad key in ";
752 msg += inputs[i]->get_path();
761 key.append(d, e - d);
766 bool compressed = cur.
read_tag(
true);
778 const char * destdir,
779 const vector<Xapian::Database::Internal*> & sources,
780 const vector<Xapian::docid> & offset,
795 int compress_strategy;
800 static const table_list tables[] = {
803 {
"record", RECORD, Z_DEFAULT_STRATEGY,
false },
804 {
"termlist",
TERMLIST, Z_DEFAULT_STRATEGY,
false },
806 {
"spelling",
SPELLING, Z_DEFAULT_STRATEGY,
true },
807 {
"synonym",
SYNONYM, Z_DEFAULT_STRATEGY,
true }
809 const table_list * tables_end = tables +
810 (
sizeof(tables) /
sizeof(tables[0]));
818 for (
size_t i = 0; i != sources.size(); ++i) {
822 "Can't compact from a WritableDatabase with uncommitted "
823 "changes - either call commit() first, or create a new "
824 "Database object from the filename on disk";
829 if (block_size < 2048 || block_size > 65536 ||
830 (block_size & (block_size - 1)) != 0) {
843 vector<ChertTable *> tabs;
844 tabs.reserve(tables_end - tables);
845 for (
const table_list * t = tables; t < tables_end; ++t) {
854 string dest = destdir;
859 bool output_will_exist = !t->lazy;
863 bool bad_stat =
false;
867 vector<ChertTable*> inputs;
868 inputs.reserve(sources.size());
869 size_t inputs_present = 0;
870 for (
auto src : sources) {
899 in_size += db_size / 1024;
900 output_will_exist =
true;
902 }
else if (errno != ENOENT) {
905 output_will_exist =
true;
908 inputs.push_back(table);
912 if (t->type ==
TERMLIST && inputs_present != sources.size()) {
913 if (inputs_present != 0) {
915 string m =
str(inputs_present);
917 m +=
str(sources.size());
918 m +=
" inputs present, so suppressing output";
923 output_will_exist =
false;
926 if (!output_will_exist) {
928 compactor->
set_status(t->name,
"doesn't exist");
932 ChertTable out(t->name, dest,
false, t->compress_strategy, t->lazy);
945 if (multipass && inputs.size() > 3) {
950 inputs.begin(), inputs.end(),
974 out_size = db_size / 1024;
976 bad_stat = (errno != ENOENT);
981 compactor->
set_status(t->name,
"Done (couldn't stat all the DB files)");
984 if (out_size == in_size) {
985 status =
"Size unchanged (";
988 if (out_size < in_size) {
989 delta = in_size - out_size;
990 status =
"Reduced by ";
992 delta = out_size - in_size;
993 status =
"INCREASED by ";
996 status +=
str(100 * delta / in_size);
999 status +=
str(delta);
1001 status +=
str(in_size);
1004 status +=
str(out_size);
Interface to Btree cursors.
C++ class definition for chert database.
#define CHERT_DEFAULT_BLOCK_SIZE
The default block size to use in a B-tree table.
bool operator()(const PostlistCursor *a, const PostlistCursor *b) const
Return true if and only if a's key is strictly greater than b's key.
PostlistCursor(ChertTable *in, Xapian::docid offset_)
A cursor pointing to a position in a Btree table, for reading several entries in order,...
bool after_end() const
Determine whether cursor is off the end of table.
string current_tag
Current tag pointed to by cursor.
bool read_tag(bool keep_compressed=false)
Read the tag from the table and store it in current_tag.
bool find_entry(const string &key)
Position the cursor on the highest entry with key <= key.
bool next()
Advance to the next key.
string current_key
Current key pointed to by cursor.
A backend designed for efficient indexing and retrieval, using compressed posting lists and a btree s...
ChertTermListTable termlist_table
Table storing term lists.
virtual bool has_uncommitted_changes() const
Return true if there are uncommitted changes.
ChertSynonymTable synonym_table
Table storing synonym data.
ChertPostListTable postlist_table
Table storing posting lists.
ChertPositionListTable position_table
Table storing position lists.
ChertRecordTable record_table
Table storing records.
ChertSpellingTable spelling_table
Table storing spelling correction data.
static void compact(Xapian::Compactor *compactor, const char *destdir, const std::vector< Xapian::Database::Internal * > &sources, const std::vector< Xapian::docid > &offset, size_t block_size, Xapian::Compactor::compaction_level compaction, unsigned flags, Xapian::docid last_docid)
Class managing a Btree table in a Chert database.
void commit(chert_revision_number_t revision, int changes_fd=-1, const std::string *changes_tail=NULL)
Commit any outstanding changes to the table.
void set_max_item_size(size_t block_capacity)
Set the maximum item size given the block capacity.
void set_block_size(unsigned int block_size_)
Set the block size.
void flush_db()
Flush any outstanding changes to the DB file of the table.
bool empty() const
Return true if there are no entries in the table.
void set_full_compaction(bool parity)
void erase()
Erase this table from disk.
void add(const std::string &key, std::string tag, bool already_compressed=false)
Add a key/tag pair to the table, replacing any existing pair with the same key.
void create_and_open(unsigned int blocksize)
Create a new empty btree structure on disk and open it at the initial revision.
reason lock(bool exclusive, bool wait, std::string &explanation)
Attempt to obtain the lock.
void throw_databaselockerror(FlintLock::reason why, const std::string &db_dir, const std::string &explanation) const
Throw Xapian::DatabaseLockError.
void append(const std::string &word)
Compact a database, or merge and compact several.
virtual void set_status(const std::string &table, const std::string &status)
Update progress.
compaction_level
Compaction level.
@ FULLER
Allow oversize items to save more space (not recommended if you ever plan to update the compacted dat...
@ STANDARD
Don't split items unnecessarily.
virtual std::string resolve_duplicate_metadata(const std::string &key, size_t num_tags, const std::string tags[])
Resolve multiple user metadata entries with the same key.
DatabaseCorruptError indicates database corruption was detected.
InvalidOperationError indicates the API was used in an invalid way.
RangeError indicates an attempt to access outside the bounds of a container.
Compact a database, or merge and compact several.
Constants in the Xapian namespace.
Hierarchy of classes which Xapian can throw as exceptions.
Utility functions for testing files.
off_t file_size(const char *path)
Returns the size of a file.
static void merge_synonyms(ChertTable *out, vector< ChertTable * >::const_iterator b, vector< ChertTable * >::const_iterator e)
static bool is_doclenchunk_key(const string &key)
static bool is_metainfo_key(const string &key)
static void merge_docid_keyed(ChertTable *out, const vector< ChertTable * > &inputs, const vector< Xapian::docid > &offset)
static void merge_postlists(Xapian::Compactor *compactor, ChertTable *out, vector< Xapian::docid >::const_iterator offset, vector< ChertTable * >::const_iterator b, vector< ChertTable * >::const_iterator e, Xapian::docid last_docid)
static bool is_valuechunk_key(const string &key)
static string encode_valuestats(Xapian::doccount freq, const string &lbound, const string &ubound)
static void multimerge_postlists(Xapian::Compactor *compactor, ChertTable *out, const char *tmpdir, vector< ChertTable * > tmp, vector< Xapian::docid > off, Xapian::docid last_docid)
static bool is_valuestats_key(const string &key)
static bool is_user_metadata_key(const string &key)
static void merge_spellings(ChertTable *out, vector< ChertTable * >::const_iterator b, vector< ChertTable * >::const_iterator e)
string str(int value)
Convert int to std::string.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
const int DBCOMPACT_MULTIPASS
If merging more than 3 databases, merge the postlists in multiple passes.
unsigned valueno
The number for a value slot in a document.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
const int DBCOMPACT_SINGLE_FILE
Produce a single-file database.
Pack types into strings and unpack them again.
std::string pack_chert_postlist_key(const std::string &term)
void C_pack_uint_preserving_sort(std::string &s, U value)
Append an encoded unsigned integer to a string, preserving the sort order.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
bool C_unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
void pack_string(std::string &s, const std::string &value)
Append an encoded std::string to a string.
<unistd.h>, but with compat.
bool operator()(const ChertCursor *a, const ChertCursor *b) const
Return true if and only if a's key is strictly greater than b's key.
MergeCursor(ChertTable *in)