00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "safeerrno.h"
00024
00025 #include <fstream>
00026 #include <iostream>
00027 #include <queue>
00028
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/types.h>
00032 #include "utils.h"
00033
00034 #include "flint_table.h"
00035 #include "flint_cursor.h"
00036 #include "flint_utils.h"
00037
00038 #include <xapian.h>
00039
00040 #include "gnu_getopt.h"
00041
00042 using namespace std;
00043
00044 #define PROG_NAME "xapian-compact"
00045 #define PROG_DESC "Compact a flint database, or merge and compact several"
00046
00047 #define OPT_HELP 1
00048 #define OPT_VERSION 2
00049 #define OPT_NO_RENUMBER 3
00050
00051 static void show_usage() {
00052 cout << "Usage: "PROG_NAME" [OPTIONS] SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
00053 "Options:\n"
00054 " -b, --blocksize Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
00055 " (must be between 2K and 64K and a power of 2, default 8K)\n"
00056 " -n, --no-full Disable full compaction\n"
00057 " -F, --fuller Enable fuller compaction (not recommended if you plan to\n"
00058 " update the compacted database)\n"
00059 " -m, --multipass If merging more than 3 databases, merge the postlists in\n"
00060 " multiple passes (which is generally faster but requires\n"
00061 " more disk space for temporary files)\n"
00062 " --no-renumber Preserve the numbering of document ids (useful if you have\n"
00063 " external references to them, or have set them to match\n"
00064 " unique ids from an external source). Currently this\n"
00065 " option isn't supported when merging databases.\n"
00066 " --help display this help and exit\n"
00067 " --version output version information and exit" << endl;
00068 }
00069
/// Return true iff @a key is exactly one byte long and that byte is zero.
static inline bool
is_metainfo_key(const string & key)
{
    if (key.size() != 1) return false;
    return key[0] == '\0';
}
00075
/// Return true iff @a key has at least two bytes and starts with the byte
/// pair 0x00, 0xc0.
static inline bool
is_user_metadata_key(const string & key)
{
    if (key.size() <= 1) return false;
    if (key[0] != '\0') return false;
    return key[1] == '\xc0';
}
00081
00082 class PostlistCursor : private FlintCursor {
00083 Xapian::docid offset;
00084
00085 public:
00086 string key, tag;
00087 Xapian::docid firstdid;
00088 Xapian::termcount tf, cf;
00089
00090 PostlistCursor(FlintTable *in, Xapian::docid offset_)
00091 : FlintCursor(in), offset(offset_), firstdid(0)
00092 {
00093 find_entry("");
00094 next();
00095 }
00096
00097 ~PostlistCursor()
00098 {
00099 delete FlintCursor::get_table();
00100 }
00101
00102 bool next() {
00103 if (!FlintCursor::next()) return false;
00104
00105
00106 read_tag();
00107 key = current_key;
00108 tag = current_tag;
00109 tf = cf = 0;
00110 if (is_metainfo_key(key)) return true;
00111 if (is_user_metadata_key(key)) return true;
00112
00113
00114
00115 const char * d = key.data();
00116 const char * e = d + key.size();
00117 string tname;
00118 if (!unpack_string_preserving_sort(&d, e, tname))
00119 throw Xapian::DatabaseCorruptError("Bad postlist key");
00120 if (d == e) {
00121
00122 d = tag.data();
00123 e = d + tag.size();
00124 if (!unpack_uint(&d, e, &tf) ||
00125 !unpack_uint(&d, e, &cf) ||
00126 !unpack_uint(&d, e, &firstdid)) {
00127 throw Xapian::DatabaseCorruptError("Bad postlist tag");
00128 }
00129 ++firstdid;
00130 tag.erase(0, d - tag.data());
00131 } else {
00132
00133 size_t tmp = d - key.data();
00134 if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00135 throw Xapian::DatabaseCorruptError("Bad postlist key");
00136 key.erase(tmp);
00137 }
00138 firstdid += offset;
00139 return true;
00140 }
00141 };
00142
00143 class PostlistCursorGt {
00144 public:
00147 bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00148 if (a->key > b->key) return true;
00149 if (a->key != b->key) return false;
00150 return (a->firstdid > b->firstdid);
00151 }
00152 };
00153
00154 static void
00155 merge_postlists(FlintTable * out, vector<Xapian::docid>::const_iterator offset,
00156 vector<string>::const_iterator b, vector<string>::const_iterator e,
00157 Xapian::docid tot_off)
00158 {
00159 flint_totlen_t tot_totlen = 0;
00160 priority_queue<PostlistCursor *, vector<PostlistCursor *>, PostlistCursorGt> pq;
00161 for ( ; b != e; ++b, ++offset) {
00162 FlintTable *in = new FlintTable(*b, true);
00163 in->open();
00164 if (in->empty()) {
00165
00166 delete in;
00167 continue;
00168 }
00169
00170
00171
00172 PostlistCursor * cur = new PostlistCursor(in, *offset);
00173
00174
00175
00176
00177 if (is_metainfo_key(cur->key)) {
00178 const char * data = cur->tag.data();
00179 const char * end = data + cur->tag.size();
00180 Xapian::docid dummy_did = 0;
00181 if (!unpack_uint(&data, end, &dummy_did)) {
00182 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00183 }
00184 flint_totlen_t totlen = 0;
00185 if (!unpack_uint_last(&data, end, &totlen)) {
00186 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00187 }
00188 tot_totlen += totlen;
00189 if (tot_totlen < totlen) {
00190 throw "totlen wrapped!";
00191 }
00192 }
00193 if (cur->next()) {
00194 pq.push(cur);
00195 } else {
00196 delete cur;
00197 }
00198 }
00199
00200 {
00201 string tag = pack_uint(tot_off);
00202 tag += pack_uint_last(tot_totlen);
00203 out->add(string("", 1), tag);
00204 }
00205
00206 string last_key;
00207 {
00208
00209 string last_tag;
00210 while (!pq.empty()) {
00211 PostlistCursor * cur = pq.top();
00212 const string& key = cur->key;
00213 if (!is_user_metadata_key(key)) break;
00214
00215 const string & tag = cur->tag;
00216 if (key == last_key) {
00217 if (tag != last_tag)
00218 cerr << "Warning: duplicate user metadata key with different tag value - picking arbitrary tag value" << endl;
00219 } else {
00220 out->add(key, tag);
00221 last_key = key;
00222 last_tag = tag;
00223 }
00224
00225 pq.pop();
00226 if (cur->next()) {
00227 pq.push(cur);
00228 } else {
00229 delete cur;
00230 }
00231 }
00232 }
00233
00234 Xapian::termcount tf = 0, cf = 0;
00235 vector<pair<Xapian::docid, string> > tags;
00236 while (true) {
00237 PostlistCursor * cur = NULL;
00238 if (!pq.empty()) {
00239 cur = pq.top();
00240 pq.pop();
00241 }
00242 Assert(cur == NULL || !is_user_metadata_key(cur->key));
00243 if (cur == NULL || cur->key != last_key) {
00244 if (!tags.empty()) {
00245 string first_tag = pack_uint(tf);
00246 first_tag += pack_uint(cf);
00247 first_tag += pack_uint(tags[0].first - 1);
00248 string tag = tags[0].second;
00249 tag[0] = (tags.size() == 1) ? '1' : '0';
00250 first_tag += tag;
00251 out->add(last_key, first_tag);
00252 vector<pair<Xapian::docid, string> >::const_iterator i;
00253 i = tags.begin();
00254 while (++i != tags.end()) {
00255 string key = last_key;
00256 key += pack_uint_preserving_sort(i->first);
00257 tag = i->second;
00258 tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00259 out->add(key, tag);
00260 }
00261 }
00262 tags.clear();
00263 if (cur == NULL) break;
00264 tf = cf = 0;
00265 last_key = cur->key;
00266 }
00267 tf += cur->tf;
00268 cf += cur->cf;
00269 tags.push_back(make_pair(cur->firstdid, cur->tag));
00270 if (cur->next()) {
00271 pq.push(cur);
00272 } else {
00273 delete cur;
00274 }
00275 }
00276 }
00277
00278 struct MergeCursor : public FlintCursor {
00279 MergeCursor(FlintTable *in) : FlintCursor(in) {
00280 find_entry("");
00281 next();
00282 }
00283
00284 ~MergeCursor() {
00285 delete FlintCursor::get_table();
00286 }
00287 };
00288
00289 struct CursorGt {
00291 bool operator()(const FlintCursor *a, const FlintCursor *b) {
00292 if (b->after_end()) return false;
00293 if (a->after_end()) return true;
00294 return (a->current_key > b->current_key);
00295 }
00296 };
00297
00298 #define MAGIC_XOR_VALUE 96
00299
00300
00301 class PrefixCompressedStringItor {
00302 const unsigned char * p;
00303 size_t left;
00304 string current;
00305
00306 PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
00307 const string ¤t_)
00308 : p(p_), left(left_), current(current_) { }
00309
00310 public:
00311 PrefixCompressedStringItor(const std::string & s)
00312 : p(reinterpret_cast<const unsigned char *>(s.data())),
00313 left(s.size()) {
00314 if (left) {
00315 operator++();
00316 } else {
00317 p = NULL;
00318 }
00319 }
00320
00321 const string & operator*() const {
00322 return current;
00323 }
00324
00325 PrefixCompressedStringItor operator++(int) {
00326 const unsigned char * old_p = p;
00327 size_t old_left = left;
00328 string old_current = current;
00329 operator++();
00330 return PrefixCompressedStringItor(old_p, old_left, old_current);
00331 }
00332
00333 PrefixCompressedStringItor & operator++() {
00334 if (left == 0) {
00335 p = NULL;
00336 } else {
00337 if (!current.empty()) {
00338 current.resize(*p++ ^ MAGIC_XOR_VALUE);
00339 --left;
00340 }
00341 size_t add;
00342 if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
00343 throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
00344 current.append(reinterpret_cast<const char *>(p + 1), add);
00345 p += add + 1;
00346 left -= add + 1;
00347 }
00348 return *this;
00349 }
00350
00351 bool at_end() const {
00352 return p == NULL;
00353 }
00354 };
00355
00356
00357 class PrefixCompressedStringWriter {
00358 string current;
00359 string & out;
00360
00361 public:
00362 PrefixCompressedStringWriter(string & out_) : out(out_) { }
00363
00364 void append(const string & word) {
00365
00366
00367 if (!current.empty()) {
00368 size_t len = min(current.size(), word.size());
00369 size_t i;
00370 for (i = 0; i < len; ++i) {
00371 if (current[i] != word[i]) break;
00372 }
00373 out += char(i ^ MAGIC_XOR_VALUE);
00374 out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
00375 out.append(word.data() + i, word.size() - i);
00376 } else {
00377 out += char(word.size() ^ MAGIC_XOR_VALUE);
00378 out += word;
00379 }
00380 current = word;
00381 }
00382 };
00383
00384 struct PrefixCompressedStringItorGt {
00386 bool operator()(const PrefixCompressedStringItor *a,
00387 const PrefixCompressedStringItor *b) {
00388 return (**a > **b);
00389 }
00390 };
00391
00392 static void
00393 merge_spellings(FlintTable * out,
00394 vector<string>::const_iterator b,
00395 vector<string>::const_iterator e)
00396 {
00397 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00398 for ( ; b != e; ++b) {
00399 FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00400 in->open();
00401 if (!in->empty()) {
00402
00403
00404 pq.push(new MergeCursor(in));
00405 } else {
00406 delete in;
00407 }
00408 }
00409
00410 while (!pq.empty()) {
00411 MergeCursor * cur = pq.top();
00412 pq.pop();
00413
00414 string key = cur->current_key;
00415 if (pq.empty() || pq.top()->current_key > key) {
00416
00417
00418 bool compressed = cur->read_tag(true);
00419 out->add(key, cur->current_tag, compressed);
00420 if (cur->next()) {
00421 pq.push(cur);
00422 } else {
00423 delete cur;
00424 }
00425 continue;
00426 }
00427
00428
00429 string tag;
00430 if (key[0] != 'W') {
00431
00432
00433 priority_queue<PrefixCompressedStringItor *,
00434 vector<PrefixCompressedStringItor *>,
00435 PrefixCompressedStringItorGt> pqtag;
00436
00437
00438
00439 vector<MergeCursor *> vec;
00440 vec.reserve(pq.size());
00441
00442 while (true) {
00443 cur->read_tag();
00444 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
00445 vec.push_back(cur);
00446 if (pq.empty() || pq.top()->current_key != key) break;
00447 cur = pq.top();
00448 pq.pop();
00449 }
00450
00451 PrefixCompressedStringWriter wr(tag);
00452 string lastword;
00453 while (!pqtag.empty()) {
00454 PrefixCompressedStringItor * it = pqtag.top();
00455 string word = **it;
00456 if (word != lastword) {
00457 lastword = word;
00458 wr.append(lastword);
00459 }
00460 ++*it;
00461 pqtag.pop();
00462 if (!it->at_end()) {
00463 pqtag.push(it);
00464 } else {
00465 delete it;
00466 }
00467 }
00468
00469 vector<MergeCursor *>::const_iterator i;
00470 for (i = vec.begin(); i != vec.end(); ++i) {
00471 cur = *i;
00472 if (cur->next()) {
00473 pq.push(cur);
00474 } else {
00475 delete cur;
00476 }
00477 }
00478 } else {
00479
00480 Xapian::termcount tot_freq = 0;
00481 while (true) {
00482 cur->read_tag();
00483 Xapian::termcount freq;
00484 const char * p = cur->current_tag.data();
00485 const char * end = p + cur->current_tag.size();
00486 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
00487 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00488 }
00489 tot_freq += freq;
00490 if (cur->next()) {
00491 pq.push(cur);
00492 } else {
00493 delete cur;
00494 }
00495 if (pq.empty() || pq.top()->current_key != key) break;
00496 cur = pq.top();
00497 pq.pop();
00498 }
00499 tag = pack_uint_last(tot_freq);
00500 }
00501 out->add(key, tag);
00502 }
00503 }
00504
00505 class ByteLengthPrefixedStringItor {
00506 const unsigned char * p;
00507 size_t left;
00508
00509 ByteLengthPrefixedStringItor(const unsigned char * p_, size_t left_)
00510 : p(p_), left(left_) { }
00511
00512 public:
00513 ByteLengthPrefixedStringItor(const std::string & s)
00514 : p(reinterpret_cast<const unsigned char *>(s.data())),
00515 left(s.size()) { }
00516
00517 string operator*() const {
00518 size_t len = *p ^ MAGIC_XOR_VALUE;
00519 return string(reinterpret_cast<const char *>(p + 1), len);
00520 }
00521
00522 ByteLengthPrefixedStringItor operator++(int) {
00523 const unsigned char * old_p = p;
00524 size_t old_left = left;
00525 operator++();
00526 return ByteLengthPrefixedStringItor(old_p, old_left);
00527 }
00528
00529 ByteLengthPrefixedStringItor & operator++() {
00530 if (!left) {
00531 throw Xapian::DatabaseCorruptError("Bad synonym data (none left)");
00532 }
00533 size_t add = (*p ^ MAGIC_XOR_VALUE) + 1;
00534 if (left < add) {
00535 throw Xapian::DatabaseCorruptError("Bad synonym data (too little left)");
00536 }
00537 p += add;
00538 left -= add;
00539 return *this;
00540 }
00541
00542 bool at_end() const {
00543 return left == 0;
00544 }
00545 };
00546
00547 struct ByteLengthPrefixedStringItorGt {
00549 bool operator()(const ByteLengthPrefixedStringItor *a,
00550 const ByteLengthPrefixedStringItor *b) {
00551 return (**a > **b);
00552 }
00553 };
00554
00555 static void
00556 merge_synonyms(FlintTable * out,
00557 vector<string>::const_iterator b,
00558 vector<string>::const_iterator e)
00559 {
00560 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00561 for ( ; b != e; ++b) {
00562 FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00563 in->open();
00564 if (!in->empty()) {
00565
00566
00567 pq.push(new MergeCursor(in));
00568 } else {
00569 delete in;
00570 }
00571 }
00572
00573 while (!pq.empty()) {
00574 MergeCursor * cur = pq.top();
00575 pq.pop();
00576
00577 string key = cur->current_key;
00578 if (pq.top()->current_key > key) {
00579
00580
00581 bool compressed = cur->read_tag(true);
00582 out->add(key, cur->current_tag, compressed);
00583 if (cur->next()) {
00584 pq.push(cur);
00585 } else {
00586 delete cur;
00587 }
00588 continue;
00589 }
00590
00591
00592 string tag;
00593
00594
00595
00596 priority_queue<ByteLengthPrefixedStringItor *,
00597 vector<ByteLengthPrefixedStringItor *>,
00598 ByteLengthPrefixedStringItorGt> pqtag;
00599 vector<MergeCursor *> vec;
00600
00601 while (true) {
00602 cur->read_tag();
00603 pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
00604 vec.push_back(cur);
00605 if (pq.empty() || pq.top()->current_key != key) break;
00606 cur = pq.top();
00607 pq.pop();
00608 }
00609
00610 string lastword;
00611 while (!pqtag.empty()) {
00612 ByteLengthPrefixedStringItor * it = pqtag.top();
00613 if (**it != lastword) {
00614 lastword = **it;
00615 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
00616 tag += lastword;
00617 }
00618 ++*it;
00619 pqtag.pop();
00620 if (!it->at_end()) {
00621 pqtag.push(it);
00622 } else {
00623 delete it;
00624 }
00625 }
00626
00627 vector<MergeCursor *>::const_iterator i;
00628 for (i = vec.begin(); i != vec.end(); ++i) {
00629 cur = *i;
00630 if (cur->next()) {
00631 pq.push(cur);
00632 } else {
00633 delete cur;
00634 }
00635 }
00636
00637 out->add(key, tag);
00638 }
00639 }
00640
00641 static void
00642 multimerge_postlists(FlintTable * out, const char * tmpdir,
00643 Xapian::docid tot_off,
00644 vector<string> tmp, vector<Xapian::docid> off)
00645 {
00646 unsigned int c = 0;
00647 while (tmp.size() > 3) {
00648 vector<string> tmpout;
00649 tmpout.reserve(tmp.size() / 2);
00650 vector<Xapian::docid> newoff;
00651 newoff.resize(tmp.size() / 2);
00652 for (unsigned int i = 0, j; i < tmp.size(); i = j) {
00653 j = i + 2;
00654 if (j == tmp.size() - 1) ++j;
00655
00656 string dest = tmpdir;
00657 char buf[64];
00658 sprintf(buf, "/tmp%u_%u.", c, i / 2);
00659 dest += buf;
00660
00661
00662
00663 FlintTable tmptab(dest, false);
00664
00665 tmptab.create_and_open(65536);
00666
00667 merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0);
00668 if (c > 0) {
00669 for (unsigned int k = i; k < j; ++k) {
00670 unlink((tmp[k] + "DB").c_str());
00671 unlink((tmp[k] + "baseA").c_str());
00672 unlink((tmp[k] + "baseB").c_str());
00673 }
00674 }
00675 tmpout.push_back(dest);
00676 tmptab.commit(1);
00677 }
00678 swap(tmp, tmpout);
00679 swap(off, newoff);
00680 ++c;
00681 }
00682 merge_postlists(out, off.begin(), tmp.begin(), tmp.end(), tot_off);
00683 if (c > 0) {
00684 for (size_t k = 0; k < tmp.size(); ++k) {
00685 unlink((tmp[k] + "DB").c_str());
00686 unlink((tmp[k] + "baseA").c_str());
00687 unlink((tmp[k] + "baseB").c_str());
00688 }
00689 }
00690 }
00691
00692 static void
00693 merge_docid_keyed(FlintTable *out, const vector<string> & inputs,
00694 const vector<Xapian::docid> & offset, bool lazy)
00695 {
00696 for (size_t i = 0; i < inputs.size(); ++i) {
00697 Xapian::docid off = offset[i];
00698
00699 FlintTable in(inputs[i], true, DONT_COMPRESS, lazy);
00700 in.open();
00701 if (in.empty()) continue;
00702
00703 FlintCursor cur(&in);
00704 cur.find_entry("");
00705
00706 string key;
00707 while (cur.next()) {
00708
00709 if (off) {
00710 Xapian::docid did;
00711 const char * d = cur.current_key.data();
00712 const char * e = d + cur.current_key.size();
00713 if (!unpack_uint_preserving_sort(&d, e, &did)) {
00714 string msg = "Bad key in ";
00715 msg += inputs[i];
00716 throw Xapian::DatabaseCorruptError(msg);
00717 }
00718 did += off;
00719 key = pack_uint_preserving_sort(did);
00720 if (d != e) {
00721
00722 key.append(d, e - d);
00723 }
00724 } else {
00725 key = cur.current_key;
00726 }
00727 bool compressed = cur.read_tag(true);
00728 out->add(key, cur.current_tag, compressed);
00729 }
00730 }
00731 }
00732
00733 int
00734 main(int argc, char **argv)
00735 {
00736 const char * opts = "b:nFm";
00737 const struct option long_opts[] = {
00738 {"fuller", no_argument, 0, 'F'},
00739 {"no-full", no_argument, 0, 'n'},
00740 {"multipass", no_argument, 0, 'm'},
00741 {"blocksize", required_argument, 0, 'b'},
00742 {"no-renumber", no_argument, 0, OPT_NO_RENUMBER},
00743 {"help", no_argument, 0, OPT_HELP},
00744 {"version", no_argument, 0, OPT_VERSION},
00745 {NULL, 0, 0, 0}
00746 };
00747
00748 enum { STANDARD, FULL, FULLER } compaction = FULL;
00749 size_t block_size = 8192;
00750 bool multipass = false;
00751 bool renumber = true;
00752
00753 int c;
00754 while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
00755 switch (c) {
00756 case 'b': {
00757 char *p;
00758 block_size = strtoul(optarg, &p, 10);
00759 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00760 ++p;
00761 block_size *= 1024;
00762 }
00763 if (*p || block_size < 2048 || block_size > 65536 ||
00764 (block_size & (block_size - 1)) != 0) {
00765 cerr << PROG_NAME": Bad value '" << optarg
00766 << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00767 << endl;
00768 exit(1);
00769 }
00770 break;
00771 }
00772 case 'n':
00773 compaction = STANDARD;
00774 break;
00775 case 'F':
00776 compaction = FULLER;
00777 break;
00778 case 'm':
00779 multipass = true;
00780 break;
00781 case OPT_NO_RENUMBER:
00782 renumber = false;
00783 break;
00784 case OPT_HELP:
00785 cout << PROG_NAME" - "PROG_DESC"\n\n";
00786 show_usage();
00787 exit(0);
00788 case OPT_VERSION:
00789 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00790 exit(0);
00791 default:
00792 show_usage();
00793 exit(1);
00794 }
00795 }
00796
00797 if (argc - optind < 2) {
00798 show_usage();
00799 exit(1);
00800 }
00801
00802 if (!renumber && argc - optind > 2) {
00803 cout << argv[0]
00804 << ": --no-renumber isn't currently supported when merging databases."
00805 << endl;
00806 exit(1);
00807 }
00808
00809
00810 const char *destdir = argv[argc - 1];
00811
00812 try {
00813 vector<string> sources;
00814 vector<Xapian::docid> offset;
00815 sources.reserve(argc - 1 - optind);
00816 offset.reserve(argc - 1 - optind);
00817 Xapian::docid tot_off = 0;
00818 for (int i = optind; i < argc - 1; ++i) {
00819 const char *srcdir = argv[i];
00820
00821 if (strcmp(srcdir, destdir) == 0) {
00822 cout << argv[0]
00823 << ": destination may not be the same as any source directory."
00824 << endl;
00825 exit(1);
00826 }
00827
00828 struct stat sb;
00829 if (stat(string(srcdir) + "/iamflint", &sb) != 0) {
00830 cout << argv[0] << ": '" << srcdir
00831 << "' is not a flint database directory" << endl;
00832 exit(1);
00833 }
00834
00835 Xapian::Database db(srcdir);
00836 Xapian::docid last = 0;
00837
00838
00839
00840 if (db.get_doccount() != 0) {
00841 last = db.get_lastdocid();
00842
00843 if (renumber) {
00844
00845
00846 Xapian::PostingIterator it = db.postlist_begin("");
00847
00848
00849 if (it != db.postlist_end("")) {
00850
00851
00852 tot_off -= (*it - 1);
00853 }
00854
00855
00856
00857
00858 }
00859 }
00860 offset.push_back(tot_off);
00861 tot_off += last;
00862
00863 sources.push_back(string(srcdir) + '/');
00864 }
00865
00866
00867 if (mkdir(destdir, 0755) < 0) {
00868
00869
00870
00871 if (errno == EEXIST) {
00872 struct stat sb;
00873 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00874 errno = 0;
00875 else
00876 errno = EEXIST;
00877 }
00878 if (errno) {
00879 cerr << argv[0] << ": cannot create directory '"
00880 << destdir << "': " << strerror(errno) << endl;
00881 exit(1);
00882 }
00883 }
00884
00885 enum table_type {
00886 POSTLIST, RECORD, TERMLIST, POSITION, VALUE, SPELLING, SYNONYM
00887 };
00888 struct table_list {
00889
00890 const char * name;
00891
00892 table_type type;
00893
00894 int compress_strategy;
00895
00896 bool lazy;
00897 };
00898
00899 static const table_list tables[] = {
00900
00901 { "postlist", POSTLIST, DONT_COMPRESS, false },
00902 { "record", RECORD, Z_DEFAULT_STRATEGY, false },
00903 { "termlist", TERMLIST, Z_DEFAULT_STRATEGY, false },
00904 { "position", POSITION, DONT_COMPRESS, true },
00905 { "value", VALUE, DONT_COMPRESS, true },
00906 { "spelling", SPELLING, Z_DEFAULT_STRATEGY, true },
00907 { "synonym", SYNONYM, Z_DEFAULT_STRATEGY, true }
00908 };
00909 const table_list * tables_end = tables +
00910 (sizeof(tables) / sizeof(tables[0]));
00911
00912 for (const table_list * t = tables; t < tables_end; ++t) {
00913
00914
00915
00916
00917 cout << t->name << " ..." << flush;
00918
00919 string dest = destdir;
00920 dest += '/';
00921 dest += t->name;
00922 dest += '.';
00923
00924 FlintTable out(dest, false, t->compress_strategy, t->lazy);
00925 if (!t->lazy) {
00926 out.create_and_open(block_size);
00927 } else {
00928 out.erase();
00929 out.set_block_size(block_size);
00930 }
00931
00932 out.set_full_compaction(compaction != STANDARD);
00933 if (compaction == FULLER) out.set_max_item_size(1);
00934
00935
00936
00937 bool bad_stat = false;
00938
00939 off_t in_size = 0;
00940
00941 vector<string> inputs;
00942 inputs.reserve(sources.size());
00943 for (vector<string>::const_iterator src = sources.begin();
00944 src != sources.end(); ++src) {
00945 string s(*src);
00946 s += t->name;
00947 s += '.';
00948
00949 struct stat sb;
00950 if (stat(s + "DB", &sb) == 0) {
00951 in_size += sb.st_size / 1024;
00952 } else {
00953
00954 bad_stat = (errno != ENOENT);
00955 }
00956 inputs.push_back(s);
00957 }
00958
00959 if (inputs.empty()) continue;
00960
00961 switch (t->type) {
00962 case POSTLIST:
00963 if (multipass && inputs.size() > 3) {
00964 multimerge_postlists(&out, destdir, tot_off,
00965 inputs, offset);
00966 } else {
00967 merge_postlists(&out, offset.begin(),
00968 inputs.begin(), inputs.end(),
00969 tot_off);
00970 }
00971 break;
00972 case SPELLING:
00973 merge_spellings(&out, inputs.begin(), inputs.end());
00974 break;
00975 case SYNONYM:
00976 merge_synonyms(&out, inputs.begin(), inputs.end());
00977 break;
00978 default:
00979
00980 merge_docid_keyed(&out, inputs, offset, t->lazy);
00981 break;
00982 }
00983
00984
00985 out.commit(1);
00986
00987 cout << '\r' << t->name << ": ";
00988 off_t out_size = 0;
00989 if (!bad_stat) {
00990 struct stat sb;
00991 if (stat(dest + "DB", &sb) == 0) {
00992 out_size = sb.st_size / 1024;
00993 } else {
00994 bad_stat = (errno != ENOENT);
00995 }
00996 }
00997 if (bad_stat) {
00998 cout << "Done (couldn't stat all the DB files)";
00999 } else {
01000 if (out_size == in_size) {
01001 cout << "Size unchanged (";
01002 } else if (out_size < in_size) {
01003 cout << "Reduced by "
01004 << 100 * double(in_size - out_size) / in_size << "% "
01005 << in_size - out_size << "K (" << in_size << "K -> ";
01006 } else {
01007 cout << "INCREASED by "
01008 << 100 * double(out_size - in_size) / in_size << "% "
01009 << out_size - in_size << "K (" << in_size << "K -> ";
01010 }
01011 cout << out_size << "K)";
01012 }
01013 cout << endl;
01014 }
01015
01016
01017
01018
01019 string dest = destdir;
01020 dest += "/iamflint.tmp";
01021
01022 string src(argv[optind]);
01023 src += "/iamflint";
01024
01025 ifstream input(src.c_str());
01026 char buf[1024];
01027 input.read(buf, sizeof(buf));
01028 if (!input.eof()) {
01029 if (!input) {
01030 cerr << argv[0] << ": error reading '" << src << "': "
01031 << strerror(errno) << endl;
01032 exit(1);
01033 }
01034
01035 cerr << argv[0] << ": version file '" << src << "' too large!"
01036 << endl;
01037 exit(1);
01038 }
01039 ofstream output(dest.c_str());
01040 if (!output.write(buf, input.gcount())) {
01041 cerr << argv[0] << ": error writing '" << dest << "': "
01042 << strerror(errno) << endl;
01043 exit(1);
01044 }
01045 output.close();
01046
01047 string version = destdir;
01048 version += "/iamflint";
01049 if (rename(dest.c_str(), version.c_str()) == -1) {
01050 cerr << argv[0] << ": cannot rename '" << dest << "' to '"
01051 << version << "': " << strerror(errno) << endl;
01052 exit(1);
01053 }
01054 } catch (const Xapian::Error &error) {
01055 cerr << argv[0] << ": " << error.get_description() << endl;
01056 exit(1);
01057 } catch (const char * msg) {
01058 cerr << argv[0] << ": " << msg << endl;
01059 exit(1);
01060 }
01061 }