bin/xapian-compact.cc

Go to the documentation of this file.
00001 /* xapian-compact.cc: Compact a flint database, or merge and compact several.
00002  *
00003  * Copyright (C) 2004,2005,2006,2007,2008 Olly Betts
00004  *
00005  * This program is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU General Public License as
00007  * published by the Free Software Foundation; either version 2 of the
00008  * License, or (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00018  * USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include "safeerrno.h"
00024 
00025 #include <fstream>
00026 #include <iostream>
00027 #include <queue>
00028 
00029 #include <stdio.h> // for rename()
00030 #include <string.h>
00031 #include <sys/types.h>
00032 #include "utils.h"
00033 
00034 #include "flint_table.h"
00035 #include "flint_cursor.h"
00036 #include "flint_utils.h"
00037 
00038 #include <xapian.h>
00039 
00040 #include "gnu_getopt.h"
00041 
00042 using namespace std;
00043 
00044 #define PROG_NAME "xapian-compact"
00045 #define PROG_DESC "Compact a flint database, or merge and compact several"
00046 
00047 #define OPT_HELP 1
00048 #define OPT_VERSION 2
00049 #define OPT_NO_RENUMBER 3
00050 
00051 static void show_usage() {
00052     cout << "Usage: "PROG_NAME" [OPTIONS] SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
00053 "Options:\n"
00054 "  -b, --blocksize   Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
00055 "                    (must be between 2K and 64K and a power of 2, default 8K)\n"
00056 "  -n, --no-full     Disable full compaction\n"
00057 "  -F, --fuller      Enable fuller compaction (not recommended if you plan to\n"
00058 "                    update the compacted database)\n"
00059 "  -m, --multipass   If merging more than 3 databases, merge the postlists in\n"
00060 "                    multiple passes (which is generally faster but requires\n"
00061 "                    more disk space for temporary files)\n"
00062 "      --no-renumber Preserve the numbering of document ids (useful if you have\n"
00063 "                    external references to them, or have set them to match\n"
00064 "                    unique ids from an external source).  Currently this\n"
00065 "                    option isn't supported when merging databases.\n"
00066 "  --help            display this help and exit\n"
00067 "  --version         output version information and exit" << endl;
00068 }
00069 
/** Return true if @a key is exactly one zero byte (the METAINFO key). */
static inline bool
is_metainfo_key(const std::string & key)
{
    if (key.size() != 1) return false;
    return key[0] == '\0';
}
00075 
/** Return true if @a key is longer than one byte and starts with the two
 *  bytes 0x00 0xc0 (the prefix tested for user metadata entries).
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
00081 
00082 class PostlistCursor : private FlintCursor {
00083     Xapian::docid offset;
00084 
00085   public:
00086     string key, tag;
00087     Xapian::docid firstdid;
00088     Xapian::termcount tf, cf;
00089 
00090     PostlistCursor(FlintTable *in, Xapian::docid offset_)
00091         : FlintCursor(in), offset(offset_), firstdid(0)
00092     {
00093         find_entry("");
00094         next();
00095     }
00096 
00097     ~PostlistCursor()
00098     {
00099         delete FlintCursor::get_table();
00100     }
00101 
00102     bool next() {
00103         if (!FlintCursor::next()) return false;
00104         // We put all chunks into the non-initial chunk form here, then fix up
00105         // the first chunk for each term in the merged database as we merge.
00106         read_tag();
00107         key = current_key;
00108         tag = current_tag;
00109         tf = cf = 0;
00110         if (is_metainfo_key(key)) return true;
00111         if (is_user_metadata_key(key)) return true;
00112         // Adjust key if this is *NOT* an initial chunk.
00113         // key is: pack_string_preserving_sort(tname)
00114         // plus optionally: pack_uint_preserving_sort(did)
00115         const char * d = key.data();
00116         const char * e = d + key.size();
00117         string tname;
00118         if (!unpack_string_preserving_sort(&d, e, tname))
00119             throw Xapian::DatabaseCorruptError("Bad postlist key");
00120         if (d == e) {
00121             // This is an initial chunk for a term, so adjust tag header.
00122             d = tag.data();
00123             e = d + tag.size();
00124             if (!unpack_uint(&d, e, &tf) ||
00125                 !unpack_uint(&d, e, &cf) ||
00126                 !unpack_uint(&d, e, &firstdid)) {
00127                 throw Xapian::DatabaseCorruptError("Bad postlist tag");
00128             }
00129             ++firstdid;
00130             tag.erase(0, d - tag.data());
00131         } else {
00132             // Not an initial chunk, so adjust key.
00133             size_t tmp = d - key.data();
00134             if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00135                 throw Xapian::DatabaseCorruptError("Bad postlist key");
00136             key.erase(tmp);
00137         }
00138         firstdid += offset;
00139         return true;
00140     }
00141 };
00142 
00143 class PostlistCursorGt {
00144   public:
00147     bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00148         if (a->key > b->key) return true;
00149         if (a->key != b->key) return false;
00150         return (a->firstdid > b->firstdid);
00151     }
00152 };
00153 
00154 static void
00155 merge_postlists(FlintTable * out, vector<Xapian::docid>::const_iterator offset,
00156                 vector<string>::const_iterator b, vector<string>::const_iterator e,
00157                 Xapian::docid tot_off)
00158 {
00159     flint_totlen_t tot_totlen = 0;
00160     priority_queue<PostlistCursor *, vector<PostlistCursor *>, PostlistCursorGt> pq;
00161     for ( ; b != e; ++b, ++offset) {
00162         FlintTable *in = new FlintTable(*b, true);
00163         in->open();
00164         if (in->empty()) {
00165             // Skip empty tables.
00166             delete in;
00167             continue;
00168         }
00169 
00170         // PostlistCursor takes ownership of FlintTable in and is
00171         // responsible for deleting it.
00172         PostlistCursor * cur = new PostlistCursor(in, *offset);
00173         // Merge the METAINFO tags from each database into one.
00174         // They have a key consisting of a single zero byte.
00175         // They may be absent, if the database contains no documents.  If it
00176         // has user metadata we'll still get here.
00177         if (is_metainfo_key(cur->key)) {
00178             const char * data = cur->tag.data();
00179             const char * end = data + cur->tag.size();
00180             Xapian::docid dummy_did = 0;
00181             if (!unpack_uint(&data, end, &dummy_did)) {
00182                 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00183             }
00184             flint_totlen_t totlen = 0;
00185             if (!unpack_uint_last(&data, end, &totlen)) {
00186                 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00187             }
00188             tot_totlen += totlen;
00189             if (tot_totlen < totlen) {
00190                 throw "totlen wrapped!";
00191             }
00192         }
00193         if (cur->next()) {
00194             pq.push(cur);
00195         } else {
00196             delete cur;
00197         }
00198     }
00199 
00200     {
00201         string tag = pack_uint(tot_off);
00202         tag += pack_uint_last(tot_totlen);
00203         out->add(string("", 1), tag);
00204     }
00205 
00206     string last_key;
00207     {
00208         // Merge user metadata.
00209         string last_tag;
00210         while (!pq.empty()) {
00211             PostlistCursor * cur = pq.top();
00212             const string& key = cur->key;
00213             if (!is_user_metadata_key(key)) break;
00214 
00215             const string & tag = cur->tag;
00216             if (key == last_key) {
00217                 if (tag != last_tag)
00218                     cerr << "Warning: duplicate user metadata key with different tag value - picking arbitrary tag value" << endl;
00219             } else {
00220                 out->add(key, tag);
00221                 last_key = key;
00222                 last_tag = tag;
00223             }
00224 
00225             pq.pop();
00226             if (cur->next()) {
00227                 pq.push(cur);
00228             } else {
00229                 delete cur;
00230             }
00231         }
00232     }
00233 
00234     Xapian::termcount tf = 0, cf = 0; // Initialise to avoid warnings.
00235     vector<pair<Xapian::docid, string> > tags;
00236     while (true) {
00237         PostlistCursor * cur = NULL;
00238         if (!pq.empty()) {
00239             cur = pq.top();
00240             pq.pop();
00241         }
00242         Assert(cur == NULL || !is_user_metadata_key(cur->key));
00243         if (cur == NULL || cur->key != last_key) {
00244             if (!tags.empty()) {
00245                 string first_tag = pack_uint(tf);
00246                 first_tag += pack_uint(cf);
00247                 first_tag += pack_uint(tags[0].first - 1);
00248                 string tag = tags[0].second;
00249                 tag[0] = (tags.size() == 1) ? '1' : '0';
00250                 first_tag += tag;
00251                 out->add(last_key, first_tag);
00252                 vector<pair<Xapian::docid, string> >::const_iterator i;
00253                 i = tags.begin();
00254                 while (++i != tags.end()) {
00255                     string key = last_key;
00256                     key += pack_uint_preserving_sort(i->first);
00257                     tag = i->second;
00258                     tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00259                     out->add(key, tag);
00260                 }
00261             }
00262             tags.clear();
00263             if (cur == NULL) break;
00264             tf = cf = 0;
00265             last_key = cur->key;
00266         }
00267         tf += cur->tf;
00268         cf += cur->cf;
00269         tags.push_back(make_pair(cur->firstdid, cur->tag));
00270         if (cur->next()) {
00271             pq.push(cur);
00272         } else {
00273             delete cur;
00274         }
00275     }
00276 }
00277 
00278 struct MergeCursor : public FlintCursor {
00279     MergeCursor(FlintTable *in) : FlintCursor(in) {
00280         find_entry("");
00281         next();
00282     }
00283 
00284     ~MergeCursor() {
00285         delete FlintCursor::get_table();
00286     }
00287 };
00288 
00289 struct CursorGt {
00291     bool operator()(const FlintCursor *a, const FlintCursor *b) {
00292         if (b->after_end()) return false;
00293         if (a->after_end()) return true;
00294         return (a->current_key > b->current_key);
00295     }
00296 };
00297 
00298 #define MAGIC_XOR_VALUE 96
00299 
00300 // FIXME: copied from backends/flint/flint_spelling.cc.
00301 class PrefixCompressedStringItor {
00302     const unsigned char * p;
00303     size_t left;
00304     string current;
00305 
00306     PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
00307                                const string &current_)
00308         : p(p_), left(left_), current(current_) { }
00309 
00310   public:
00311     PrefixCompressedStringItor(const std::string & s)
00312         : p(reinterpret_cast<const unsigned char *>(s.data())),
00313           left(s.size()) {
00314         if (left) {
00315             operator++();
00316         } else {
00317             p = NULL;
00318         }
00319     }
00320 
00321     const string & operator*() const {
00322         return current;
00323     }
00324 
00325     PrefixCompressedStringItor operator++(int) {
00326         const unsigned char * old_p = p;
00327         size_t old_left = left;
00328         string old_current = current;
00329         operator++();
00330         return PrefixCompressedStringItor(old_p, old_left, old_current);
00331     }
00332 
00333     PrefixCompressedStringItor & operator++() {
00334         if (left == 0) {
00335             p = NULL;
00336         } else {
00337             if (!current.empty()) {
00338                 current.resize(*p++ ^ MAGIC_XOR_VALUE);
00339                 --left;
00340             }
00341             size_t add;
00342             if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
00343                 throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
00344             current.append(reinterpret_cast<const char *>(p + 1), add);
00345             p += add + 1;
00346             left -= add + 1;
00347         }
00348         return *this;
00349     }
00350 
00351     bool at_end() const {
00352         return p == NULL;
00353     }
00354 };
00355 
00356 // FIXME: copied from backends/flint/flint_spelling.cc.
00357 class PrefixCompressedStringWriter {
00358     string current;
00359     string & out;
00360 
00361   public:
00362     PrefixCompressedStringWriter(string & out_) : out(out_) { }
00363 
00364     void append(const string & word) {
00365         // If this isn't the first entry, see how much of the previous one
00366         // we can reuse.
00367         if (!current.empty()) {
00368             size_t len = min(current.size(), word.size());
00369             size_t i;
00370             for (i = 0; i < len; ++i) {
00371                 if (current[i] != word[i]) break;
00372             }
00373             out += char(i ^ MAGIC_XOR_VALUE);
00374             out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
00375             out.append(word.data() + i, word.size() - i);
00376         } else {
00377             out += char(word.size() ^ MAGIC_XOR_VALUE);
00378             out += word;
00379         }
00380         current = word;
00381     }
00382 };
00383 
00384 struct PrefixCompressedStringItorGt {
00386     bool operator()(const PrefixCompressedStringItor *a,
00387                     const PrefixCompressedStringItor *b) {
00388         return (**a > **b);
00389     }
00390 };
00391 
00392 static void
00393 merge_spellings(FlintTable * out,
00394                 vector<string>::const_iterator b,
00395                 vector<string>::const_iterator e)
00396 {
00397     priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00398     for ( ; b != e; ++b) {
00399         FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00400         in->open();
00401         if (!in->empty()) {
00402             // The MergeCursor takes ownership of FlintTable in and is
00403             // responsible for deleting it.
00404             pq.push(new MergeCursor(in));
00405         } else {
00406             delete in;
00407         }
00408     }
00409 
00410     while (!pq.empty()) {
00411         MergeCursor * cur = pq.top();
00412         pq.pop();
00413 
00414         string key = cur->current_key;
00415         if (pq.empty() || pq.top()->current_key > key) {
00416             // No need to merge the tags, just copy the (possibly compressed)
00417             // tag value.
00418             bool compressed = cur->read_tag(true);
00419             out->add(key, cur->current_tag, compressed);
00420             if (cur->next()) {
00421                 pq.push(cur);
00422             } else {
00423                 delete cur;
00424             }
00425             continue;
00426         }
00427 
00428         // Merge tag values with the same key:
00429         string tag;
00430         if (key[0] != 'W') {
00431             // We just want the union of words, so copy over the first instance
00432             // and skip any identical ones.
00433             priority_queue<PrefixCompressedStringItor *,
00434                            vector<PrefixCompressedStringItor *>,
00435                            PrefixCompressedStringItorGt> pqtag;
00436             // Stick all the MergeCursor pointers in a vector because their
00437             // current_tag members must remain valid while we're merging their
00438             // tags, but we need to call next() on them all afterwards.
00439             vector<MergeCursor *> vec;
00440             vec.reserve(pq.size());
00441 
00442             while (true) {
00443                 cur->read_tag();
00444                 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
00445                 vec.push_back(cur);
00446                 if (pq.empty() || pq.top()->current_key != key) break;
00447                 cur = pq.top();
00448                 pq.pop();
00449             }
00450 
00451             PrefixCompressedStringWriter wr(tag);
00452             string lastword;
00453             while (!pqtag.empty()) {
00454                 PrefixCompressedStringItor * it = pqtag.top();
00455                 string word = **it;
00456                 if (word != lastword) {
00457                     lastword = word;
00458                     wr.append(lastword);
00459                 }
00460                 ++*it;
00461                 pqtag.pop();
00462                 if (!it->at_end()) {
00463                     pqtag.push(it);
00464                 } else {
00465                     delete it;
00466                 }
00467             }
00468 
00469             vector<MergeCursor *>::const_iterator i;
00470             for (i = vec.begin(); i != vec.end(); ++i) {
00471                 cur = *i;
00472                 if (cur->next()) {
00473                     pq.push(cur);
00474                 } else {
00475                     delete cur;
00476                 }
00477             }
00478         } else {
00479             // We want to sum the frequencies from tags for the same key.
00480             Xapian::termcount tot_freq = 0;
00481             while (true) {
00482                 cur->read_tag();
00483                 Xapian::termcount freq;
00484                 const char * p = cur->current_tag.data();
00485                 const char * end = p + cur->current_tag.size();
00486                 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
00487                     throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00488                 }
00489                 tot_freq += freq;
00490                 if (cur->next()) {
00491                     pq.push(cur);
00492                 } else {
00493                     delete cur;
00494                 }
00495                 if (pq.empty() || pq.top()->current_key != key) break;
00496                 cur = pq.top();
00497                 pq.pop();
00498             }
00499             tag = pack_uint_last(tot_freq);
00500         }
00501         out->add(key, tag);
00502     }
00503 }
00504 
00505 class ByteLengthPrefixedStringItor {
00506     const unsigned char * p;
00507     size_t left;
00508 
00509     ByteLengthPrefixedStringItor(const unsigned char * p_, size_t left_)
00510         : p(p_), left(left_) { }
00511 
00512   public:
00513     ByteLengthPrefixedStringItor(const std::string & s)
00514         : p(reinterpret_cast<const unsigned char *>(s.data())),
00515           left(s.size()) { }
00516 
00517     string operator*() const {
00518         size_t len = *p ^ MAGIC_XOR_VALUE;
00519         return string(reinterpret_cast<const char *>(p + 1), len);
00520     }
00521 
00522     ByteLengthPrefixedStringItor operator++(int) {
00523         const unsigned char * old_p = p;
00524         size_t old_left = left;
00525         operator++();
00526         return ByteLengthPrefixedStringItor(old_p, old_left);
00527     }
00528 
00529     ByteLengthPrefixedStringItor & operator++() {
00530         if (!left) {
00531             throw Xapian::DatabaseCorruptError("Bad synonym data (none left)");
00532         }
00533         size_t add = (*p ^ MAGIC_XOR_VALUE) + 1;
00534         if (left < add) {
00535             throw Xapian::DatabaseCorruptError("Bad synonym data (too little left)");
00536         }
00537         p += add;
00538         left -= add;
00539         return *this;
00540     }
00541 
00542     bool at_end() const {
00543         return left == 0;
00544     }
00545 };
00546 
00547 struct ByteLengthPrefixedStringItorGt {
00549     bool operator()(const ByteLengthPrefixedStringItor *a,
00550                     const ByteLengthPrefixedStringItor *b) {
00551         return (**a > **b);
00552     }
00553 };
00554 
00555 static void
00556 merge_synonyms(FlintTable * out,
00557                vector<string>::const_iterator b,
00558                vector<string>::const_iterator e)
00559 {
00560     priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00561     for ( ; b != e; ++b) {
00562         FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00563         in->open();
00564         if (!in->empty()) {
00565             // The MergeCursor takes ownership of FlintTable in and is
00566             // responsible for deleting it.
00567             pq.push(new MergeCursor(in));
00568         } else {
00569             delete in;
00570         }
00571     }
00572 
00573     while (!pq.empty()) {
00574         MergeCursor * cur = pq.top();
00575         pq.pop();
00576 
00577         string key = cur->current_key;
00578         if (pq.top()->current_key > key) {
00579             // No need to merge the tags, just copy the (possibly compressed)
00580             // tag value.
00581             bool compressed = cur->read_tag(true);
00582             out->add(key, cur->current_tag, compressed);
00583             if (cur->next()) {
00584                 pq.push(cur);
00585             } else {
00586                 delete cur;
00587             }
00588             continue;
00589         }
00590 
00591         // Merge tag values with the same key:
00592         string tag;
00593 
00594         // We just want the union of words, so copy over the first instance
00595         // and skip any identical ones.
00596         priority_queue<ByteLengthPrefixedStringItor *,
00597                        vector<ByteLengthPrefixedStringItor *>,
00598                        ByteLengthPrefixedStringItorGt> pqtag;
00599         vector<MergeCursor *> vec;
00600 
00601         while (true) {
00602             cur->read_tag();
00603             pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
00604             vec.push_back(cur);
00605             if (pq.empty() || pq.top()->current_key != key) break;
00606             cur = pq.top();
00607             pq.pop();
00608         }
00609 
00610         string lastword;
00611         while (!pqtag.empty()) {
00612             ByteLengthPrefixedStringItor * it = pqtag.top();
00613             if (**it != lastword) {
00614                 lastword = **it;
00615                 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
00616                 tag += lastword;
00617             }
00618             ++*it;
00619             pqtag.pop();
00620             if (!it->at_end()) {
00621                 pqtag.push(it);
00622             } else {
00623                 delete it;
00624             }
00625         }
00626 
00627         vector<MergeCursor *>::const_iterator i;
00628         for (i = vec.begin(); i != vec.end(); ++i) {
00629             cur = *i;
00630             if (cur->next()) {
00631                 pq.push(cur);
00632             } else {
00633                 delete cur;
00634             }
00635         }
00636 
00637         out->add(key, tag);
00638     }
00639 }
00640 
00641 static void
00642 multimerge_postlists(FlintTable * out, const char * tmpdir,
00643                      Xapian::docid tot_off,
00644                      vector<string> tmp, vector<Xapian::docid> off)
00645 {
00646     unsigned int c = 0;
00647     while (tmp.size() > 3) {
00648         vector<string> tmpout;
00649         tmpout.reserve(tmp.size() / 2);
00650         vector<Xapian::docid> newoff;
00651         newoff.resize(tmp.size() / 2);
00652         for (unsigned int i = 0, j; i < tmp.size(); i = j) {
00653             j = i + 2;
00654             if (j == tmp.size() - 1) ++j;
00655 
00656             string dest = tmpdir;
00657             char buf[64];
00658             sprintf(buf, "/tmp%u_%u.", c, i / 2);
00659             dest += buf;
00660 
00661             // Don't compress temporary tables, even if the final table would
00662             // be.
00663             FlintTable tmptab(dest, false);
00664             // Use maximum blocksize for temporary tables.
00665             tmptab.create_and_open(65536);
00666 
00667             merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0);
00668             if (c > 0) {
00669                 for (unsigned int k = i; k < j; ++k) {
00670                     unlink((tmp[k] + "DB").c_str());
00671                     unlink((tmp[k] + "baseA").c_str());
00672                     unlink((tmp[k] + "baseB").c_str());
00673                 }
00674             }
00675             tmpout.push_back(dest);
00676             tmptab.commit(1);
00677         }
00678         swap(tmp, tmpout);
00679         swap(off, newoff);
00680         ++c;
00681     }
00682     merge_postlists(out, off.begin(), tmp.begin(), tmp.end(), tot_off);
00683     if (c > 0) {
00684         for (size_t k = 0; k < tmp.size(); ++k) {
00685             unlink((tmp[k] + "DB").c_str());
00686             unlink((tmp[k] + "baseA").c_str());
00687             unlink((tmp[k] + "baseB").c_str());
00688         }
00689     }
00690 }
00691 
00692 static void
00693 merge_docid_keyed(FlintTable *out, const vector<string> & inputs,
00694                   const vector<Xapian::docid> & offset, bool lazy)
00695 {
00696     for (size_t i = 0; i < inputs.size(); ++i) {
00697         Xapian::docid off = offset[i];
00698 
00699         FlintTable in(inputs[i], true, DONT_COMPRESS, lazy);
00700         in.open();
00701         if (in.empty()) continue;
00702 
00703         FlintCursor cur(&in);
00704         cur.find_entry("");
00705 
00706         string key;
00707         while (cur.next()) {
00708             // Adjust the key if this isn't the first database.
00709             if (off) {
00710                 Xapian::docid did;
00711                 const char * d = cur.current_key.data();
00712                 const char * e = d + cur.current_key.size();
00713                 if (!unpack_uint_preserving_sort(&d, e, &did)) {
00714                     string msg = "Bad key in ";
00715                     msg += inputs[i];
00716                     throw Xapian::DatabaseCorruptError(msg);
00717                 }
00718                 did += off;
00719                 key = pack_uint_preserving_sort(did);
00720                 if (d != e) {
00721                     // Copy over the termname for the position table.
00722                     key.append(d, e - d);
00723                 }
00724             } else {
00725                 key = cur.current_key;
00726             }
00727             bool compressed = cur.read_tag(true);
00728             out->add(key, cur.current_tag, compressed);
00729         }
00730     }
00731 }
00732 
00733 int
00734 main(int argc, char **argv)
00735 {
00736     const char * opts = "b:nFm";
00737     const struct option long_opts[] = {
00738         {"fuller",      no_argument, 0, 'F'},
00739         {"no-full",     no_argument, 0, 'n'},
00740         {"multipass",   no_argument, 0, 'm'},
00741         {"blocksize",   required_argument, 0, 'b'},
00742         {"no-renumber", no_argument, 0, OPT_NO_RENUMBER},
00743         {"help",        no_argument, 0, OPT_HELP},
00744         {"version",     no_argument, 0, OPT_VERSION},
00745         {NULL,          0, 0, 0}
00746     };
00747 
00748     enum { STANDARD, FULL, FULLER } compaction = FULL;
00749     size_t block_size = 8192;
00750     bool multipass = false;
00751     bool renumber = true;
00752 
00753     int c;
00754     while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
00755         switch (c) {
00756             case 'b': {
00757                 char *p;
00758                 block_size = strtoul(optarg, &p, 10);
00759                 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00760                     ++p;
00761                     block_size *= 1024;
00762                 }
00763                 if (*p || block_size < 2048 || block_size > 65536 ||
00764                     (block_size & (block_size - 1)) != 0) {
00765                     cerr << PROG_NAME": Bad value '" << optarg
00766                          << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00767                          << endl;
00768                     exit(1);
00769                 }
00770                 break;
00771             }
00772             case 'n':
00773                 compaction = STANDARD;
00774                 break;
00775             case 'F':
00776                 compaction = FULLER;
00777                 break;
00778             case 'm':
00779                 multipass = true;
00780                 break;
00781             case OPT_NO_RENUMBER:
00782                 renumber = false;
00783                 break;
00784             case OPT_HELP:
00785                 cout << PROG_NAME" - "PROG_DESC"\n\n";
00786                 show_usage();
00787                 exit(0);
00788             case OPT_VERSION:
00789                 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00790                 exit(0);
00791             default:
00792                 show_usage();
00793                 exit(1);
00794         }
00795     }
00796 
00797     if (argc - optind < 2) {
00798         show_usage();
00799         exit(1);
00800     }
00801 
00802     if (!renumber && argc - optind > 2) {
00803         cout << argv[0]
00804              << ": --no-renumber isn't currently supported when merging databases."
00805              << endl;
00806         exit(1);
00807     }
00808 
00809     // Path to the database to create.
00810     const char *destdir = argv[argc - 1];
00811 
00812     try {
00813         vector<string> sources;
00814         vector<Xapian::docid> offset;
00815         sources.reserve(argc - 1 - optind);
00816         offset.reserve(argc - 1 - optind);
00817         Xapian::docid tot_off = 0;
00818         for (int i = optind; i < argc - 1; ++i) {
00819             const char *srcdir = argv[i];
00820             // Check destdir isn't the same as any source directory...
00821             if (strcmp(srcdir, destdir) == 0) {
00822                 cout << argv[0]
00823                      << ": destination may not be the same as any source directory."
00824                      << endl;
00825                 exit(1);
00826             }
00827 
00828             struct stat sb;
00829             if (stat(string(srcdir) + "/iamflint", &sb) != 0) {
00830                 cout << argv[0] << ": '" << srcdir
00831                      << "' is not a flint database directory" << endl;
00832                 exit(1);
00833             }
00834 
00835             Xapian::Database db(srcdir);
00836             Xapian::docid last = 0;
00837 
00838             // "Empty" databases might have spelling or synonym data so can't
00839             // just be completely ignored.
00840             if (db.get_doccount() != 0) {
00841                 last = db.get_lastdocid();
00842 
00843                 if (renumber) {
00844                     // Prune any unused docids off the start of this source
00845                     // database.
00846                     Xapian::PostingIterator it = db.postlist_begin("");
00847                     // This test should never fail, since db.get_doccount() is
00848                     // non-zero!
00849                     if (it != db.postlist_end("")) {
00850                         // tot_off could wrap here, but it's unsigned, so
00851                         // that's OK.
00852                         tot_off -= (*it - 1);
00853                     }
00854 
00855                     // FIXME: get_lastdocid() returns a "high water mark" - we
00856                     // should prune unused docids off the end of each source
00857                     // database as well as off the start.
00858                 }
00859             }
00860             offset.push_back(tot_off);
00861             tot_off += last;
00862 
00863             sources.push_back(string(srcdir) + '/');
00864         }
00865 
00866         // If the destination database directory doesn't exist, create it.
00867         if (mkdir(destdir, 0755) < 0) {
00868             // Check why mkdir failed.  It's ok if the directory already
00869             // exists, but we also get EEXIST if there's an existing file with
00870             // that name.
00871             if (errno == EEXIST) {
00872                 struct stat sb;
00873                 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00874                     errno = 0;
00875                 else
00876                     errno = EEXIST; // stat might have changed it
00877             }
00878             if (errno) {
00879                 cerr << argv[0] << ": cannot create directory '"
00880                      << destdir << "': " << strerror(errno) << endl;
00881                 exit(1);
00882             }
00883         }
00884 
00885         enum table_type {
00886             POSTLIST, RECORD, TERMLIST, POSITION, VALUE, SPELLING, SYNONYM
00887         };
00888         struct table_list {
00889             // The "base name" of the table; table paths are built as DIR + '/' + name + '.'.
00890             const char * name;
00891             // The type, which selects the merge routine used for this table.
00892             table_type type;
00893             // zlib compression strategy to use on tags (handed to the FlintTable constructor).
00894             int compress_strategy;
00895             // If true (position and later tables), don't create_and_open() the output up front.
00896             bool lazy;
00897         };
00898 
00899         static const table_list tables[] = {
00900             // name         type        compress_strategy       lazy
00901             { "postlist",   POSTLIST,   DONT_COMPRESS,          false },
00902             { "record",     RECORD,     Z_DEFAULT_STRATEGY,     false },
00903             { "termlist",   TERMLIST,   Z_DEFAULT_STRATEGY,     false },
00904             { "position",   POSITION,   DONT_COMPRESS,          true },
00905             { "value",      VALUE,      DONT_COMPRESS,          true },
00906             { "spelling",   SPELLING,   Z_DEFAULT_STRATEGY,     true },
00907             { "synonym",    SYNONYM,    Z_DEFAULT_STRATEGY,     true }
00908         };
00909         const table_list * tables_end = tables +
00910             (sizeof(tables) / sizeof(tables[0]));
00911 
00912         for (const table_list * t = tables; t < tables_end; ++t) {
00913             // The postlist requires an N-way merge, adjusting the headers of
00914             // various blocks.  The other tables have keys sorted in docid
00915             // order, so we can merge them by simply copying all the keys from
00916             // each source table in turn.
00917             cout << t->name << " ..." << flush;
00918 
00919             string dest = destdir;
00920             dest += '/';
00921             dest += t->name;
00922             dest += '.';
00923 
00924             FlintTable out(dest, false, t->compress_strategy, t->lazy);
00925             if (!t->lazy) {
00926                 out.create_and_open(block_size);
00927             } else {
00928                 out.erase();
00929                 out.set_block_size(block_size);
00930             }
00931 
00932             out.set_full_compaction(compaction != STANDARD);
00933             if (compaction == FULLER) out.set_max_item_size(1);
00934 
00935             // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
00936             // on certain systems).
00937             bool bad_stat = false;
00938 
00939             off_t in_size = 0;
00940 
00941             vector<string> inputs;
00942             inputs.reserve(sources.size());
00943             for (vector<string>::const_iterator src = sources.begin();
00944                  src != sources.end(); ++src) {
00945                 string s(*src);
00946                 s += t->name;
00947                 s += '.';
00948 
00949                 struct stat sb;
00950                 if (stat(s + "DB", &sb) == 0) {
00951                     in_size += sb.st_size / 1024;
00952                 } else {
00953                     // We get ENOENT for an optional table.
00954                     bad_stat = (errno != ENOENT);
00955                 }
00956                 inputs.push_back(s);
00957             }
00958 
00959             if (inputs.empty()) continue;
00960 
00961             switch (t->type) {
00962                 case POSTLIST:
00963                     if (multipass && inputs.size() > 3) {
00964                         multimerge_postlists(&out, destdir, tot_off,
00965                                              inputs, offset);
00966                     } else {
00967                         merge_postlists(&out, offset.begin(),
00968                                         inputs.begin(), inputs.end(),
00969                                         tot_off);
00970                     }
00971                     break;
00972                 case SPELLING:
00973                     merge_spellings(&out, inputs.begin(), inputs.end());
00974                     break;
00975                 case SYNONYM:
00976                     merge_synonyms(&out, inputs.begin(), inputs.end());
00977                     break;
00978                 default:
00979                     // Position, Record, Termlist, Value
00980                     merge_docid_keyed(&out, inputs, offset, t->lazy);
00981                     break;
00982             }
00983 
00984             // Commit as revision 1.
00985             out.commit(1);
00986 
00987             cout << '\r' << t->name << ": ";
00988             off_t out_size = 0;
00989             if (!bad_stat) {
00990                 struct stat sb;
00991                 if (stat(dest + "DB", &sb) == 0) {
00992                     out_size = sb.st_size / 1024;
00993                 } else {
00994                     bad_stat = (errno != ENOENT);
00995                 }
00996             }
00997             if (bad_stat) {
00998                 cout << "Done (couldn't stat all the DB files)";
00999             } else {
01000                 if (out_size == in_size) {
01001                     cout << "Size unchanged (";
01002                 } else if (out_size < in_size) {
01003                     cout << "Reduced by "
01004                          << 100 * double(in_size - out_size) / in_size << "% "
01005                          << in_size - out_size << "K (" << in_size << "K -> ";
01006                 } else {
01007                     cout << "INCREASED by "
01008                          << 100 * double(out_size - in_size) / in_size << "% "
01009                          << out_size - in_size << "K (" << in_size << "K -> ";
01010                 }
01011                 cout << out_size << "K)";
01012             }
01013             cout << endl;
01014         }
01015 
01016         // Copy over the version file ("iamflint").
01017         // FIXME: We may need to do something smarter than just copying an
01018         // arbitrary version file if the version file format changes...
01019         string dest = destdir;
01020         dest += "/iamflint.tmp";
01021 
01022         string src(argv[optind]);
01023         src += "/iamflint";
01024 
01025         ifstream input(src.c_str());
01026         char buf[1024];
01027         input.read(buf, sizeof(buf));
01028         if (!input.eof()) {
01029             if (!input) {
01030                 cerr << argv[0] << ": error reading '" << src << "': "
01031                      << strerror(errno) << endl;
01032                 exit(1);
01033             }
01034             // Version file should be about 12 bytes, not > 1024!
01035             cerr << argv[0] << ": version file '" << src << "' too large!"
01036                  << endl;
01037             exit(1);
01038         }
01039         ofstream output(dest.c_str());
01040         if (!output.write(buf, input.gcount())) {
01041             cerr << argv[0] << ": error writing '" << dest << "': "
01042                  << strerror(errno) << endl;
01043             exit(1);
01044         }
01045         output.close();
01046 
01047         string version = destdir;
01048         version += "/iamflint";
01049         if (rename(dest.c_str(), version.c_str()) == -1) {
01050             cerr << argv[0] << ": cannot rename '" << dest << "' to '"
01051                  << version << "': " << strerror(errno) << endl;
01052             exit(1);
01053         }
01054     } catch (const Xapian::Error &error) {
01055         cerr << argv[0] << ": " << error.get_description() << endl;
01056         exit(1);
01057     } catch (const char * msg) {
01058         cerr << argv[0] << ": " << msg << endl;
01059         exit(1);
01060     }
01061 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.