00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <config.h>
00023
00024 #include <xapian/compactor.h>
00025
00026 #include <algorithm>
00027 #include <queue>
00028
00029 #include <cstdio>
00030
00031 #include "safeerrno.h"
00032 #include <sys/types.h>
00033 #include "safesysstat.h"
00034
00035 #include "brass_table.h"
00036 #include "brass_compact.h"
00037 #include "brass_cursor.h"
00038 #include "internaltypes.h"
00039 #include "pack.h"
00040 #include "utils.h"
00041 #include "valuestats.h"
00042
00043 #include "../byte_length_strings.h"
00044 #include "../prefix_compressed_strings.h"
00045 #include <xapian.h>
00046
00047 using namespace std;
00048
00049
00050
00051 namespace BrassCompact {
00052
/** Test whether @a key is the metainfo key.
 *
 *  The metainfo entry is recognised by its key being exactly one zero byte.
 */
static inline bool
is_metainfo_key(const std::string & key)
{
    if (key.size() != 1) return false;
    return key[0] == '\0';
}
00058
/** Test whether @a key is a user metadata key.
 *
 *  Such keys are at least two bytes long and start with the bytes
 *  0x00 0xC0.
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    return key.size() >= 2 && key.compare(0, 2, "\0\xc0", 2) == 0;
}
00064
/** Test whether @a key is a value statistics key.
 *
 *  Such keys are at least two bytes long and start with the bytes
 *  0x00 0xD0.
 */
static inline bool
is_valuestats_key(const std::string & key)
{
    return key.size() >= 2 && key.compare(0, 2, "\0\xd0", 2) == 0;
}
00070
/** Test whether @a key is a value chunk key.
 *
 *  Such keys are at least two bytes long and start with the bytes
 *  0x00 0xD8.
 */
static inline bool
is_valuechunk_key(const std::string & key)
{
    return key.size() >= 2 && key.compare(0, 2, "\0\xd8", 2) == 0;
}
00076
/** Test whether @a key is a document length chunk key.
 *
 *  Such keys are at least two bytes long and start with the bytes
 *  0x00 0xE0.
 */
static inline bool
is_doclenchunk_key(const std::string & key)
{
    return key.size() >= 2 && key.compare(0, 2, "\0\xe0", 2) == 0;
}
00082
00083 class PostlistCursor : private BrassCursor {
00084 Xapian::docid offset;
00085
00086 public:
00087 string key, tag;
00088 Xapian::docid firstdid;
00089 Xapian::termcount tf, cf;
00090
00091 PostlistCursor(BrassTable *in, Xapian::docid offset_)
00092 : BrassCursor(in), offset(offset_), firstdid(0)
00093 {
00094 find_entry(string());
00095 next();
00096 }
00097
00098 ~PostlistCursor()
00099 {
00100 delete BrassCursor::get_table();
00101 }
00102
00103 bool next() {
00104 if (!BrassCursor::next()) return false;
00105
00106
00107 read_tag();
00108 key = current_key;
00109 tag = current_tag;
00110 tf = cf = 0;
00111 if (is_metainfo_key(key)) return true;
00112 if (is_user_metadata_key(key)) return true;
00113 if (is_valuestats_key(key)) return true;
00114 if (is_valuechunk_key(key)) {
00115 const char * p = key.data();
00116 const char * end = p + key.length();
00117 p += 2;
00118 Xapian::valueno slot;
00119 if (!unpack_uint(&p, end, &slot))
00120 throw Xapian::DatabaseCorruptError("bad value key");
00121 Xapian::docid did;
00122 if (!unpack_uint_preserving_sort(&p, end, &did))
00123 throw Xapian::DatabaseCorruptError("bad value key");
00124 did += offset;
00125
00126 key.assign("\0\xd8", 2);
00127 pack_uint(key, slot);
00128 pack_uint_preserving_sort(key, did);
00129 return true;
00130 }
00131
00132
00133
00134
00135 const char * d = key.data();
00136 const char * e = d + key.size();
00137 if (is_doclenchunk_key(key)) {
00138 d += 2;
00139 } else {
00140 string tname;
00141 if (!unpack_string_preserving_sort(&d, e, tname))
00142 throw Xapian::DatabaseCorruptError("Bad postlist key");
00143 }
00144
00145 if (d == e) {
00146
00147 d = tag.data();
00148 e = d + tag.size();
00149 if (!unpack_uint(&d, e, &tf) ||
00150 !unpack_uint(&d, e, &cf) ||
00151 !unpack_uint(&d, e, &firstdid)) {
00152 throw Xapian::DatabaseCorruptError("Bad postlist key");
00153 }
00154 ++firstdid;
00155 tag.erase(0, d - tag.data());
00156 } else {
00157
00158 size_t tmp = d - key.data();
00159 if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00160 throw Xapian::DatabaseCorruptError("Bad postlist key");
00161 if (is_doclenchunk_key(key)) {
00162 key.erase(tmp);
00163 } else {
00164 key.erase(tmp - 1);
00165 }
00166 }
00167 firstdid += offset;
00168 return true;
00169 }
00170 };
00171
00172 class PostlistCursorGt {
00173 public:
00176 bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00177 if (a->key > b->key) return true;
00178 if (a->key != b->key) return false;
00179 return (a->firstdid > b->firstdid);
00180 }
00181 };
00182
00183 static string
00184 encode_valuestats(Xapian::doccount freq,
00185 const string & lbound, const string & ubound)
00186 {
00187 string value;
00188 pack_uint(value, freq);
00189 pack_string(value, lbound);
00190
00191
00192
00193 if (lbound != ubound) value += ubound;
00194 return value;
00195 }
00196
00197 static void
00198 merge_postlists(Xapian::Compactor & compactor,
00199 BrassTable * out, vector<Xapian::docid>::const_iterator offset,
00200 vector<string>::const_iterator b,
00201 vector<string>::const_iterator e,
00202 Xapian::docid last_docid)
00203 {
00204 totlen_t tot_totlen = 0;
00205 Xapian::termcount doclen_lbound = static_cast<Xapian::termcount>(-1);
00206 Xapian::termcount wdf_ubound = 0;
00207 Xapian::termcount doclen_ubound = 0;
00208 priority_queue<PostlistCursor *, vector<PostlistCursor *>, PostlistCursorGt> pq;
00209 for ( ; b != e; ++b, ++offset) {
00210 BrassTable *in = new BrassTable("postlist", *b, true);
00211 in->open();
00212 if (in->empty()) {
00213
00214 delete in;
00215 continue;
00216 }
00217
00218
00219
00220 PostlistCursor * cur = new PostlistCursor(in, *offset);
00221
00222
00223
00224
00225 if (is_metainfo_key(cur->key)) {
00226 const char * data = cur->tag.data();
00227 const char * end = data + cur->tag.size();
00228 Xapian::docid dummy_did = 0;
00229 if (!unpack_uint(&data, end, &dummy_did)) {
00230 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00231 }
00232
00233 Xapian::termcount doclen_lbound_tmp;
00234 if (!unpack_uint(&data, end, &doclen_lbound_tmp)) {
00235 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00236 }
00237 doclen_lbound = min(doclen_lbound, doclen_lbound_tmp);
00238
00239 Xapian::termcount wdf_ubound_tmp;
00240 if (!unpack_uint(&data, end, &wdf_ubound_tmp)) {
00241 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00242 }
00243 wdf_ubound = max(wdf_ubound, wdf_ubound_tmp);
00244
00245 Xapian::termcount doclen_ubound_tmp;
00246 if (!unpack_uint(&data, end, &doclen_ubound_tmp)) {
00247 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00248 }
00249 doclen_ubound_tmp += wdf_ubound_tmp;
00250 doclen_ubound = max(doclen_ubound, doclen_ubound_tmp);
00251
00252 totlen_t totlen = 0;
00253 if (!unpack_uint_last(&data, end, &totlen)) {
00254 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00255 }
00256 tot_totlen += totlen;
00257 if (tot_totlen < totlen) {
00258 throw "totlen wrapped!";
00259 }
00260 }
00261 if (cur->next()) {
00262 pq.push(cur);
00263 } else {
00264 delete cur;
00265 }
00266 }
00267
00268 {
00269 string tag;
00270 pack_uint(tag, last_docid);
00271 pack_uint(tag, doclen_lbound);
00272 pack_uint(tag, wdf_ubound);
00273 pack_uint(tag, doclen_ubound - wdf_ubound);
00274 pack_uint_last(tag, tot_totlen);
00275 out->add(string(1, '\0'), tag);
00276 }
00277
00278 string last_key;
00279 {
00280
00281 vector<string> tags;
00282 while (!pq.empty()) {
00283 PostlistCursor * cur = pq.top();
00284 const string& key = cur->key;
00285 if (!is_user_metadata_key(key)) break;
00286
00287 if (key != last_key) {
00288 if (tags.size() > 1) {
00289 Assert(!last_key.empty());
00290 out->add(last_key,
00291 compactor.resolve_duplicate_metadata(last_key,
00292 tags.size(),
00293 &tags[0]));
00294 } else if (tags.size() == 1) {
00295 Assert(!last_key.empty());
00296 out->add(last_key, tags[0]);
00297 }
00298 tags.resize(0);
00299 last_key = key;
00300 }
00301 tags.push_back(cur->tag);
00302
00303 pq.pop();
00304 if (cur->next()) {
00305 pq.push(cur);
00306 } else {
00307 delete cur;
00308 }
00309 }
00310 if (tags.size() > 1) {
00311 Assert(!last_key.empty());
00312 out->add(last_key,
00313 compactor.resolve_duplicate_metadata(last_key,
00314 tags.size(),
00315 &tags[0]));
00316 } else if (tags.size() == 1) {
00317 Assert(!last_key.empty());
00318 out->add(last_key, tags[0]);
00319 }
00320 }
00321
00322 {
00323
00324 Xapian::doccount freq = 0;
00325 string lbound, ubound;
00326
00327 string last_tag;
00328 while (!pq.empty()) {
00329 PostlistCursor * cur = pq.top();
00330 const string& key = cur->key;
00331 if (!is_valuestats_key(key)) break;
00332 if (key != last_key) {
00333
00334
00335
00336 if (freq) {
00337 out->add(last_key, encode_valuestats(freq, lbound, ubound));
00338 freq = 0;
00339 }
00340 last_key = key;
00341 }
00342
00343 const string & tag = cur->tag;
00344
00345 const char * pos = tag.data();
00346 const char * end = pos + tag.size();
00347
00348 Xapian::doccount f;
00349 string l, u;
00350 if (!unpack_uint(&pos, end, &f)) {
00351 if (*pos == 0) throw Xapian::DatabaseCorruptError("Incomplete stats item in value table");
00352 throw Xapian::RangeError("Frequency statistic in value table is too large");
00353 }
00354 if (!unpack_string(&pos, end, l)) {
00355 if (*pos == 0) throw Xapian::DatabaseCorruptError("Incomplete stats item in value table");
00356 throw Xapian::RangeError("Lower bound in value table is too large");
00357 }
00358 size_t len = end - pos;
00359 if (len == 0) {
00360 u = l;
00361 } else {
00362 u.assign(pos, len);
00363 }
00364 if (freq == 0) {
00365 freq = f;
00366 lbound = l;
00367 ubound = u;
00368 } else {
00369 freq += f;
00370 if (l < lbound) lbound = l;
00371 if (u > ubound) ubound = u;
00372 }
00373
00374 pq.pop();
00375 if (cur->next()) {
00376 pq.push(cur);
00377 } else {
00378 delete cur;
00379 }
00380 }
00381
00382 if (freq) {
00383 out->add(last_key, encode_valuestats(freq, lbound, ubound));
00384 }
00385 }
00386
00387
00388 while (!pq.empty()) {
00389 PostlistCursor * cur = pq.top();
00390 const string & key = cur->key;
00391 if (!is_valuechunk_key(key)) break;
00392 Assert(!is_user_metadata_key(key));
00393 out->add(key, cur->tag);
00394 pq.pop();
00395 if (cur->next()) {
00396 pq.push(cur);
00397 } else {
00398 delete cur;
00399 }
00400 }
00401
00402 Xapian::termcount tf = 0, cf = 0;
00403 vector<pair<Xapian::docid, string> > tags;
00404 while (true) {
00405 PostlistCursor * cur = NULL;
00406 if (!pq.empty()) {
00407 cur = pq.top();
00408 pq.pop();
00409 }
00410 Assert(cur == NULL || !is_user_metadata_key(cur->key));
00411 if (cur == NULL || cur->key != last_key) {
00412 if (!tags.empty()) {
00413 string first_tag;
00414 pack_uint(first_tag, tf);
00415 pack_uint(first_tag, cf);
00416 pack_uint(first_tag, tags[0].first - 1);
00417 string tag = tags[0].second;
00418 tag[0] = (tags.size() == 1) ? '1' : '0';
00419 first_tag += tag;
00420 out->add(last_key, first_tag);
00421
00422 string term;
00423 if (!is_doclenchunk_key(last_key)) {
00424 const char * p = last_key.data();
00425 const char * end = p + last_key.size();
00426 if (!unpack_string_preserving_sort(&p, end, term) || p != end)
00427 throw Xapian::DatabaseCorruptError("Bad postlist chunk key");
00428 }
00429
00430 vector<pair<Xapian::docid, string> >::const_iterator i;
00431 i = tags.begin();
00432 while (++i != tags.end()) {
00433 tag = i->second;
00434 tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00435 out->add(pack_brass_postlist_key(term, i->first), tag);
00436 }
00437 }
00438 tags.clear();
00439 if (cur == NULL) break;
00440 tf = cf = 0;
00441 last_key = cur->key;
00442 }
00443 tf += cur->tf;
00444 cf += cur->cf;
00445 tags.push_back(make_pair(cur->firstdid, cur->tag));
00446 if (cur->next()) {
00447 pq.push(cur);
00448 } else {
00449 delete cur;
00450 }
00451 }
00452 }
00453
00454 struct MergeCursor : public BrassCursor {
00455 MergeCursor(BrassTable *in) : BrassCursor(in) {
00456 find_entry(string());
00457 next();
00458 }
00459
00460 ~MergeCursor() {
00461 delete BrassCursor::get_table();
00462 }
00463 };
00464
00465 struct CursorGt {
00467 bool operator()(const BrassCursor *a, const BrassCursor *b) {
00468 if (b->after_end()) return false;
00469 if (a->after_end()) return true;
00470 return (a->current_key > b->current_key);
00471 }
00472 };
00473
00474 static void
00475 merge_spellings(BrassTable * out,
00476 vector<string>::const_iterator b,
00477 vector<string>::const_iterator e)
00478 {
00479 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00480 for ( ; b != e; ++b) {
00481 BrassTable *in = new BrassTable("spelling", *b, true, DONT_COMPRESS, true);
00482 in->open();
00483 if (!in->empty()) {
00484
00485
00486 pq.push(new MergeCursor(in));
00487 } else {
00488 delete in;
00489 }
00490 }
00491
00492 while (!pq.empty()) {
00493 MergeCursor * cur = pq.top();
00494 pq.pop();
00495
00496 string key = cur->current_key;
00497 if (pq.empty() || pq.top()->current_key > key) {
00498
00499
00500 bool compressed = cur->read_tag(true);
00501 out->add(key, cur->current_tag, compressed);
00502 if (cur->next()) {
00503 pq.push(cur);
00504 } else {
00505 delete cur;
00506 }
00507 continue;
00508 }
00509
00510
00511 string tag;
00512 if (key[0] != 'W') {
00513
00514
00515 priority_queue<PrefixCompressedStringItor *,
00516 vector<PrefixCompressedStringItor *>,
00517 PrefixCompressedStringItorGt> pqtag;
00518
00519
00520
00521 vector<MergeCursor *> vec;
00522 vec.reserve(pq.size());
00523
00524 while (true) {
00525 cur->read_tag();
00526 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
00527 vec.push_back(cur);
00528 if (pq.empty() || pq.top()->current_key != key) break;
00529 cur = pq.top();
00530 pq.pop();
00531 }
00532
00533 PrefixCompressedStringWriter wr(tag);
00534 string lastword;
00535 while (!pqtag.empty()) {
00536 PrefixCompressedStringItor * it = pqtag.top();
00537 string word = **it;
00538 if (word != lastword) {
00539 lastword = word;
00540 wr.append(lastword);
00541 }
00542 ++*it;
00543 pqtag.pop();
00544 if (!it->at_end()) {
00545 pqtag.push(it);
00546 } else {
00547 delete it;
00548 }
00549 }
00550
00551 vector<MergeCursor *>::const_iterator i;
00552 for (i = vec.begin(); i != vec.end(); ++i) {
00553 cur = *i;
00554 if (cur->next()) {
00555 pq.push(cur);
00556 } else {
00557 delete cur;
00558 }
00559 }
00560 } else {
00561
00562 Xapian::termcount tot_freq = 0;
00563 while (true) {
00564 cur->read_tag();
00565 Xapian::termcount freq;
00566 const char * p = cur->current_tag.data();
00567 const char * end = p + cur->current_tag.size();
00568 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
00569 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00570 }
00571 tot_freq += freq;
00572 if (cur->next()) {
00573 pq.push(cur);
00574 } else {
00575 delete cur;
00576 }
00577 if (pq.empty() || pq.top()->current_key != key) break;
00578 cur = pq.top();
00579 pq.pop();
00580 }
00581 tag.resize(0);
00582 pack_uint_last(tag, tot_freq);
00583 }
00584 out->add(key, tag);
00585 }
00586 }
00587
00588 static void
00589 merge_synonyms(BrassTable * out,
00590 vector<string>::const_iterator b,
00591 vector<string>::const_iterator e)
00592 {
00593 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00594 for ( ; b != e; ++b) {
00595 BrassTable *in = new BrassTable("synonym", *b, true, DONT_COMPRESS, true);
00596 in->open();
00597 if (!in->empty()) {
00598
00599
00600 pq.push(new MergeCursor(in));
00601 } else {
00602 delete in;
00603 }
00604 }
00605
00606 while (!pq.empty()) {
00607 MergeCursor * cur = pq.top();
00608 pq.pop();
00609
00610 string key = cur->current_key;
00611 if (pq.empty() || pq.top()->current_key > key) {
00612
00613
00614 bool compressed = cur->read_tag(true);
00615 out->add(key, cur->current_tag, compressed);
00616 if (cur->next()) {
00617 pq.push(cur);
00618 } else {
00619 delete cur;
00620 }
00621 continue;
00622 }
00623
00624
00625 string tag;
00626
00627
00628
00629 priority_queue<ByteLengthPrefixedStringItor *,
00630 vector<ByteLengthPrefixedStringItor *>,
00631 ByteLengthPrefixedStringItorGt> pqtag;
00632 vector<MergeCursor *> vec;
00633
00634 while (true) {
00635 cur->read_tag();
00636 pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
00637 vec.push_back(cur);
00638 if (pq.empty() || pq.top()->current_key != key) break;
00639 cur = pq.top();
00640 pq.pop();
00641 }
00642
00643 string lastword;
00644 while (!pqtag.empty()) {
00645 ByteLengthPrefixedStringItor * it = pqtag.top();
00646 if (**it != lastword) {
00647 lastword = **it;
00648 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
00649 tag += lastword;
00650 }
00651 ++*it;
00652 pqtag.pop();
00653 if (!it->at_end()) {
00654 pqtag.push(it);
00655 } else {
00656 delete it;
00657 }
00658 }
00659
00660 vector<MergeCursor *>::const_iterator i;
00661 for (i = vec.begin(); i != vec.end(); ++i) {
00662 cur = *i;
00663 if (cur->next()) {
00664 pq.push(cur);
00665 } else {
00666 delete cur;
00667 }
00668 }
00669
00670 out->add(key, tag);
00671 }
00672 }
00673
00674 static void
00675 multimerge_postlists(Xapian::Compactor & compactor,
00676 BrassTable * out, const char * tmpdir,
00677 Xapian::docid last_docid,
00678 vector<string> tmp, vector<Xapian::docid> off)
00679 {
00680 unsigned int c = 0;
00681 while (tmp.size() > 3) {
00682 vector<string> tmpout;
00683 tmpout.reserve(tmp.size() / 2);
00684 vector<Xapian::docid> newoff;
00685 newoff.resize(tmp.size() / 2);
00686 for (unsigned int i = 0, j; i < tmp.size(); i = j) {
00687 j = i + 2;
00688 if (j == tmp.size() - 1) ++j;
00689
00690 string dest = tmpdir;
00691 char buf[64];
00692 sprintf(buf, "/tmp%u_%u.", c, i / 2);
00693 dest += buf;
00694
00695
00696
00697 BrassTable tmptab("postlist", dest, false);
00698
00699 tmptab.create_and_open(65536);
00700
00701 merge_postlists(compactor, &tmptab, off.begin() + i,
00702 tmp.begin() + i, tmp.begin() + j, 0);
00703 if (c > 0) {
00704 for (unsigned int k = i; k < j; ++k) {
00705 unlink((tmp[k] + "DB").c_str());
00706 unlink((tmp[k] + "baseA").c_str());
00707 unlink((tmp[k] + "baseB").c_str());
00708 }
00709 }
00710 tmpout.push_back(dest);
00711 tmptab.flush_db();
00712 tmptab.commit(1);
00713 }
00714 swap(tmp, tmpout);
00715 swap(off, newoff);
00716 ++c;
00717 }
00718 merge_postlists(compactor,
00719 out, off.begin(), tmp.begin(), tmp.end(), last_docid);
00720 if (c > 0) {
00721 for (size_t k = 0; k < tmp.size(); ++k) {
00722 unlink((tmp[k] + "DB").c_str());
00723 unlink((tmp[k] + "baseA").c_str());
00724 unlink((tmp[k] + "baseB").c_str());
00725 }
00726 }
00727 }
00728
00729 static void
00730 merge_docid_keyed(const char * tablename,
00731 BrassTable *out, const vector<string> & inputs,
00732 const vector<Xapian::docid> & offset, bool lazy)
00733 {
00734 for (size_t i = 0; i < inputs.size(); ++i) {
00735 Xapian::docid off = offset[i];
00736
00737 BrassTable in(tablename, inputs[i], true, DONT_COMPRESS, lazy);
00738 in.open();
00739 if (in.empty()) continue;
00740
00741 BrassCursor cur(&in);
00742 cur.find_entry(string());
00743
00744 string key;
00745 while (cur.next()) {
00746
00747 if (off) {
00748 Xapian::docid did;
00749 const char * d = cur.current_key.data();
00750 const char * e = d + cur.current_key.size();
00751 if (!unpack_uint_preserving_sort(&d, e, &did)) {
00752 string msg = "Bad key in ";
00753 msg += inputs[i];
00754 throw Xapian::DatabaseCorruptError(msg);
00755 }
00756 did += off;
00757 key.resize(0);
00758 pack_uint_preserving_sort(key, did);
00759 if (d != e) {
00760
00761 key.append(d, e - d);
00762 }
00763 } else {
00764 key = cur.current_key;
00765 }
00766 bool compressed = cur.read_tag(true);
00767 out->add(key, cur.current_tag, compressed);
00768 }
00769 }
00770 }
00771
00772 }
00773
00774 using namespace BrassCompact;
00775
00776 void
00777 compact_brass(Xapian::Compactor & compactor,
00778 const char * destdir, const vector<string> & sources,
00779 const vector<Xapian::docid> & offset, size_t block_size,
00780 Xapian::Compactor::compaction_level compaction, bool multipass,
00781 Xapian::docid last_docid) {
00782 enum table_type {
00783 POSTLIST, RECORD, TERMLIST, POSITION, VALUE, SPELLING, SYNONYM
00784 };
00785 struct table_list {
00786
00787 const char * name;
00788
00789 table_type type;
00790
00791 int compress_strategy;
00792
00793 bool lazy;
00794 };
00795
00796 static const table_list tables[] = {
00797
00798 { "postlist", POSTLIST, DONT_COMPRESS, false },
00799 { "record", RECORD, Z_DEFAULT_STRATEGY, false },
00800 { "termlist", TERMLIST, Z_DEFAULT_STRATEGY, false },
00801 { "position", POSITION, DONT_COMPRESS, true },
00802 { "spelling", SPELLING, Z_DEFAULT_STRATEGY, true },
00803 { "synonym", SYNONYM, Z_DEFAULT_STRATEGY, true }
00804 };
00805 const table_list * tables_end = tables +
00806 (sizeof(tables) / sizeof(tables[0]));
00807
00808 for (const table_list * t = tables; t < tables_end; ++t) {
00809
00810
00811
00812
00813
00814 compactor.set_status(t->name, string());
00815
00816 string dest = destdir;
00817 dest += '/';
00818 dest += t->name;
00819 dest += '.';
00820
00821 bool output_will_exist = !t->lazy;
00822
00823
00824
00825 bool bad_stat = false;
00826
00827 off_t in_size = 0;
00828
00829 vector<string> inputs;
00830 inputs.reserve(sources.size());
00831 size_t inputs_present = 0;
00832 for (vector<string>::const_iterator src = sources.begin();
00833 src != sources.end(); ++src) {
00834 string s(*src);
00835 s += t->name;
00836 s += '.';
00837
00838 struct stat sb;
00839 if (stat(s + "DB", &sb) == 0) {
00840 in_size += sb.st_size / 1024;
00841 output_will_exist = true;
00842 ++inputs_present;
00843 } else if (errno != ENOENT) {
00844
00845 bad_stat = true;
00846 output_will_exist = true;
00847 ++inputs_present;
00848 }
00849 inputs.push_back(s);
00850 }
00851
00852
00853 if (t->type == TERMLIST && inputs_present != sources.size()) {
00854 if (inputs_present != 0) {
00855 string m = str(inputs_present);
00856 m += " of ";
00857 m += str(sources.size());
00858 m += " inputs present, so suppressing output";
00859 compactor.set_status(t->name, m);
00860 continue;
00861 }
00862 output_will_exist = false;
00863 }
00864
00865 if (!output_will_exist) {
00866 compactor.set_status(t->name, "doesn't exist");
00867 continue;
00868 }
00869
00870 BrassTable out(t->name, dest, false, t->compress_strategy, t->lazy);
00871 if (!t->lazy) {
00872 out.create_and_open(block_size);
00873 } else {
00874 out.erase();
00875 out.set_block_size(block_size);
00876 }
00877
00878 out.set_full_compaction(compaction != compactor.STANDARD);
00879 if (compaction == compactor.FULLER) out.set_max_item_size(1);
00880
00881 switch (t->type) {
00882 case POSTLIST:
00883 if (multipass && inputs.size() > 3) {
00884 multimerge_postlists(compactor, &out, destdir, last_docid,
00885 inputs, offset);
00886 } else {
00887 merge_postlists(compactor, &out, offset.begin(),
00888 inputs.begin(), inputs.end(),
00889 last_docid);
00890 }
00891 break;
00892 case SPELLING:
00893 merge_spellings(&out, inputs.begin(), inputs.end());
00894 break;
00895 case SYNONYM:
00896 merge_synonyms(&out, inputs.begin(), inputs.end());
00897 break;
00898 default:
00899
00900 merge_docid_keyed(t->name, &out, inputs, offset, t->lazy);
00901 break;
00902 }
00903
00904
00905 out.flush_db();
00906 out.commit(1);
00907
00908 off_t out_size = 0;
00909 if (!bad_stat) {
00910 struct stat sb;
00911 if (stat(dest + "DB", &sb) == 0) {
00912 out_size = sb.st_size / 1024;
00913 } else {
00914 bad_stat = (errno != ENOENT);
00915 }
00916 }
00917 if (bad_stat) {
00918 compactor.set_status(t->name, "Done (couldn't stat all the DB files)");
00919 } else {
00920 string status;
00921 if (out_size == in_size) {
00922 status = "Size unchanged (";
00923 } else {
00924 off_t delta;
00925 if (out_size < in_size) {
00926 delta = in_size - out_size;
00927 status = "Reduced by ";
00928 } else {
00929 delta = out_size - in_size;
00930 status = "INCREASED by ";
00931 }
00932 status += str(100 * delta / in_size);
00933 status += "% ";
00934 status += str(delta);
00935 status += "K (";
00936 status += str(in_size);
00937 status += "K -> ";
00938 }
00939 status += str(out_size);
00940 status += "K)";
00941 compactor.set_status(t->name, status);
00942 }
00943 }
00944 }