00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <config.h>
00023
00024 #include <xapian/compactor.h>
00025
00026 #include <algorithm>
00027 #include <queue>
00028
00029 #include <cstdio>
00030
00031 #include "safeerrno.h"
00032 #include <sys/types.h>
00033 #include "safesysstat.h"
00034
00035 #include "chert_table.h"
00036 #include "chert_compact.h"
00037 #include "chert_cursor.h"
00038 #include "internaltypes.h"
00039 #include "pack.h"
00040 #include "utils.h"
00041 #include "valuestats.h"
00042
00043 #include "../byte_length_strings.h"
00044 #include "../prefix_compressed_strings.h"
00045 #include <xapian.h>
00046
00047 using namespace std;
00048
00049
00050
00051 namespace ChertCompact {
00052
/// Test whether @a key is the single-NUL-byte key of the metainfo entry.
static inline bool
is_metainfo_key(const string & key)
{
    if (key.length() != 1) return false;
    return key[0] == '\0';
}
00058
/// Test whether @a key starts with the two-byte prefix "\0\xc0" (user metadata).
static inline bool
is_user_metadata_key(const string & key)
{
    return key.length() >= 2 && key.compare(0, 2, "\0\xc0", 2) == 0;
}
00064
/// Test whether @a key starts with the two-byte prefix "\0\xd0" (value stats).
static inline bool
is_valuestats_key(const string & key)
{
    return key.length() >= 2 && key.compare(0, 2, "\0\xd0", 2) == 0;
}
00070
/// Test whether @a key starts with the two-byte prefix "\0\xd8" (value chunk).
static inline bool
is_valuechunk_key(const string & key)
{
    return key.length() >= 2 && key.compare(0, 2, "\0\xd8", 2) == 0;
}
00076
/// Test whether @a key starts with the two-byte prefix "\0\xe0" (doclen chunk).
static inline bool
is_doclenchunk_key(const string & key)
{
    return key.length() >= 2 && key.compare(0, 2, "\0\xe0", 2) == 0;
}
00082
00083 class PostlistCursor : private ChertCursor {
00084 Xapian::docid offset;
00085
00086 public:
00087 string key, tag;
00088 Xapian::docid firstdid;
00089 Xapian::termcount tf, cf;
00090
00091 PostlistCursor(ChertTable *in, Xapian::docid offset_)
00092 : ChertCursor(in), offset(offset_), firstdid(0)
00093 {
00094 find_entry(string());
00095 next();
00096 }
00097
00098 ~PostlistCursor()
00099 {
00100 delete ChertCursor::get_table();
00101 }
00102
00103 bool next() {
00104 if (!ChertCursor::next()) return false;
00105
00106
00107 read_tag();
00108 key = current_key;
00109 tag = current_tag;
00110 tf = cf = 0;
00111 if (is_metainfo_key(key)) return true;
00112 if (is_user_metadata_key(key)) return true;
00113 if (is_valuestats_key(key)) return true;
00114 if (is_valuechunk_key(key)) {
00115 const char * p = key.data();
00116 const char * end = p + key.length();
00117 p += 2;
00118 Xapian::valueno slot;
00119 if (!unpack_uint(&p, end, &slot))
00120 throw Xapian::DatabaseCorruptError("bad value key");
00121 Xapian::docid did;
00122 if (!unpack_uint_preserving_sort(&p, end, &did))
00123 throw Xapian::DatabaseCorruptError("bad value key");
00124 did += offset;
00125
00126 key.assign("\0\xd8", 2);
00127 pack_uint(key, slot);
00128 pack_uint_preserving_sort(key, did);
00129 return true;
00130 }
00131
00132
00133
00134
00135 const char * d = key.data();
00136 const char * e = d + key.size();
00137 if (is_doclenchunk_key(key)) {
00138 d += 2;
00139 } else {
00140 string tname;
00141 if (!unpack_string_preserving_sort(&d, e, tname))
00142 throw Xapian::DatabaseCorruptError("Bad postlist key");
00143 }
00144
00145 if (d == e) {
00146
00147 d = tag.data();
00148 e = d + tag.size();
00149 if (!unpack_uint(&d, e, &tf) ||
00150 !unpack_uint(&d, e, &cf) ||
00151 !unpack_uint(&d, e, &firstdid)) {
00152 throw Xapian::DatabaseCorruptError("Bad postlist key");
00153 }
00154 ++firstdid;
00155 tag.erase(0, d - tag.data());
00156 } else {
00157
00158 size_t tmp = d - key.data();
00159 if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00160 throw Xapian::DatabaseCorruptError("Bad postlist key");
00161 if (is_doclenchunk_key(key)) {
00162 key.erase(tmp);
00163 } else {
00164 key.erase(tmp - 1);
00165 }
00166 }
00167 firstdid += offset;
00168 return true;
00169 }
00170 };
00171
00172 class PostlistCursorGt {
00173 public:
00176 bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00177 if (a->key > b->key) return true;
00178 if (a->key != b->key) return false;
00179 return (a->firstdid > b->firstdid);
00180 }
00181 };
00182
00183 static string
00184 encode_valuestats(Xapian::doccount freq,
00185 const string & lbound, const string & ubound)
00186 {
00187 string value;
00188 pack_uint(value, freq);
00189 pack_string(value, lbound);
00190
00191
00192
00193 if (lbound != ubound) value += ubound;
00194 return value;
00195 }
00196
00197 static void
00198 merge_postlists(Xapian::Compactor & compactor,
00199 ChertTable * out, vector<Xapian::docid>::const_iterator offset,
00200 vector<string>::const_iterator b,
00201 vector<string>::const_iterator e,
00202 Xapian::docid last_docid)
00203 {
00204 totlen_t tot_totlen = 0;
00205 Xapian::termcount doclen_lbound = static_cast<Xapian::termcount>(-1);
00206 Xapian::termcount wdf_ubound = 0;
00207 Xapian::termcount doclen_ubound = 0;
00208 priority_queue<PostlistCursor *, vector<PostlistCursor *>, PostlistCursorGt> pq;
00209 for ( ; b != e; ++b, ++offset) {
00210 ChertTable *in = new ChertTable("postlist", *b, true);
00211 in->open();
00212 if (in->empty()) {
00213
00214 delete in;
00215 continue;
00216 }
00217
00218
00219
00220 PostlistCursor * cur = new PostlistCursor(in, *offset);
00221
00222
00223
00224
00225 if (is_metainfo_key(cur->key)) {
00226 const char * data = cur->tag.data();
00227 const char * end = data + cur->tag.size();
00228 Xapian::docid dummy_did = 0;
00229 if (!unpack_uint(&data, end, &dummy_did)) {
00230 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00231 }
00232
00233 Xapian::termcount doclen_lbound_tmp;
00234 if (!unpack_uint(&data, end, &doclen_lbound_tmp)) {
00235 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00236 }
00237 doclen_lbound = min(doclen_lbound, doclen_lbound_tmp);
00238
00239 Xapian::termcount wdf_ubound_tmp;
00240 if (!unpack_uint(&data, end, &wdf_ubound_tmp)) {
00241 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00242 }
00243 wdf_ubound = max(wdf_ubound, wdf_ubound_tmp);
00244
00245 Xapian::termcount doclen_ubound_tmp;
00246 if (!unpack_uint(&data, end, &doclen_ubound_tmp)) {
00247 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00248 }
00249 doclen_ubound_tmp += wdf_ubound_tmp;
00250 doclen_ubound = max(doclen_ubound, doclen_ubound_tmp);
00251
00252 totlen_t totlen = 0;
00253 if (!unpack_uint_last(&data, end, &totlen)) {
00254 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00255 }
00256 tot_totlen += totlen;
00257 if (tot_totlen < totlen) {
00258 throw "totlen wrapped!";
00259 }
00260 }
00261 if (cur->next()) {
00262 pq.push(cur);
00263 } else {
00264 delete cur;
00265 }
00266 }
00267
00268 {
00269 string tag;
00270 pack_uint(tag, last_docid);
00271 pack_uint(tag, doclen_lbound);
00272 pack_uint(tag, wdf_ubound);
00273 pack_uint(tag, doclen_ubound - wdf_ubound);
00274 pack_uint_last(tag, tot_totlen);
00275 out->add(string(1, '\0'), tag);
00276 }
00277
00278 string last_key;
00279 {
00280
00281 vector<string> tags;
00282 while (!pq.empty()) {
00283 PostlistCursor * cur = pq.top();
00284 const string& key = cur->key;
00285 if (!is_user_metadata_key(key)) break;
00286
00287 if (key != last_key) {
00288 if (tags.size() > 1) {
00289 Assert(!last_key.empty());
00290
00291
00292
00293 out->add(last_key,
00294 compactor.resolve_duplicate_metadata(last_key,
00295 tags.size(),
00296 &tags[0]));
00297 } else if (tags.size() == 1) {
00298 Assert(!last_key.empty());
00299 out->add(last_key, tags[0]);
00300 }
00301 tags.resize(0);
00302 last_key = key;
00303 }
00304 tags.push_back(cur->tag);
00305
00306 pq.pop();
00307 if (cur->next()) {
00308 pq.push(cur);
00309 } else {
00310 delete cur;
00311 }
00312 }
00313 if (tags.size() > 1) {
00314 Assert(!last_key.empty());
00315 out->add(last_key,
00316 compactor.resolve_duplicate_metadata(last_key,
00317 tags.size(),
00318 &tags[0]));
00319 } else if (tags.size() == 1) {
00320 Assert(!last_key.empty());
00321 out->add(last_key, tags[0]);
00322 }
00323 }
00324
00325 {
00326
00327 Xapian::doccount freq = 0;
00328 string lbound, ubound;
00329
00330 string last_tag;
00331 while (!pq.empty()) {
00332 PostlistCursor * cur = pq.top();
00333 const string& key = cur->key;
00334 if (!is_valuestats_key(key)) break;
00335 if (key != last_key) {
00336
00337
00338
00339 if (freq) {
00340 out->add(last_key, encode_valuestats(freq, lbound, ubound));
00341 freq = 0;
00342 }
00343 last_key = key;
00344 }
00345
00346 const string & tag = cur->tag;
00347
00348 const char * pos = tag.data();
00349 const char * end = pos + tag.size();
00350
00351 Xapian::doccount f;
00352 string l, u;
00353 if (!unpack_uint(&pos, end, &f)) {
00354 if (*pos == 0) throw Xapian::DatabaseCorruptError("Incomplete stats item in value table");
00355 throw Xapian::RangeError("Frequency statistic in value table is too large");
00356 }
00357 if (!unpack_string(&pos, end, l)) {
00358 if (*pos == 0) throw Xapian::DatabaseCorruptError("Incomplete stats item in value table");
00359 throw Xapian::RangeError("Lower bound in value table is too large");
00360 }
00361 size_t len = end - pos;
00362 if (len == 0) {
00363 u = l;
00364 } else {
00365 u.assign(pos, len);
00366 }
00367 if (freq == 0) {
00368 freq = f;
00369 lbound = l;
00370 ubound = u;
00371 } else {
00372 freq += f;
00373 if (l < lbound) lbound = l;
00374 if (u > ubound) ubound = u;
00375 }
00376
00377 pq.pop();
00378 if (cur->next()) {
00379 pq.push(cur);
00380 } else {
00381 delete cur;
00382 }
00383 }
00384
00385 if (freq) {
00386 out->add(last_key, encode_valuestats(freq, lbound, ubound));
00387 }
00388 }
00389
00390
00391 while (!pq.empty()) {
00392 PostlistCursor * cur = pq.top();
00393 const string & key = cur->key;
00394 if (!is_valuechunk_key(key)) break;
00395 Assert(!is_user_metadata_key(key));
00396 out->add(key, cur->tag);
00397 pq.pop();
00398 if (cur->next()) {
00399 pq.push(cur);
00400 } else {
00401 delete cur;
00402 }
00403 }
00404
00405 Xapian::termcount tf = 0, cf = 0;
00406 vector<pair<Xapian::docid, string> > tags;
00407 while (true) {
00408 PostlistCursor * cur = NULL;
00409 if (!pq.empty()) {
00410 cur = pq.top();
00411 pq.pop();
00412 }
00413 Assert(cur == NULL || !is_user_metadata_key(cur->key));
00414 if (cur == NULL || cur->key != last_key) {
00415 if (!tags.empty()) {
00416 string first_tag;
00417 pack_uint(first_tag, tf);
00418 pack_uint(first_tag, cf);
00419 pack_uint(first_tag, tags[0].first - 1);
00420 string tag = tags[0].second;
00421 tag[0] = (tags.size() == 1) ? '1' : '0';
00422 first_tag += tag;
00423 out->add(last_key, first_tag);
00424
00425 string term;
00426 if (!is_doclenchunk_key(last_key)) {
00427 const char * p = last_key.data();
00428 const char * end = p + last_key.size();
00429 if (!unpack_string_preserving_sort(&p, end, term) || p != end)
00430 throw Xapian::DatabaseCorruptError("Bad postlist chunk key");
00431 }
00432
00433 vector<pair<Xapian::docid, string> >::const_iterator i;
00434 i = tags.begin();
00435 while (++i != tags.end()) {
00436 tag = i->second;
00437 tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00438 out->add(pack_chert_postlist_key(term, i->first), tag);
00439 }
00440 }
00441 tags.clear();
00442 if (cur == NULL) break;
00443 tf = cf = 0;
00444 last_key = cur->key;
00445 }
00446 tf += cur->tf;
00447 cf += cur->cf;
00448 tags.push_back(make_pair(cur->firstdid, cur->tag));
00449 if (cur->next()) {
00450 pq.push(cur);
00451 } else {
00452 delete cur;
00453 }
00454 }
00455 }
00456
00457 struct MergeCursor : public ChertCursor {
00458 MergeCursor(ChertTable *in) : ChertCursor(in) {
00459 find_entry(string());
00460 next();
00461 }
00462
00463 ~MergeCursor() {
00464 delete ChertCursor::get_table();
00465 }
00466 };
00467
00468 struct CursorGt {
00470 bool operator()(const ChertCursor *a, const ChertCursor *b) {
00471 if (b->after_end()) return false;
00472 if (a->after_end()) return true;
00473 return (a->current_key > b->current_key);
00474 }
00475 };
00476
00477 static void
00478 merge_spellings(ChertTable * out,
00479 vector<string>::const_iterator b,
00480 vector<string>::const_iterator e)
00481 {
00482 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00483 for ( ; b != e; ++b) {
00484 ChertTable *in = new ChertTable("spelling", *b, true, DONT_COMPRESS, true);
00485 in->open();
00486 if (!in->empty()) {
00487
00488
00489 pq.push(new MergeCursor(in));
00490 } else {
00491 delete in;
00492 }
00493 }
00494
00495 while (!pq.empty()) {
00496 MergeCursor * cur = pq.top();
00497 pq.pop();
00498
00499 string key = cur->current_key;
00500 if (pq.empty() || pq.top()->current_key > key) {
00501
00502
00503 bool compressed = cur->read_tag(true);
00504 out->add(key, cur->current_tag, compressed);
00505 if (cur->next()) {
00506 pq.push(cur);
00507 } else {
00508 delete cur;
00509 }
00510 continue;
00511 }
00512
00513
00514 string tag;
00515 if (key[0] != 'W') {
00516
00517
00518 priority_queue<PrefixCompressedStringItor *,
00519 vector<PrefixCompressedStringItor *>,
00520 PrefixCompressedStringItorGt> pqtag;
00521
00522
00523
00524 vector<MergeCursor *> vec;
00525 vec.reserve(pq.size());
00526
00527 while (true) {
00528 cur->read_tag();
00529 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
00530 vec.push_back(cur);
00531 if (pq.empty() || pq.top()->current_key != key) break;
00532 cur = pq.top();
00533 pq.pop();
00534 }
00535
00536 PrefixCompressedStringWriter wr(tag);
00537 string lastword;
00538 while (!pqtag.empty()) {
00539 PrefixCompressedStringItor * it = pqtag.top();
00540 string word = **it;
00541 if (word != lastword) {
00542 lastword = word;
00543 wr.append(lastword);
00544 }
00545 ++*it;
00546 pqtag.pop();
00547 if (!it->at_end()) {
00548 pqtag.push(it);
00549 } else {
00550 delete it;
00551 }
00552 }
00553
00554 vector<MergeCursor *>::const_iterator i;
00555 for (i = vec.begin(); i != vec.end(); ++i) {
00556 cur = *i;
00557 if (cur->next()) {
00558 pq.push(cur);
00559 } else {
00560 delete cur;
00561 }
00562 }
00563 } else {
00564
00565 Xapian::termcount tot_freq = 0;
00566 while (true) {
00567 cur->read_tag();
00568 Xapian::termcount freq;
00569 const char * p = cur->current_tag.data();
00570 const char * end = p + cur->current_tag.size();
00571 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
00572 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00573 }
00574 tot_freq += freq;
00575 if (cur->next()) {
00576 pq.push(cur);
00577 } else {
00578 delete cur;
00579 }
00580 if (pq.empty() || pq.top()->current_key != key) break;
00581 cur = pq.top();
00582 pq.pop();
00583 }
00584 tag.resize(0);
00585 pack_uint_last(tag, tot_freq);
00586 }
00587 out->add(key, tag);
00588 }
00589 }
00590
00591 static void
00592 merge_synonyms(ChertTable * out,
00593 vector<string>::const_iterator b,
00594 vector<string>::const_iterator e)
00595 {
00596 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00597 for ( ; b != e; ++b) {
00598 ChertTable *in = new ChertTable("synonym", *b, true, DONT_COMPRESS, true);
00599 in->open();
00600 if (!in->empty()) {
00601
00602
00603 pq.push(new MergeCursor(in));
00604 } else {
00605 delete in;
00606 }
00607 }
00608
00609 while (!pq.empty()) {
00610 MergeCursor * cur = pq.top();
00611 pq.pop();
00612
00613 string key = cur->current_key;
00614 if (pq.empty() || pq.top()->current_key > key) {
00615
00616
00617 bool compressed = cur->read_tag(true);
00618 out->add(key, cur->current_tag, compressed);
00619 if (cur->next()) {
00620 pq.push(cur);
00621 } else {
00622 delete cur;
00623 }
00624 continue;
00625 }
00626
00627
00628 string tag;
00629
00630
00631
00632 priority_queue<ByteLengthPrefixedStringItor *,
00633 vector<ByteLengthPrefixedStringItor *>,
00634 ByteLengthPrefixedStringItorGt> pqtag;
00635 vector<MergeCursor *> vec;
00636
00637 while (true) {
00638 cur->read_tag();
00639 pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
00640 vec.push_back(cur);
00641 if (pq.empty() || pq.top()->current_key != key) break;
00642 cur = pq.top();
00643 pq.pop();
00644 }
00645
00646 string lastword;
00647 while (!pqtag.empty()) {
00648 ByteLengthPrefixedStringItor * it = pqtag.top();
00649 if (**it != lastword) {
00650 lastword = **it;
00651 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
00652 tag += lastword;
00653 }
00654 ++*it;
00655 pqtag.pop();
00656 if (!it->at_end()) {
00657 pqtag.push(it);
00658 } else {
00659 delete it;
00660 }
00661 }
00662
00663 vector<MergeCursor *>::const_iterator i;
00664 for (i = vec.begin(); i != vec.end(); ++i) {
00665 cur = *i;
00666 if (cur->next()) {
00667 pq.push(cur);
00668 } else {
00669 delete cur;
00670 }
00671 }
00672
00673 out->add(key, tag);
00674 }
00675 }
00676
00677 static void
00678 multimerge_postlists(Xapian::Compactor & compactor,
00679 ChertTable * out, const char * tmpdir,
00680 Xapian::docid last_docid,
00681 vector<string> tmp, vector<Xapian::docid> off)
00682 {
00683 unsigned int c = 0;
00684 while (tmp.size() > 3) {
00685 vector<string> tmpout;
00686 tmpout.reserve(tmp.size() / 2);
00687 vector<Xapian::docid> newoff;
00688 newoff.resize(tmp.size() / 2);
00689 for (unsigned int i = 0, j; i < tmp.size(); i = j) {
00690 j = i + 2;
00691 if (j == tmp.size() - 1) ++j;
00692
00693 string dest = tmpdir;
00694 char buf[64];
00695 sprintf(buf, "/tmp%u_%u.", c, i / 2);
00696 dest += buf;
00697
00698
00699
00700 ChertTable tmptab("postlist", dest, false);
00701
00702 tmptab.create_and_open(65536);
00703
00704 merge_postlists(compactor, &tmptab, off.begin() + i,
00705 tmp.begin() + i, tmp.begin() + j, 0);
00706 if (c > 0) {
00707 for (unsigned int k = i; k < j; ++k) {
00708 unlink((tmp[k] + "DB").c_str());
00709 unlink((tmp[k] + "baseA").c_str());
00710 unlink((tmp[k] + "baseB").c_str());
00711 }
00712 }
00713 tmpout.push_back(dest);
00714 tmptab.flush_db();
00715 tmptab.commit(1);
00716 }
00717 swap(tmp, tmpout);
00718 swap(off, newoff);
00719 ++c;
00720 }
00721 merge_postlists(compactor,
00722 out, off.begin(), tmp.begin(), tmp.end(), last_docid);
00723 if (c > 0) {
00724 for (size_t k = 0; k < tmp.size(); ++k) {
00725 unlink((tmp[k] + "DB").c_str());
00726 unlink((tmp[k] + "baseA").c_str());
00727 unlink((tmp[k] + "baseB").c_str());
00728 }
00729 }
00730 }
00731
00732 static void
00733 merge_docid_keyed(const char * tablename,
00734 ChertTable *out, const vector<string> & inputs,
00735 const vector<Xapian::docid> & offset, bool lazy)
00736 {
00737 for (size_t i = 0; i < inputs.size(); ++i) {
00738 Xapian::docid off = offset[i];
00739
00740 ChertTable in(tablename, inputs[i], true, DONT_COMPRESS, lazy);
00741 in.open();
00742 if (in.empty()) continue;
00743
00744 ChertCursor cur(&in);
00745 cur.find_entry(string());
00746
00747 string key;
00748 while (cur.next()) {
00749
00750 if (off) {
00751 Xapian::docid did;
00752 const char * d = cur.current_key.data();
00753 const char * e = d + cur.current_key.size();
00754 if (!unpack_uint_preserving_sort(&d, e, &did)) {
00755 string msg = "Bad key in ";
00756 msg += inputs[i];
00757 throw Xapian::DatabaseCorruptError(msg);
00758 }
00759 did += off;
00760 key.resize(0);
00761 pack_uint_preserving_sort(key, did);
00762 if (d != e) {
00763
00764 key.append(d, e - d);
00765 }
00766 } else {
00767 key = cur.current_key;
00768 }
00769 bool compressed = cur.read_tag(true);
00770 out->add(key, cur.current_tag, compressed);
00771 }
00772 }
00773 }
00774
00775 }
00776
00777 using namespace ChertCompact;
00778
00779 void
00780 compact_chert(Xapian::Compactor & compactor,
00781 const char * destdir, const vector<string> & sources,
00782 const vector<Xapian::docid> & offset, size_t block_size,
00783 Xapian::Compactor::compaction_level compaction, bool multipass,
00784 Xapian::docid last_docid) {
00785 enum table_type {
00786 POSTLIST, RECORD, TERMLIST, POSITION, VALUE, SPELLING, SYNONYM
00787 };
00788 struct table_list {
00789
00790 const char * name;
00791
00792 table_type type;
00793
00794 int compress_strategy;
00795
00796 bool lazy;
00797 };
00798
00799 static const table_list tables[] = {
00800
00801 { "postlist", POSTLIST, DONT_COMPRESS, false },
00802 { "record", RECORD, Z_DEFAULT_STRATEGY, false },
00803 { "termlist", TERMLIST, Z_DEFAULT_STRATEGY, false },
00804 { "position", POSITION, DONT_COMPRESS, true },
00805 { "spelling", SPELLING, Z_DEFAULT_STRATEGY, true },
00806 { "synonym", SYNONYM, Z_DEFAULT_STRATEGY, true }
00807 };
00808 const table_list * tables_end = tables +
00809 (sizeof(tables) / sizeof(tables[0]));
00810
00811 for (const table_list * t = tables; t < tables_end; ++t) {
00812
00813
00814
00815
00816
00817 compactor.set_status(t->name, string());
00818
00819 string dest = destdir;
00820 dest += '/';
00821 dest += t->name;
00822 dest += '.';
00823
00824 bool output_will_exist = !t->lazy;
00825
00826
00827
00828 bool bad_stat = false;
00829
00830 off_t in_size = 0;
00831
00832 vector<string> inputs;
00833 inputs.reserve(sources.size());
00834 size_t inputs_present = 0;
00835 for (vector<string>::const_iterator src = sources.begin();
00836 src != sources.end(); ++src) {
00837 string s(*src);
00838 s += t->name;
00839 s += '.';
00840
00841 struct stat sb;
00842 if (stat(s + "DB", &sb) == 0) {
00843 in_size += sb.st_size / 1024;
00844 output_will_exist = true;
00845 ++inputs_present;
00846 } else if (errno != ENOENT) {
00847
00848 bad_stat = true;
00849 output_will_exist = true;
00850 ++inputs_present;
00851 }
00852 inputs.push_back(s);
00853 }
00854
00855
00856 if (t->type == TERMLIST && inputs_present != sources.size()) {
00857 if (inputs_present != 0) {
00858 string m = str(inputs_present);
00859 m += " of ";
00860 m += str(sources.size());
00861 m += " inputs present, so suppressing output";
00862 compactor.set_status(t->name, m);
00863 continue;
00864 }
00865 output_will_exist = false;
00866 }
00867
00868 if (!output_will_exist) {
00869 compactor.set_status(t->name, "doesn't exist");
00870 continue;
00871 }
00872
00873 ChertTable out(t->name, dest, false, t->compress_strategy, t->lazy);
00874 if (!t->lazy) {
00875 out.create_and_open(block_size);
00876 } else {
00877 out.erase();
00878 out.set_block_size(block_size);
00879 }
00880
00881 out.set_full_compaction(compaction != compactor.STANDARD);
00882 if (compaction == compactor.FULLER) out.set_max_item_size(1);
00883
00884 switch (t->type) {
00885 case POSTLIST:
00886 if (multipass && inputs.size() > 3) {
00887 multimerge_postlists(compactor, &out, destdir, last_docid,
00888 inputs, offset);
00889 } else {
00890 merge_postlists(compactor, &out, offset.begin(),
00891 inputs.begin(), inputs.end(),
00892 last_docid);
00893 }
00894 break;
00895 case SPELLING:
00896 merge_spellings(&out, inputs.begin(), inputs.end());
00897 break;
00898 case SYNONYM:
00899 merge_synonyms(&out, inputs.begin(), inputs.end());
00900 break;
00901 default:
00902
00903 merge_docid_keyed(t->name, &out, inputs, offset, t->lazy);
00904 break;
00905 }
00906
00907
00908 out.flush_db();
00909 out.commit(1);
00910
00911 off_t out_size = 0;
00912 if (!bad_stat) {
00913 struct stat sb;
00914 if (stat(dest + "DB", &sb) == 0) {
00915 out_size = sb.st_size / 1024;
00916 } else {
00917 bad_stat = (errno != ENOENT);
00918 }
00919 }
00920 if (bad_stat) {
00921 compactor.set_status(t->name, "Done (couldn't stat all the DB files)");
00922 } else {
00923 string status;
00924 if (out_size == in_size) {
00925 status = "Size unchanged (";
00926 } else {
00927 off_t delta;
00928 if (out_size < in_size) {
00929 delta = in_size - out_size;
00930 status = "Reduced by ";
00931 } else {
00932 delta = out_size - in_size;
00933 status = "INCREASED by ";
00934 }
00935 status += str(100 * delta / in_size);
00936 status += "% ";
00937 status += str(delta);
00938 status += "K (";
00939 status += str(in_size);
00940 status += "K -> ";
00941 }
00942 status += str(out_size);
00943 status += "K)";
00944 compactor.set_status(t->name, status);
00945 }
00946 }
00947 }