00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include "xapian-check-chert.h"
00026
00027 #include "bitstream.h"
00028
00029 #include "internaltypes.h"
00030
00031 #include "chert_check.h"
00032 #include "chert_cursor.h"
00033 #include "chert_table.h"
00034 #include "chert_types.h"
00035 #include "pack.h"
00036 #include "valuestats.h"
00037
00038 #include <xapian.h>
00039
00040 #include "autoptr.h"
00041 #include <iostream>
00042
00043 using namespace std;
00044
/** Test whether a postlist table key belongs to a user metadata entry.
 *
 *  @param key  The raw key read from the table.
 *
 *  @return true if @a key is at least two bytes long and begins with a zero
 *          byte followed by the byte 0xc0.
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    // Too short to carry the two byte prefix.
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
00050
00051 struct VStats : public ValueStats {
00052 Xapian::doccount freq_real;
00053
00054 VStats() : ValueStats(), freq_real(0) {}
00055 };
00056
00057 size_t
00058 check_chert_table(const char * tablename, string filename, int opts,
00059 vector<Xapian::termcount> & doclens,
00060 Xapian::docid db_last_docid)
00061 {
00062 filename += '.';
00063
00064
00065 ChertTableCheck::check(tablename, filename, opts);
00066
00067
00068 ChertTable table(tablename, filename, true);
00069 table.open();
00070 AutoPtr<ChertCursor> cursor(table.cursor_get());
00071
00072 size_t errors = 0;
00073
00074 cursor->find_entry(string());
00075 cursor->next();
00076
00077 if (strcmp(tablename, "postlist") == 0) {
00078
00079 map<Xapian::valueno, VStats> valuestats;
00080 string current_term;
00081 Xapian::docid lastdid = 0;
00082 Xapian::termcount termfreq = 0, collfreq = 0;
00083 Xapian::termcount tf = 0, cf = 0;
00084 bool have_metainfo_key = false;
00085
00086
00087
00088 if (!cursor->after_end()) {
00089 if (cursor->current_key == string("", 1)) {
00090 have_metainfo_key = true;
00091 cursor->read_tag();
00092
00093 totlen_t total_doclen;
00094 Xapian::docid last_docid;
00095 Xapian::termcount doclen_lbound;
00096 Xapian::termcount doclen_ubound;
00097 Xapian::termcount wdf_ubound;
00098
00099 const char * data = cursor->current_tag.data();
00100 const char * end = data + cursor->current_tag.size();
00101 if (!unpack_uint(&data, end, &last_docid)) {
00102 cout << "Tag containing meta information is corrupt (couldn't read last_docid)." << endl;
00103 ++errors;
00104 } else if (!unpack_uint(&data, end, &doclen_lbound)) {
00105 cout << "Tag containing meta information is corrupt (couldn't read doclen_lbound)." << endl;
00106 ++errors;
00107 } else if (!unpack_uint(&data, end, &wdf_ubound)) {
00108 cout << "Tag containing meta information is corrupt (couldn't read wdf_ubound)." << endl;
00109 ++errors;
00110 } else if (!unpack_uint(&data, end, &doclen_ubound)) {
00111 cout << "Tag containing meta information is corrupt (couldn't read doclen_ubound)." << endl;
00112 ++errors;
00113 } else if (!unpack_uint_last(&data, end, &total_doclen)) {
00114 cout << "Tag containing meta information is corrupt (couldn't read total_doclen)." << endl;
00115 ++errors;
00116 } else if (data != end) {
00117 cout << "Tag containing meta information is corrupt (junk at end)." << endl;
00118 ++errors;
00119 }
00120 cursor->next();
00121 }
00122 }
00123
00124 bool seen_doclen_initial_chunk = false;
00125 for ( ; !cursor->after_end(); cursor->next()) {
00126 string & key = cursor->current_key;
00127
00128 if (is_user_metadata_key(key)) {
00129
00130
00131 cursor->read_tag();
00132 if (cursor->current_tag.empty()) {
00133 cout << "User metadata item is empty" << endl;
00134 ++errors;
00135 }
00136 continue;
00137 }
00138
00139 if (!have_metainfo_key) {
00140 cout << "METAINFO key missing from postlist table" << endl;
00141 ++errors;
00142 }
00143
00144 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
00145
00146 const char * pos, * end;
00147 Xapian::docid did = 1;
00148 if (key.size() > 2) {
00149
00150 if (!seen_doclen_initial_chunk) {
00151 cout << "Doclen initial chunk missing" << endl;
00152 ++errors;
00153 }
00154 pos = key.data();
00155 end = pos + key.size();
00156 pos += 2;
00157 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00158 cout << "Error unpacking docid from doclen key" << endl;
00159 ++errors;
00160 continue;
00161 }
00162 }
00163 seen_doclen_initial_chunk = true;
00164
00165 cursor->read_tag();
00166 pos = cursor->current_tag.data();
00167 end = pos + cursor->current_tag.size();
00168 if (key.size() == 2) {
00169
00170 if (end - pos < 2 || pos[0] || pos[1]) {
00171 cout << "Initial doclen chunk has nonzero dummy fields" << endl;
00172 ++errors;
00173 continue;
00174 }
00175 pos += 2;
00176 if (!unpack_uint(&pos, end, &did)) {
00177 cout << "Failed to unpack firstdid for doclen" << endl;
00178 ++errors;
00179 continue;
00180 }
00181 ++did;
00182 if (did <= lastdid) {
00183 cout << "First did in this chunk is <= last in "
00184 "prev chunk" << endl;
00185 ++errors;
00186 }
00187 }
00188
00189 bool is_last_chunk;
00190 if (!unpack_bool(&pos, end, &is_last_chunk)) {
00191 cout << "Failed to unpack last chunk flag for doclen" << endl;
00192 ++errors;
00193 continue;
00194 }
00195
00196 if (!unpack_uint(&pos, end, &lastdid)) {
00197 cout << "Failed to unpack increase to last" << endl;
00198 ++errors;
00199 continue;
00200 }
00201 lastdid += did;
00202 bool bad = false;
00203 while (true) {
00204 Xapian::termcount doclen;
00205 if (!unpack_uint(&pos, end, &doclen)) {
00206 cout << "Failed to unpack doclen" << endl;
00207 ++errors;
00208 bad = true;
00209 break;
00210 }
00211
00212 if (did > db_last_docid) {
00213 cout << "document id " << did << " in doclen stream "
00214 << "is larger than get_last_docid() "
00215 << db_last_docid << endl;
00216 ++errors;
00217 }
00218
00219 if (!doclens.empty()) {
00220
00221
00222 Xapian::termcount termlist_doclen = 0;
00223 if (did < doclens.size())
00224 termlist_doclen = doclens[did];
00225
00226 if (doclen != termlist_doclen) {
00227 cout << "document id " << did << ": length "
00228 << doclen << " doesn't match "
00229 << termlist_doclen << " in the termlist table"
00230 << endl;
00231 ++errors;
00232 }
00233 }
00234
00235 if (pos == end) break;
00236
00237 Xapian::docid inc;
00238 if (!unpack_uint(&pos, end, &inc)) {
00239 cout << "Failed to unpack docid increase" << endl;
00240 ++errors;
00241 bad = true;
00242 break;
00243 }
00244 ++inc;
00245 did += inc;
00246 if (did > lastdid) {
00247 cout << "docid " << did << " > last docid " << lastdid
00248 << endl;
00249 ++errors;
00250 }
00251 }
00252 if (bad) {
00253 continue;
00254 }
00255 if (is_last_chunk) {
00256 if (did != lastdid) {
00257 cout << "lastdid " << lastdid << " != last did " << did
00258 << endl;
00259 ++errors;
00260 }
00261 }
00262
00263 continue;
00264 }
00265
00266 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
00267
00268 const char * p = key.data();
00269 const char * end = p + key.length();
00270 p += 2;
00271 Xapian::valueno slot;
00272 if (!unpack_uint_last(&p, end, &slot)) {
00273 cout << "Bad valuestats key (no slot)" << endl;
00274 ++errors;
00275 continue;
00276 }
00277
00278 cursor->read_tag();
00279 p = cursor->current_tag.data();
00280 end = p + cursor->current_tag.size();
00281
00282 VStats & v = valuestats[slot];
00283 if (!unpack_uint(&p, end, &v.freq)) {
00284 if (*p == 0) {
00285 cout << "Incomplete stats item in value table" << endl;
00286 } else {
00287 cout << "Frequency statistic in value table is too large" << endl;
00288 }
00289 ++errors;
00290 continue;
00291 }
00292 if (!unpack_string(&p, end, v.lower_bound)) {
00293 if (*p == 0) {
00294 cout << "Incomplete stats item in value table" << endl;
00295 } else {
00296 cout << "Lower bound statistic in value table is too large" << endl;
00297 }
00298 ++errors;
00299 continue;
00300 }
00301 size_t len = end - p;
00302 if (len == 0) {
00303 v.upper_bound = v.lower_bound;
00304 } else {
00305 v.upper_bound.assign(p, len);
00306 }
00307
00308 continue;
00309 }
00310
00311 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
00312
00313 const char * p = key.data();
00314 const char * end = p + key.length();
00315 p += 2;
00316 Xapian::valueno slot;
00317 if (!unpack_uint(&p, end, &slot)) {
00318 cout << "Bad value chunk key (no slot)" << endl;
00319 ++errors;
00320 continue;
00321 }
00322 Xapian::docid did;
00323 if (!unpack_uint_preserving_sort(&p, end, &did)) {
00324 cout << "Bad value chunk key (no docid)" << endl;
00325 ++errors;
00326 continue;
00327 }
00328 if (p != end) {
00329 cout << "Bad value chunk key (trailing junk)" << endl;
00330 ++errors;
00331 continue;
00332 }
00333
00334 VStats & v = valuestats[slot];
00335
00336 cursor->read_tag();
00337 p = cursor->current_tag.data();
00338 end = p + cursor->current_tag.size();
00339
00340 while (true) {
00341 string value;
00342 if (!unpack_string(&p, end, value)) {
00343 cout << "Failed to unpack value from chunk" << endl;
00344 ++errors;
00345 break;
00346 }
00347
00348 ++v.freq_real;
00349
00350
00351
00352
00353
00354
00355
00356 if (value < v.lower_bound) {
00357 cout << "Value slot " << slot << " has value below "
00358 "lower bound: '" << value << "' < '"
00359 << v.lower_bound << "'" << endl;
00360 ++errors;
00361 } else if (value > v.upper_bound) {
00362 cout << "Value slot " << slot << " has value above "
00363 "upper bound: '" << value << "' > '"
00364 << v.upper_bound << "'" << endl;
00365 ++errors;
00366 }
00367
00368 if (p == end) break;
00369 Xapian::docid delta;
00370 if (!unpack_uint(&p, end, &delta)) {
00371 cout << "Failed to unpack docid delta from chunk" << endl;
00372 ++errors;
00373 break;
00374 }
00375 Xapian::docid new_did = did + delta + 1;
00376 if (new_did <= did) {
00377 cout << "docid overflowed in value chunk" << endl;
00378 ++errors;
00379 break;
00380 }
00381 did = new_did;
00382
00383 if (did > db_last_docid) {
00384 cout << "document id " << did << " in value chunk "
00385 << "is larger than get_last_docid() "
00386 << db_last_docid << endl;
00387 ++errors;
00388 }
00389 }
00390 continue;
00391 }
00392
00393 const char * pos, * end;
00394
00395
00396 pos = key.data();
00397 end = pos + key.size();
00398
00399 string term;
00400 Xapian::docid did;
00401 if (!unpack_string_preserving_sort(&pos, end, term)) {
00402 cout << "Error unpacking termname from key" << endl;
00403 ++errors;
00404 continue;
00405 }
00406 if (!current_term.empty() && term != current_term) {
00407
00408 if (pos == end) {
00409 cout << "No last chunk for term `" << current_term
00410 << "'" << endl;
00411 current_term.resize(0);
00412 } else {
00413 cout << "Mismatch in follow-on chunk in posting "
00414 "list for term `" << current_term << "' (got `"
00415 << term << "')" << endl;
00416 current_term = term;
00417 tf = cf = 0;
00418 lastdid = 0;
00419 }
00420 ++errors;
00421 }
00422 if (pos == end) {
00423
00424 if (term == current_term) {
00425
00426 cout << "First posting list chunk for term `"
00427 << term << "' follows previous chunk for the same "
00428 "term" << endl;
00429 ++errors;
00430 }
00431 current_term = term;
00432 tf = cf = 0;
00433
00434
00435 cursor->read_tag();
00436 pos = cursor->current_tag.data();
00437 end = pos + cursor->current_tag.size();
00438 if (!unpack_uint(&pos, end, &termfreq)) {
00439 cout << "Failed to unpack termfreq for term `" << term
00440 << "'" << endl;
00441 ++errors;
00442 continue;
00443 }
00444 if (!unpack_uint(&pos, end, &collfreq)) {
00445 cout << "Failed to unpack collfreq for term `" << term
00446 << "'" << endl;
00447 ++errors;
00448 continue;
00449 }
00450 if (!unpack_uint(&pos, end, &did)) {
00451 cout << "Failed to unpack firstdid for term `" << term
00452 << "'" << endl;
00453 ++errors;
00454 continue;
00455 }
00456 ++did;
00457 } else {
00458
00459 if (current_term.empty()) {
00460 cout << "First chunk for term `" << current_term << "' "
00461 "is a continuation chunk" << endl;
00462 ++errors;
00463 current_term = term;
00464 }
00465 AssertEq(current_term, term);
00466 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00467 cout << "Failed to unpack did from key" << endl;
00468 ++errors;
00469 continue;
00470 }
00471 if (did <= lastdid) {
00472 cout << "First did in this chunk is <= last in "
00473 "prev chunk" << endl;
00474 ++errors;
00475 }
00476 cursor->read_tag();
00477 pos = cursor->current_tag.data();
00478 end = pos + cursor->current_tag.size();
00479 }
00480
00481 bool is_last_chunk;
00482 if (!unpack_bool(&pos, end, &is_last_chunk)) {
00483 cout << "Failed to unpack last chunk flag" << endl;
00484 ++errors;
00485 continue;
00486 }
00487
00488 if (!unpack_uint(&pos, end, &lastdid)) {
00489 cout << "Failed to unpack increase to last" << endl;
00490 ++errors;
00491 continue;
00492 }
00493 lastdid += did;
00494 bool bad = false;
00495 while (true) {
00496 Xapian::termcount wdf;
00497 if (!unpack_uint(&pos, end, &wdf)) {
00498 cout << "Failed to unpack wdf" << endl;
00499 ++errors;
00500 bad = true;
00501 break;
00502 }
00503 ++tf;
00504 cf += wdf;
00505
00506 if (pos == end) break;
00507
00508 Xapian::docid inc;
00509 if (!unpack_uint(&pos, end, &inc)) {
00510 cout << "Failed to unpack docid increase" << endl;
00511 ++errors;
00512 bad = true;
00513 break;
00514 }
00515 ++inc;
00516 did += inc;
00517 if (did > lastdid) {
00518 cout << "docid " << did << " > last docid " << lastdid
00519 << endl;
00520 ++errors;
00521 }
00522 }
00523 if (bad) {
00524 continue;
00525 }
00526 if (is_last_chunk) {
00527 if (tf != termfreq) {
00528 cout << "termfreq " << termfreq << " != # of entries "
00529 << tf << endl;
00530 ++errors;
00531 }
00532 if (cf != collfreq) {
00533 cout << "collfreq " << collfreq << " != sum wdf " << cf
00534 << endl;
00535 ++errors;
00536 }
00537 if (did != lastdid) {
00538 cout << "lastdid " << lastdid << " != last did " << did
00539 << endl;
00540 ++errors;
00541 }
00542 current_term.resize(0);
00543 }
00544 }
00545 if (!current_term.empty()) {
00546 cout << "Last term `" << current_term << "' has no last chunk"
00547 << endl;
00548 ++errors;
00549 }
00550
00551 map<Xapian::valueno, VStats>::const_iterator i;
00552 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
00553 if (i->second.freq != i->second.freq_real) {
00554 cout << "Value stats frequency for slot " << i->first << " is "
00555 << i->second.freq << " but recounting gives "
00556 << i->second.freq_real << endl;
00557 ++errors;
00558 }
00559 }
00560 } else if (strcmp(tablename, "record") == 0) {
00561
00562
00563 for ( ; !cursor->after_end(); cursor->next()) {
00564 string & key = cursor->current_key;
00565
00566
00567 const char * pos = key.data();
00568 const char * end = pos + key.size();
00569
00570 Xapian::docid did;
00571 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00572 cout << "Error unpacking docid from key" << endl;
00573 ++errors;
00574 } else if (pos != end) {
00575 cout << "Extra junk in key" << endl;
00576 ++errors;
00577 }
00578 }
00579 } else if (strcmp(tablename, "termlist") == 0) {
00580
00581 for ( ; !cursor->after_end(); cursor->next()) {
00582 string & key = cursor->current_key;
00583
00584
00585 const char * pos = key.data();
00586 const char * end = pos + key.size();
00587
00588 Xapian::docid did;
00589 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00590 cout << "Error unpacking docid from key" << endl;
00591 ++errors;
00592 continue;
00593 }
00594
00595 if (end - pos == 1 && *pos == '\0') {
00596
00597 cursor->read_tag();
00598
00599 pos = cursor->current_tag.data();
00600 end = pos + cursor->current_tag.size();
00601
00602 if (pos == end) {
00603 cout << "Empty value slots used tag" << endl;
00604 ++errors;
00605 continue;
00606 }
00607
00608 Xapian::valueno prev_slot;
00609 if (!unpack_uint(&pos, end, &prev_slot)) {
00610 cout << "Value slot encoding corrupt" << endl;
00611 ++errors;
00612 continue;
00613 }
00614
00615 while (pos != end) {
00616 Xapian::valueno slot;
00617 if (!unpack_uint(&pos, end, &slot)) {
00618 cout << "Value slot encoding corrupt" << endl;
00619 ++errors;
00620 break;
00621 }
00622 slot += prev_slot + 1;
00623 if (slot <= prev_slot) {
00624 cout << "Value slot number overflowed (" << prev_slot << " -> " << slot << ")" << endl;
00625 ++errors;
00626 }
00627 prev_slot = slot;
00628 }
00629 continue;
00630 }
00631
00632 if (pos != end) {
00633 cout << "Extra junk in key" << endl;
00634 ++errors;
00635 continue;
00636 }
00637
00638 cursor->read_tag();
00639
00640 pos = cursor->current_tag.data();
00641 end = pos + cursor->current_tag.size();
00642
00643 if (pos == end) {
00644
00645 continue;
00646 }
00647
00648 Xapian::termcount doclen, termlist_size;
00649
00650
00651 if (!unpack_uint(&pos, end, &doclen)) {
00652 if (pos != 0) {
00653 cout << "doclen out of range" << endl;
00654 } else {
00655 cout << "Unexpected end of data when reading doclen" << endl;
00656 }
00657 ++errors;
00658 continue;
00659 }
00660
00661
00662 if (!unpack_uint(&pos, end, &termlist_size)) {
00663 if (pos != 0) {
00664 cout << "termlist_size out of range" << endl;
00665 } else {
00666 cout << "Unexpected end of data when reading termlist_size" << endl;
00667 }
00668 ++errors;
00669 continue;
00670 }
00671
00672 Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
00673 string current_tname;
00674
00675 bool bad = false;
00676 while (pos != end) {
00677 Xapian::doccount current_wdf = 0;
00678 bool got_wdf = false;
00679
00680 if (!current_tname.empty()) {
00681 string::size_type len = static_cast<unsigned char>(*pos++);
00682 if (len > current_tname.length()) {
00683
00684 current_wdf = len / (current_tname.length() + 1) - 1;
00685 len %= (current_tname.length() + 1);
00686 got_wdf = true;
00687 }
00688 current_tname.resize(len);
00689 }
00690
00691
00692 string::size_type len = static_cast<unsigned char>(*pos++);
00693 current_tname.append(pos, len);
00694 pos += len;
00695
00696 if (!got_wdf) {
00697
00698 if (!unpack_uint(&pos, end, ¤t_wdf)) {
00699 if (pos == 0) {
00700 cout << "Unexpected end of data when reading termlist current_wdf" << endl;
00701 } else {
00702 cout << "Size of wdf out of range, in termlist" << endl;
00703 }
00704 ++errors;
00705 bad = true;
00706 break;
00707 }
00708 }
00709
00710 ++actual_termlist_size;
00711 actual_doclen += current_wdf;
00712 }
00713 if (bad) {
00714 continue;
00715 }
00716
00717 if (termlist_size != actual_termlist_size) {
00718 cout << "termlist_size != # of entries in termlist" << endl;
00719 ++errors;
00720 }
00721 if (doclen != actual_doclen) {
00722 cout << "doclen != sum(wdf)" << endl;
00723 ++errors;
00724 }
00725
00726
00727 if (doclens.size() <= did) doclens.resize(did + 1);
00728 doclens[did] = actual_doclen;
00729 }
00730 } else if (strcmp(tablename, "position") == 0) {
00731
00732 for ( ; !cursor->after_end(); cursor->next()) {
00733 string & key = cursor->current_key;
00734
00735
00736 const char * pos = key.data();
00737 const char * end = pos + key.size();
00738
00739 Xapian::docid did;
00740 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00741 cout << "Error unpacking docid from key" << endl;
00742 ++errors;
00743 continue;
00744 }
00745 if (pos == end) {
00746 cout << "No termname in key" << endl;
00747 ++errors;
00748 continue;
00749 }
00750
00751 cursor->read_tag();
00752
00753 const string & data = cursor->current_tag;
00754 pos = data.data();
00755 end = pos + data.size();
00756
00757 Xapian::termpos pos_last;
00758 if (!unpack_uint(&pos, end, &pos_last)) {
00759 cout << tablename << " table: Position list data corrupt" << endl;
00760 ++errors;
00761 continue;
00762 }
00763 if (pos == end) {
00764
00765 } else {
00766
00767 BitReader rd(data, pos - data.data());
00768 Xapian::termpos pos_first = rd.decode(pos_last);
00769 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00770 vector<Xapian::termpos> positions;
00771 positions.resize(pos_size);
00772 positions[0] = pos_first;
00773 positions.back() = pos_last;
00774 rd.decode_interpolative(positions, 0, pos_size - 1);
00775 vector<Xapian::termpos>::const_iterator current_pos = positions.begin();
00776 Xapian::termpos lastpos = *current_pos++;
00777 while (current_pos != positions.end()) {
00778 Xapian::termpos termpos = *current_pos++;
00779 if (termpos <= lastpos) {
00780 cout << tablename << " table: Positions not strictly monotonically increasing" << endl;
00781 ++errors;
00782 break;
00783 }
00784 lastpos = termpos;
00785 }
00786 }
00787 }
00788 } else {
00789 cout << tablename << " table: Don't know how to check structure\n" << endl;
00790 return errors;
00791 }
00792
00793 if (!errors)
00794 cout << tablename << " table structure checked OK\n" << endl;
00795 else
00796 cout << tablename << " table errors found: " << errors << "\n" << endl;
00797
00798 return errors;
00799 }