00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include "xapian-check-brass.h"
00026
00027 #include "bitstream.h"
00028
00029 #include "internaltypes.h"
00030
00031 #include "brass_check.h"
00032 #include "brass_cursor.h"
00033 #include "brass_table.h"
00034 #include "brass_types.h"
00035 #include "pack.h"
00036 #include "valuestats.h"
00037
00038 #include <xapian.h>
00039
00040 #include "autoptr.h"
00041 #include <iostream>
00042
00043 using namespace std;
00044
/** Test whether a postlist table key has the user metadata prefix.
 *
 *  @param key	The raw key to classify.
 *
 *  @return true if @a key is at least two bytes long and starts with the
 *	    bytes '\0' '\xc0'; false otherwise.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
00050
00051 struct VStats : public ValueStats {
00052 Xapian::doccount freq_real;
00053
00054 VStats() : ValueStats(), freq_real(0) {}
00055 };
00056
00057 size_t
00058 check_brass_table(const char * tablename, string filename, int opts,
00059 vector<Xapian::termcount> & doclens,
00060 Xapian::docid db_last_docid)
00061 {
00062 filename += '.';
00063
00064
00065 BrassTableCheck::check(tablename, filename, opts);
00066
00067
00068 BrassTable table(tablename, filename, true);
00069 table.open();
00070 AutoPtr<BrassCursor> cursor(table.cursor_get());
00071
00072 size_t errors = 0;
00073
00074 cursor->find_entry(string());
00075 cursor->next();
00076
00077 if (strcmp(tablename, "postlist") == 0) {
00078
00079 map<Xapian::valueno, VStats> valuestats;
00080 string current_term;
00081 Xapian::docid lastdid = 0;
00082 Xapian::termcount termfreq = 0, collfreq = 0;
00083 Xapian::termcount tf = 0, cf = 0;
00084 bool have_metainfo_key = false;
00085
00086
00087
00088 if (!cursor->after_end()) {
00089 if (cursor->current_key == string("", 1)) {
00090 have_metainfo_key = true;
00091 cursor->read_tag();
00092
00093 totlen_t total_doclen;
00094 Xapian::docid last_docid;
00095 Xapian::termcount doclen_lbound;
00096 Xapian::termcount doclen_ubound;
00097 Xapian::termcount wdf_ubound;
00098
00099 const char * data = cursor->current_tag.data();
00100 const char * end = data + cursor->current_tag.size();
00101 if (!unpack_uint(&data, end, &last_docid)) {
00102 cout << "Tag containing meta information is corrupt (couldn't read last_docid)." << endl;
00103 ++errors;
00104 } else if (!unpack_uint(&data, end, &doclen_lbound)) {
00105 cout << "Tag containing meta information is corrupt (couldn't read doclen_lbound)." << endl;
00106 ++errors;
00107 } else if (!unpack_uint(&data, end, &wdf_ubound)) {
00108 cout << "Tag containing meta information is corrupt (couldn't read wdf_ubound)." << endl;
00109 ++errors;
00110 } else if (!unpack_uint(&data, end, &doclen_ubound)) {
00111 cout << "Tag containing meta information is corrupt (couldn't read doclen_ubound)." << endl;
00112 ++errors;
00113 } else if (!unpack_uint_last(&data, end, &total_doclen)) {
00114 cout << "Tag containing meta information is corrupt (couldn't read total_doclen)." << endl;
00115 ++errors;
00116 } else if (data != end) {
00117 cout << "Tag containing meta information is corrupt (junk at end)." << endl;
00118 ++errors;
00119 }
00120 cursor->next();
00121 }
00122 }
00123
00124 for ( ; !cursor->after_end(); cursor->next()) {
00125 string & key = cursor->current_key;
00126
00127 if (is_user_metadata_key(key)) {
00128
00129
00130 cursor->read_tag();
00131 if (cursor->current_tag.empty()) {
00132 cout << "User metadata item is empty" << endl;
00133 ++errors;
00134 }
00135 continue;
00136 }
00137
00138 if (!have_metainfo_key) {
00139 cout << "METAINFO key missing from postlist table" << endl;
00140 ++errors;
00141 }
00142
00143 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
00144
00145 const char * pos, * end;
00146 Xapian::docid did = 1;
00147 if (key.size() > 2) {
00148
00149 pos = key.data();
00150 end = pos + key.size();
00151 pos += 2;
00152 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00153 cout << "Error unpacking docid from doclen key" << endl;
00154 ++errors;
00155 continue;
00156 }
00157 }
00158
00159 cursor->read_tag();
00160 pos = cursor->current_tag.data();
00161 end = pos + cursor->current_tag.size();
00162 if (key.size() == 2) {
00163
00164 if (end - pos < 2 || pos[0] || pos[1]) {
00165 cout << "Initial doclen chunk has nonzero dummy fields" << endl;
00166 ++errors;
00167 continue;
00168 }
00169 pos += 2;
00170 if (!unpack_uint(&pos, end, &did)) {
00171 cout << "Failed to unpack firstdid for doclen" << endl;
00172 ++errors;
00173 continue;
00174 }
00175 ++did;
00176 if (did <= lastdid) {
00177 cout << "First did in this chunk is <= last in "
00178 "prev chunk" << endl;
00179 ++errors;
00180 }
00181 }
00182
00183 bool is_last_chunk;
00184 if (!unpack_bool(&pos, end, &is_last_chunk)) {
00185 cout << "Failed to unpack last chunk flag for doclen" << endl;
00186 ++errors;
00187 continue;
00188 }
00189
00190 if (!unpack_uint(&pos, end, &lastdid)) {
00191 cout << "Failed to unpack increase to last" << endl;
00192 ++errors;
00193 continue;
00194 }
00195 lastdid += did;
00196 bool bad = false;
00197 while (true) {
00198 Xapian::termcount doclen;
00199 if (!unpack_uint(&pos, end, &doclen)) {
00200 cout << "Failed to unpack doclen" << endl;
00201 ++errors;
00202 bad = true;
00203 break;
00204 }
00205
00206 if (did > db_last_docid) {
00207 cout << "document id " << did << " in doclen stream "
00208 << "is larger than get_last_docid() "
00209 << db_last_docid << endl;
00210 ++errors;
00211 }
00212
00213 if (!doclens.empty()) {
00214
00215
00216 Xapian::termcount termlist_doclen = 0;
00217 if (did < doclens.size())
00218 termlist_doclen = doclens[did];
00219
00220 if (doclen != termlist_doclen) {
00221 cout << "document id " << did << ": length "
00222 << doclen << " doesn't match "
00223 << termlist_doclen << " in the termlist table"
00224 << endl;
00225 ++errors;
00226 }
00227 }
00228
00229 if (pos == end) break;
00230
00231 Xapian::docid inc;
00232 if (!unpack_uint(&pos, end, &inc)) {
00233 cout << "Failed to unpack docid increase" << endl;
00234 ++errors;
00235 bad = true;
00236 break;
00237 }
00238 ++inc;
00239 did += inc;
00240 if (did > lastdid) {
00241 cout << "docid " << did << " > last docid " << lastdid
00242 << endl;
00243 ++errors;
00244 }
00245 }
00246 if (bad) {
00247 continue;
00248 }
00249 if (is_last_chunk) {
00250 if (did != lastdid) {
00251 cout << "lastdid " << lastdid << " != last did " << did
00252 << endl;
00253 ++errors;
00254 }
00255 }
00256
00257 continue;
00258 }
00259
00260 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
00261
00262 const char * p = key.data();
00263 const char * end = p + key.length();
00264 p += 2;
00265 Xapian::valueno slot;
00266 if (!unpack_uint_last(&p, end, &slot)) {
00267 cout << "Bad valuestats key (no slot)" << endl;
00268 ++errors;
00269 continue;
00270 }
00271
00272 cursor->read_tag();
00273 p = cursor->current_tag.data();
00274 end = p + cursor->current_tag.size();
00275
00276 VStats & v = valuestats[slot];
00277 if (!unpack_uint(&p, end, &v.freq)) {
00278 if (*p == 0) {
00279 cout << "Incomplete stats item in value table" << endl;
00280 } else {
00281 cout << "Frequency statistic in value table is too large" << endl;
00282 }
00283 ++errors;
00284 continue;
00285 }
00286 if (!unpack_string(&p, end, v.lower_bound)) {
00287 if (*p == 0) {
00288 cout << "Incomplete stats item in value table" << endl;
00289 } else {
00290 cout << "Lower bound statistic in value table is too large" << endl;
00291 }
00292 ++errors;
00293 continue;
00294 }
00295 size_t len = end - p;
00296 if (len == 0) {
00297 v.upper_bound = v.lower_bound;
00298 } else {
00299 v.upper_bound.assign(p, len);
00300 }
00301
00302 continue;
00303 }
00304
00305 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
00306
00307 const char * p = key.data();
00308 const char * end = p + key.length();
00309 p += 2;
00310 Xapian::valueno slot;
00311 if (!unpack_uint(&p, end, &slot)) {
00312 cout << "Bad value chunk key (no slot)" << endl;
00313 ++errors;
00314 continue;
00315 }
00316 Xapian::docid did;
00317 if (!unpack_uint_preserving_sort(&p, end, &did)) {
00318 cout << "Bad value chunk key (no docid)" << endl;
00319 ++errors;
00320 continue;
00321 }
00322 if (p != end) {
00323 cout << "Bad value chunk key (trailing junk)" << endl;
00324 ++errors;
00325 continue;
00326 }
00327
00328 VStats & v = valuestats[slot];
00329
00330 cursor->read_tag();
00331 p = cursor->current_tag.data();
00332 end = p + cursor->current_tag.size();
00333
00334 while (true) {
00335 string value;
00336 if (!unpack_string(&p, end, value)) {
00337 cout << "Failed to unpack value from chunk" << endl;
00338 ++errors;
00339 break;
00340 }
00341
00342 ++v.freq_real;
00343
00344
00345
00346
00347
00348
00349
00350 if (value < v.lower_bound) {
00351 cout << "Value slot " << slot << " has value below "
00352 "lower bound: '" << value << "' < '"
00353 << v.lower_bound << "'" << endl;
00354 ++errors;
00355 } else if (value > v.upper_bound) {
00356 cout << "Value slot " << slot << " has value above "
00357 "upper bound: '" << value << "' > '"
00358 << v.upper_bound << "'" << endl;
00359 ++errors;
00360 }
00361
00362 if (p == end) break;
00363 Xapian::docid delta;
00364 if (!unpack_uint(&p, end, &delta)) {
00365 cout << "Failed to unpack docid delta from chunk" << endl;
00366 ++errors;
00367 break;
00368 }
00369 Xapian::docid new_did = did + delta + 1;
00370 if (new_did <= did) {
00371 cout << "docid overflowed in value chunk" << endl;
00372 ++errors;
00373 break;
00374 }
00375 did = new_did;
00376
00377 if (did > db_last_docid) {
00378 cout << "document id " << did << " in value chunk "
00379 << "is larger than get_last_docid() "
00380 << db_last_docid << endl;
00381 ++errors;
00382 }
00383 }
00384 continue;
00385 }
00386
00387 const char * pos, * end;
00388
00389
00390 pos = key.data();
00391 end = pos + key.size();
00392
00393 string term;
00394 Xapian::docid did;
00395 if (!unpack_string_preserving_sort(&pos, end, term)) {
00396 cout << "Error unpacking termname from key" << endl;
00397 ++errors;
00398 continue;
00399 }
00400 if (!current_term.empty() && term != current_term) {
00401
00402 if (pos == end) {
00403 cout << "No last chunk for term `" << current_term
00404 << "'" << endl;
00405 current_term.resize(0);
00406 } else {
00407 cout << "Mismatch in follow-on chunk in posting "
00408 "list for term `" << current_term << "' (got `"
00409 << term << "')" << endl;
00410 current_term = term;
00411 tf = cf = 0;
00412 lastdid = 0;
00413 }
00414 ++errors;
00415 }
00416 if (pos == end) {
00417
00418 if (term == current_term) {
00419
00420 cout << "First posting list chunk for term `"
00421 << term << "' follows previous chunk for the same "
00422 "term" << endl;
00423 ++errors;
00424 }
00425 current_term = term;
00426 tf = cf = 0;
00427
00428
00429 cursor->read_tag();
00430 pos = cursor->current_tag.data();
00431 end = pos + cursor->current_tag.size();
00432 if (!unpack_uint(&pos, end, &termfreq)) {
00433 cout << "Failed to unpack termfreq for term `" << term
00434 << "'" << endl;
00435 ++errors;
00436 continue;
00437 }
00438 if (!unpack_uint(&pos, end, &collfreq)) {
00439 cout << "Failed to unpack collfreq for term `" << term
00440 << "'" << endl;
00441 ++errors;
00442 continue;
00443 }
00444 if (!unpack_uint(&pos, end, &did)) {
00445 cout << "Failed to unpack firstdid for term `" << term
00446 << "'" << endl;
00447 ++errors;
00448 continue;
00449 }
00450 ++did;
00451 } else {
00452
00453 if (current_term.empty()) {
00454 cout << "First chunk for term `" << current_term << "' "
00455 "is a continuation chunk" << endl;
00456 ++errors;
00457 current_term = term;
00458 }
00459 AssertEq(current_term, term);
00460 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00461 cout << "Failed to unpack did from key" << endl;
00462 ++errors;
00463 continue;
00464 }
00465 if (did <= lastdid) {
00466 cout << "First did in this chunk is <= last in "
00467 "prev chunk" << endl;
00468 ++errors;
00469 }
00470 cursor->read_tag();
00471 pos = cursor->current_tag.data();
00472 end = pos + cursor->current_tag.size();
00473 }
00474
00475 bool is_last_chunk;
00476 if (!unpack_bool(&pos, end, &is_last_chunk)) {
00477 cout << "Failed to unpack last chunk flag" << endl;
00478 ++errors;
00479 continue;
00480 }
00481
00482 if (!unpack_uint(&pos, end, &lastdid)) {
00483 cout << "Failed to unpack increase to last" << endl;
00484 ++errors;
00485 continue;
00486 }
00487 lastdid += did;
00488 bool bad = false;
00489 while (true) {
00490 Xapian::termcount wdf;
00491 if (!unpack_uint(&pos, end, &wdf)) {
00492 cout << "Failed to unpack wdf" << endl;
00493 ++errors;
00494 bad = true;
00495 break;
00496 }
00497 ++tf;
00498 cf += wdf;
00499
00500 if (pos == end) break;
00501
00502 Xapian::docid inc;
00503 if (!unpack_uint(&pos, end, &inc)) {
00504 cout << "Failed to unpack docid increase" << endl;
00505 ++errors;
00506 bad = true;
00507 break;
00508 }
00509 ++inc;
00510 did += inc;
00511 if (did > lastdid) {
00512 cout << "docid " << did << " > last docid " << lastdid
00513 << endl;
00514 ++errors;
00515 }
00516 }
00517 if (bad) {
00518 continue;
00519 }
00520 if (is_last_chunk) {
00521 if (tf != termfreq) {
00522 cout << "termfreq " << termfreq << " != # of entries "
00523 << tf << endl;
00524 ++errors;
00525 }
00526 if (cf != collfreq) {
00527 cout << "collfreq " << collfreq << " != sum wdf " << cf
00528 << endl;
00529 ++errors;
00530 }
00531 if (did != lastdid) {
00532 cout << "lastdid " << lastdid << " != last did " << did
00533 << endl;
00534 ++errors;
00535 }
00536 current_term.resize(0);
00537 }
00538 }
00539 if (!current_term.empty()) {
00540 cout << "Last term `" << current_term << "' has no last chunk"
00541 << endl;
00542 ++errors;
00543 }
00544
00545 map<Xapian::valueno, VStats>::const_iterator i;
00546 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
00547 if (i->second.freq != i->second.freq_real) {
00548 cout << "Value stats frequency for slot " << i->first << " is "
00549 << i->second.freq << " but recounting gives "
00550 << i->second.freq_real << endl;
00551 ++errors;
00552 }
00553 }
00554 } else if (strcmp(tablename, "record") == 0) {
00555
00556
00557 for ( ; !cursor->after_end(); cursor->next()) {
00558 string & key = cursor->current_key;
00559
00560
00561 const char * pos = key.data();
00562 const char * end = pos + key.size();
00563
00564 Xapian::docid did;
00565 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00566 cout << "Error unpacking docid from key" << endl;
00567 ++errors;
00568 } else if (pos != end) {
00569 cout << "Extra junk in key" << endl;
00570 ++errors;
00571 }
00572 }
00573 } else if (strcmp(tablename, "termlist") == 0) {
00574
00575 for ( ; !cursor->after_end(); cursor->next()) {
00576 string & key = cursor->current_key;
00577
00578
00579 const char * pos = key.data();
00580 const char * end = pos + key.size();
00581
00582 Xapian::docid did;
00583 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00584 cout << "Error unpacking docid from key" << endl;
00585 ++errors;
00586 continue;
00587 }
00588
00589 if (end - pos == 1 && *pos == '\0') {
00590
00591 cursor->read_tag();
00592
00593 pos = cursor->current_tag.data();
00594 end = pos + cursor->current_tag.size();
00595
00596 if (pos == end) {
00597 cout << "Empty value slots used tag" << endl;
00598 ++errors;
00599 continue;
00600 }
00601
00602 Xapian::valueno prev_slot;
00603 if (!unpack_uint(&pos, end, &prev_slot)) {
00604 cout << "Value slot encoding corrupt" << endl;
00605 ++errors;
00606 continue;
00607 }
00608
00609 while (pos != end) {
00610 Xapian::valueno slot;
00611 if (!unpack_uint(&pos, end, &slot)) {
00612 cout << "Value slot encoding corrupt" << endl;
00613 ++errors;
00614 break;
00615 }
00616 slot += prev_slot + 1;
00617 if (slot <= prev_slot) {
00618 cout << "Value slot number overflowed (" << prev_slot << " -> " << slot << ")" << endl;
00619 ++errors;
00620 }
00621 prev_slot = slot;
00622 }
00623 continue;
00624 }
00625
00626 if (pos != end) {
00627 cout << "Extra junk in key" << endl;
00628 ++errors;
00629 continue;
00630 }
00631
00632 cursor->read_tag();
00633
00634 pos = cursor->current_tag.data();
00635 end = pos + cursor->current_tag.size();
00636
00637 if (pos == end) {
00638
00639 continue;
00640 }
00641
00642 Xapian::termcount doclen, termlist_size;
00643
00644
00645 if (!unpack_uint(&pos, end, &doclen)) {
00646 if (pos != 0) {
00647 cout << "doclen out of range" << endl;
00648 } else {
00649 cout << "Unexpected end of data when reading doclen" << endl;
00650 }
00651 ++errors;
00652 continue;
00653 }
00654
00655
00656 if (!unpack_uint(&pos, end, &termlist_size)) {
00657 if (pos != 0) {
00658 cout << "termlist_size out of range" << endl;
00659 } else {
00660 cout << "Unexpected end of data when reading termlist_size" << endl;
00661 }
00662 ++errors;
00663 continue;
00664 }
00665
00666 Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
00667 string current_tname;
00668
00669 bool bad = false;
00670 while (pos != end) {
00671 Xapian::doccount current_wdf = 0;
00672 bool got_wdf = false;
00673
00674 if (!current_tname.empty()) {
00675 string::size_type len = static_cast<unsigned char>(*pos++);
00676 if (len > current_tname.length()) {
00677
00678 current_wdf = len / (current_tname.length() + 1) - 1;
00679 len %= (current_tname.length() + 1);
00680 got_wdf = true;
00681 }
00682 current_tname.resize(len);
00683 }
00684
00685
00686 string::size_type len = static_cast<unsigned char>(*pos++);
00687 current_tname.append(pos, len);
00688 pos += len;
00689
00690 if (!got_wdf) {
00691
00692 if (!unpack_uint(&pos, end, ¤t_wdf)) {
00693 if (pos == 0) {
00694 cout << "Unexpected end of data when reading termlist current_wdf" << endl;
00695 } else {
00696 cout << "Size of wdf out of range, in termlist" << endl;
00697 }
00698 ++errors;
00699 bad = true;
00700 break;
00701 }
00702 }
00703
00704 ++actual_termlist_size;
00705 actual_doclen += current_wdf;
00706 }
00707 if (bad) {
00708 continue;
00709 }
00710
00711 if (termlist_size != actual_termlist_size) {
00712 cout << "termlist_size != # of entries in termlist" << endl;
00713 ++errors;
00714 }
00715 if (doclen != actual_doclen) {
00716 cout << "doclen != sum(wdf)" << endl;
00717 ++errors;
00718 }
00719
00720
00721 if (doclens.size() <= did) doclens.resize(did + 1);
00722 doclens[did] = actual_doclen;
00723 }
00724 } else if (strcmp(tablename, "position") == 0) {
00725
00726 for ( ; !cursor->after_end(); cursor->next()) {
00727 string & key = cursor->current_key;
00728
00729
00730 const char * pos = key.data();
00731 const char * end = pos + key.size();
00732
00733 Xapian::docid did;
00734 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00735 cout << "Error unpacking docid from key" << endl;
00736 ++errors;
00737 continue;
00738 }
00739 if (pos == end) {
00740 cout << "No termname in key" << endl;
00741 ++errors;
00742 continue;
00743 }
00744
00745 cursor->read_tag();
00746
00747 const string & data = cursor->current_tag;
00748 pos = data.data();
00749 end = pos + data.size();
00750
00751 Xapian::termpos pos_last;
00752 if (!unpack_uint(&pos, end, &pos_last)) {
00753 cout << tablename << " table: Position list data corrupt" << endl;
00754 ++errors;
00755 continue;
00756 }
00757 if (pos == end) {
00758
00759 } else {
00760
00761 BitReader rd(data, pos - data.data());
00762 Xapian::termpos pos_first = rd.decode(pos_last);
00763 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00764 vector<Xapian::termpos> positions;
00765 positions.resize(pos_size);
00766 positions[0] = pos_first;
00767 positions.back() = pos_last;
00768 rd.decode_interpolative(positions, 0, pos_size - 1);
00769 vector<Xapian::termpos>::const_iterator current_pos = positions.begin();
00770 Xapian::termpos lastpos = *current_pos++;
00771 while (current_pos != positions.end()) {
00772 Xapian::termpos termpos = *current_pos++;
00773 if (termpos <= lastpos) {
00774 cout << tablename << " table: Positions not strictly monotonically increasing" << endl;
00775 ++errors;
00776 break;
00777 }
00778 lastpos = termpos;
00779 }
00780 }
00781 }
00782 } else {
00783 cout << tablename << " table: Don't know how to check structure\n" << endl;
00784 return errors;
00785 }
00786
00787 if (!errors)
00788 cout << tablename << " table structure checked OK\n" << endl;
00789 else
00790 cout << tablename << " table errors found: " << errors << "\n" << endl;
00791
00792 return errors;
00793 }