00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include "xapian-check-flint.h"
00026
00027 #include "bitstream.h"
00028
00029 #include "internaltypes.h"
00030
00031 #include "flint_check.h"
00032 #include "flint_cursor.h"
00033 #include "flint_table.h"
00034 #include "flint_types.h"
00035 #include "flint_utils.h"
00036 #include "valuestats.h"
00037
00038 #include <xapian.h>
00039
00040 #include "autoptr.h"
00041 #include <iostream>
00042
00043 using namespace std;
00044
/** Test whether a postlist table key has the user metadata prefix.
 *
 *  @param key  Raw key bytes.
 *
 *  @return true if @a key is at least two bytes long and starts with the
 *          bytes 0x00 0xc0; false otherwise.
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    // Too short to carry the two byte prefix.
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
00050
00051 size_t
00052 check_flint_table(const char * tablename, string filename, int opts,
00053 vector<Xapian::termcount> & doclens)
00054 {
00055 filename += '.';
00056
00057
00058 BtreeCheck::check(tablename, filename, opts);
00059
00060
00061 FlintTable table(tablename, filename, true);
00062 table.open();
00063 AutoPtr<FlintCursor> cursor(table.cursor_get());
00064
00065 size_t errors = 0;
00066
00067 cursor->find_entry(string());
00068 cursor->next();
00069
00070 if (strcmp(tablename, "postlist") == 0) {
00071
00072 string current_term;
00073 Xapian::docid lastdid = 0;
00074 Xapian::termcount termfreq = 0, collfreq = 0;
00075 Xapian::termcount tf = 0, cf = 0;
00076 bool have_metainfo_key = false;
00077
00078
00079
00080 if (!cursor->after_end()) {
00081 if (cursor->current_key == string("", 1)) {
00082 have_metainfo_key = true;
00083 cursor->read_tag();
00084
00085 Xapian::docid did;
00086 totlen_t totlen;
00087 const char * data = cursor->current_tag.data();
00088 const char * end = data + cursor->current_tag.size();
00089 if (!F_unpack_uint(&data, end, &did)) {
00090 cout << "Tag containing meta information is corrupt." << endl;
00091 ++errors;
00092 } else if (!F_unpack_uint_last(&data, end, &totlen)) {
00093 cout << "Tag containing meta information is corrupt." << endl;
00094 ++errors;
00095 } else if (data != end) {
00096 cout << "Tag containing meta information is corrupt." << endl;
00097 ++errors;
00098 }
00099 cursor->next();
00100 }
00101 }
00102
00103 for ( ; !cursor->after_end(); cursor->next()) {
00104 string & key = cursor->current_key;
00105
00106 if (is_user_metadata_key(key)) {
00107
00108
00109 cursor->read_tag();
00110 if (cursor->current_tag.empty()) {
00111 cout << "User metadata item is empty" << endl;
00112 ++errors;
00113 }
00114 continue;
00115 }
00116
00117 if (!have_metainfo_key) {
00118 cout << "METAINFO key missing from postlist table" << endl;
00119 ++errors;
00120 }
00121
00122 if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
00123
00124 const char * pos, * end;
00125 Xapian::docid did = 1;
00126 if (key.size() > 2) {
00127
00128 pos = key.data();
00129 end = pos + key.size();
00130 pos += 2;
00131 if (!F_unpack_uint_preserving_sort(&pos, end, &did)) {
00132 cout << "Error unpacking docid from doclen key" << endl;
00133 ++errors;
00134 continue;
00135 }
00136 }
00137
00138 cursor->read_tag();
00139 pos = cursor->current_tag.data();
00140 end = pos + cursor->current_tag.size();
00141 if (key.size() == 2) {
00142
00143 if (end - pos < 2 || pos[0] || pos[1]) {
00144 cout << "Initial doclen chunk has nonzero dummy fields" << endl;
00145 ++errors;
00146 continue;
00147 }
00148 pos += 2;
00149 if (!F_unpack_uint(&pos, end, &did)) {
00150 cout << "Failed to unpack firstdid for doclen" << endl;
00151 ++errors;
00152 continue;
00153 }
00154 ++did;
00155 if (did <= lastdid) {
00156 cout << "First did in this chunk is <= last in "
00157 "prev chunk" << endl;
00158 ++errors;
00159 }
00160 }
00161
00162 bool is_last_chunk;
00163 if (!F_unpack_bool(&pos, end, &is_last_chunk)) {
00164 cout << "Failed to unpack last chunk flag for doclen" << endl;
00165 ++errors;
00166 continue;
00167 }
00168
00169 if (!F_unpack_uint(&pos, end, &lastdid)) {
00170 cout << "Failed to unpack increase to last" << endl;
00171 ++errors;
00172 continue;
00173 }
00174 lastdid += did;
00175 bool bad = false;
00176 while (true) {
00177 Xapian::termcount doclen;
00178 if (!F_unpack_uint(&pos, end, &doclen)) {
00179 cout << "Failed to unpack doclen" << endl;
00180 ++errors;
00181 bad = true;
00182 break;
00183 }
00184
00185 if (!doclens.empty()) {
00186 if (did >= doclens.size()) {
00187 cout << "document id " << did << " is larger than any in the termlist table!" << endl;
00188 ++errors;
00189 } else if (doclens[did] != doclen) {
00190 cout << "document id " << did << ": length " << doclen
00191 << " doesn't match " << doclens[did]
00192 << " in the termlist table" << endl;
00193 ++errors;
00194 }
00195 }
00196
00197 if (pos == end) break;
00198
00199 Xapian::docid inc;
00200 if (!F_unpack_uint(&pos, end, &inc)) {
00201 cout << "Failed to unpack docid increase" << endl;
00202 ++errors;
00203 bad = true;
00204 break;
00205 }
00206 ++inc;
00207 did += inc;
00208 if (did > lastdid) {
00209 cout << "docid " << did << " > last docid " << lastdid
00210 << endl;
00211 ++errors;
00212 }
00213 }
00214 if (bad) {
00215 continue;
00216 }
00217 if (is_last_chunk) {
00218 if (did != lastdid) {
00219 cout << "lastdid " << lastdid << " != last did " << did
00220 << endl;
00221 ++errors;
00222 }
00223 }
00224
00225 continue;
00226 }
00227
00228 const char * pos, * end;
00229
00230
00231 pos = key.data();
00232 end = pos + key.size();
00233
00234 string term;
00235 Xapian::docid did = 0;
00236 if (!F_unpack_string_preserving_sort(&pos, end, term)) {
00237 cout << "Error unpacking termname from key" << endl;
00238 ++errors;
00239 continue;
00240 }
00241 if (current_term.empty()) {
00242 current_term = term;
00243 tf = cf = 0;
00244 if (pos != end) {
00245 cout << "Extra bytes after key for first chunk of "
00246 "posting list for term `" << term << "'" << endl;
00247 ++errors;
00248 continue;
00249 }
00250
00251 cursor->read_tag();
00252 pos = cursor->current_tag.data();
00253 end = pos + cursor->current_tag.size();
00254 if (!F_unpack_uint(&pos, end, &termfreq)) {
00255 cout << "Failed to unpack termfreq for term `" << term
00256 << "'" << endl;
00257 ++errors;
00258 continue;
00259 }
00260 if (!F_unpack_uint(&pos, end, &collfreq)) {
00261 cout << "Failed to unpack collfreq for term `" << term
00262 << "'" << endl;
00263 ++errors;
00264 continue;
00265 }
00266 if (!F_unpack_uint(&pos, end, &did)) {
00267 cout << "Failed to unpack firstdid for term `" << term
00268 << "'" << endl;
00269 ++errors;
00270 continue;
00271 }
00272 ++did;
00273 } else {
00274 if (term != current_term) {
00275 if (pos == end) {
00276 cout << "No last chunk for term `" << term << "'"
00277 << endl;
00278 } else {
00279 cout << "Mismatch in follow-on chunk in posting "
00280 "list for term `" << current_term << "' (got `"
00281 << term << "')" << endl;
00282 }
00283 ++errors;
00284 current_term = term;
00285 }
00286 if (pos != end) {
00287 if (!F_unpack_uint_preserving_sort(&pos, end, &did)) {
00288 cout << "Failed to unpack did from key" << endl;
00289 ++errors;
00290 continue;
00291 }
00292 if (did <= lastdid) {
00293 cout << "First did in this chunk is <= last in "
00294 "prev chunk" << endl;
00295 ++errors;
00296 }
00297 }
00298 cursor->read_tag();
00299 pos = cursor->current_tag.data();
00300 end = pos + cursor->current_tag.size();
00301 }
00302
00303 bool is_last_chunk;
00304 if (!F_unpack_bool(&pos, end, &is_last_chunk)) {
00305 cout << "Failed to unpack last chunk flag" << endl;
00306 ++errors;
00307 continue;
00308 }
00309
00310 if (!F_unpack_uint(&pos, end, &lastdid)) {
00311 cout << "Failed to unpack increase to last" << endl;
00312 ++errors;
00313 continue;
00314 }
00315 ++lastdid;
00316 lastdid += did;
00317 bool bad = false;
00318 while (true) {
00319 Xapian::termcount wdf;
00320 if (!F_unpack_uint(&pos, end, &wdf)) {
00321 cout << "Failed to unpack wdf" << endl;
00322 ++errors;
00323 bad = true;
00324 break;
00325 }
00326 ++tf;
00327 cf += wdf;
00328
00329 Xapian::termcount doclen;
00330 if (!F_unpack_uint(&pos, end, &doclen)) {
00331 cout << "Failed to unpack doc length" << endl;
00332 ++errors;
00333 bad = true;
00334 break;
00335 }
00336
00337 if (!doclens.empty()) {
00338 if (did >= doclens.size()) {
00339 cout << "document id " << did << " is larger than any in the termlist table!" << endl;
00340 } else if (doclens[did] != doclen) {
00341 cout << "doclen " << doclen << " doesn't match " << doclens[did] << " in the termlist table" << endl;
00342 ++errors;
00343 }
00344 }
00345 if (pos == end) break;
00346
00347 Xapian::docid inc;
00348 if (!F_unpack_uint(&pos, end, &inc)) {
00349 cout << "Failed to unpack docid increase" << endl;
00350 ++errors;
00351 bad = true;
00352 break;
00353 }
00354 ++inc;
00355 did += inc;
00356 if (did > lastdid) {
00357 cout << "docid " << did << " > last docid " << lastdid
00358 << endl;
00359 ++errors;
00360 }
00361 }
00362 if (bad) {
00363 continue;
00364 }
00365 if (is_last_chunk) {
00366 if (tf != termfreq) {
00367 cout << "termfreq " << termfreq << " != # of entries "
00368 << tf << endl;
00369 ++errors;
00370 }
00371 if (cf != collfreq) {
00372 cout << "collfreq " << collfreq << " != sum wdf " << cf
00373 << endl;
00374 ++errors;
00375 }
00376 if (did != lastdid) {
00377 cout << "lastdid " << lastdid << " != last did " << did
00378 << endl;
00379 ++errors;
00380 }
00381 current_term.resize(0);
00382 }
00383 }
00384 if (!current_term.empty()) {
00385 cout << "Last term `" << current_term << "' has no last chunk"
00386 << endl;
00387 ++errors;
00388 }
00389 } else if (strcmp(tablename, "record") == 0) {
00390
00391
00392 for ( ; !cursor->after_end(); cursor->next()) {
00393 string & key = cursor->current_key;
00394
00395
00396 const char * pos = key.data();
00397 const char * end = pos + key.size();
00398
00399 Xapian::docid did;
00400 if (!F_unpack_uint_preserving_sort(&pos, end, &did)) {
00401 cout << "Error unpacking docid from key" << endl;
00402 ++errors;
00403 } else if (pos != end) {
00404 cout << "Extra junk in key" << endl;
00405 ++errors;
00406 }
00407 }
00408 } else if (strcmp(tablename, "termlist") == 0) {
00409
00410 for ( ; !cursor->after_end(); cursor->next()) {
00411 string & key = cursor->current_key;
00412
00413
00414 const char * pos = key.data();
00415 const char * end = pos + key.size();
00416
00417 Xapian::docid did;
00418 if (!F_unpack_uint_preserving_sort(&pos, end, &did)) {
00419 cout << "Error unpacking docid from key" << endl;
00420 ++errors;
00421 continue;
00422 }
00423
00424 if (pos != end) {
00425 cout << "Extra junk in key" << endl;
00426 ++errors;
00427 continue;
00428 }
00429
00430 cursor->read_tag();
00431
00432 pos = cursor->current_tag.data();
00433 end = pos + cursor->current_tag.size();
00434
00435 if (pos == end) {
00436
00437 continue;
00438 }
00439
00440 Xapian::termcount doclen, termlist_size;
00441
00442
00443 if (!F_unpack_uint(&pos, end, &doclen)) {
00444 if (pos != 0) {
00445 cout << "doclen out of range" << endl;
00446 } else {
00447 cout << "Unexpected end of data when reading doclen" << endl;
00448 }
00449 ++errors;
00450 continue;
00451 }
00452
00453
00454 if (!F_unpack_uint(&pos, end, &termlist_size)) {
00455 if (pos != 0) {
00456 cout << "termlist_size out of range" << endl;
00457 } else {
00458 cout << "Unexpected end of data when reading termlist_size" << endl;
00459 }
00460 ++errors;
00461 continue;
00462 }
00463
00464
00465
00466 if (pos != end && *pos == '0') ++pos;
00467
00468 Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
00469 string current_tname;
00470
00471 bool bad = false;
00472 while (pos != end) {
00473
00474 Xapian::doccount current_wdf = 0;
00475 bool got_wdf = false;
00476
00477 if (!current_tname.empty()) {
00478 string::size_type len = static_cast<unsigned char>(*pos++);
00479 if (len > current_tname.length()) {
00480
00481 current_wdf = len / (current_tname.length() + 1) - 1;
00482 len %= (current_tname.length() + 1);
00483 got_wdf = true;
00484 }
00485 current_tname.resize(len);
00486 }
00487
00488
00489 string::size_type len = static_cast<unsigned char>(*pos++);
00490 current_tname.append(pos, len);
00491 pos += len;
00492
00493 if (!got_wdf) {
00494
00495 if (!F_unpack_uint(&pos, end, ¤t_wdf)) {
00496 if (pos == 0) {
00497 cout << "Unexpected end of data when reading termlist current_wdf" << endl;
00498 } else {
00499 cout << "Size of wdf out of range, in termlist" << endl;
00500 }
00501 ++errors;
00502 bad = true;
00503 break;
00504 }
00505 }
00506
00507 ++actual_termlist_size;
00508 actual_doclen += current_wdf;
00509 }
00510 if (bad) {
00511 continue;
00512 }
00513
00514 if (termlist_size != actual_termlist_size) {
00515 cout << "termlist_size != # of entries in termlist" << endl;
00516 ++errors;
00517 }
00518 if (doclen != actual_doclen) {
00519 cout << "doclen != sum(wdf)" << endl;
00520 ++errors;
00521 }
00522
00523
00524 if (doclens.size() <= did) doclens.resize(did + 1);
00525 doclens[did] = actual_doclen;
00526 }
00527 } else if (strcmp(tablename, "value") == 0) {
00528
00529 for ( ; !cursor->after_end(); cursor->next()) {
00530 string & key = cursor->current_key;
00531
00532
00533 const char * pos = key.data();
00534 const char * end = pos + key.size();
00535
00536 Xapian::docid did;
00537 if (!F_unpack_uint_preserving_sort(&pos, end, &did)) {
00538 cout << "Error unpacking docid from key" << endl;
00539 ++errors;
00540 } else if (pos != end) {
00541 cout << "Extra junk in key" << endl;
00542 ++errors;
00543 }
00544
00545 cursor->read_tag();
00546
00547 pos = cursor->current_tag.data();
00548 end = pos + cursor->current_tag.size();
00549
00550 bool first = true;
00551 Xapian::valueno last_value_no = 0;
00552 while (pos && pos != end) {
00553 Xapian::valueno this_value_no;
00554 string this_value;
00555
00556 if (!F_unpack_uint(&pos, end, &this_value_no)) {
00557 if (pos == 0)
00558 cout << "Incomplete item in value table" << endl;
00559 else
00560 cout << "Value number in value table is too large" << endl;
00561 ++errors;
00562 break;
00563 }
00564
00565 if (!F_unpack_string(&pos, end, this_value)) {
00566 if (pos == 0)
00567 cout << "Incomplete item in value table" << endl;
00568 else
00569 cout << "Item in value table is too large" << endl;
00570 ++errors;
00571 break;
00572 }
00573
00574 if (first) {
00575 first = false;
00576 } else if (this_value_no <= last_value_no) {
00577 cout << "Values not in sorted order - valueno " << last_value_no << " comes before valueno " << this_value_no << endl;
00578 ++errors;
00579 }
00580 last_value_no = this_value_no;
00581 }
00582 }
00583 } else if (strcmp(tablename, "position") == 0) {
00584
00585 for ( ; !cursor->after_end(); cursor->next()) {
00586 string & key = cursor->current_key;
00587
00588
00589 const char * pos = key.data();
00590 const char * end = pos + key.size();
00591
00592 Xapian::docid did;
00593 if (!F_unpack_uint_preserving_sort(&pos, end, &did)) {
00594 cout << "Error unpacking docid from key" << endl;
00595 ++errors;
00596 continue;
00597 }
00598 if (pos == end) {
00599 cout << "No termname in key" << endl;
00600 ++errors;
00601 continue;
00602 }
00603
00604 cursor->read_tag();
00605
00606 const string & data = cursor->current_tag;
00607 pos = data.data();
00608 end = pos + data.size();
00609
00610 Xapian::termpos pos_last;
00611 if (!F_unpack_uint(&pos, end, &pos_last)) {
00612 cout << tablename << " table: Position list data corrupt" << endl;
00613 ++errors;
00614 continue;
00615 }
00616 if (pos == end) {
00617
00618 } else {
00619
00620 BitReader rd(data, pos - data.data());
00621 Xapian::termpos pos_first = rd.decode(pos_last);
00622 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00623 vector<Xapian::termpos> positions;
00624 positions.resize(pos_size);
00625 positions[0] = pos_first;
00626 positions.back() = pos_last;
00627 rd.decode_interpolative(positions, 0, pos_size - 1);
00628 vector<Xapian::termpos>::const_iterator current_pos = positions.begin();
00629 Xapian::termpos lastpos = *current_pos++;
00630 while (current_pos != positions.end()) {
00631 Xapian::termpos termpos = *current_pos++;
00632 if (termpos <= lastpos) {
00633 cout << tablename << " table: Positions not strictly monotonically increasing" << endl;
00634 ++errors;
00635 break;
00636 }
00637 lastpos = termpos;
00638 }
00639 }
00640 }
00641 } else {
00642 cout << tablename << " table: Don't know how to check structure\n" << endl;
00643 return errors;
00644 }
00645
00646 if (!errors)
00647 cout << tablename << " table structure checked OK\n" << endl;
00648 else
00649 cout << tablename << " table errors found: " << errors << "\n" << endl;
00650
00651 return errors;
00652 }