49 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xc0';
61 vector<Xapian::termcount> & doclens,
65 string filename = dir;
67 filename += tablename;
81 if (rev_ptr && *rev_ptr) {
82 if (!table.
open(*rev_ptr)) {
84 *out <<
"Failed to reopen table after it checked OK" << endl;
90 AutoPtr<ChertCursor> cursor(table.
cursor_get());
94 cursor->find_entry(
string());
97 if (strcmp(tablename,
"postlist") == 0) {
99 map<Xapian::valueno, VStats> valuestats;
105 bool have_metainfo_key =
false;
109 if (!cursor->after_end()) {
110 if (cursor->current_key ==
string(
"", 1)) {
111 have_metainfo_key =
true;
120 const char * data = cursor->current_tag.data();
121 const char * end = data + cursor->current_tag.size();
124 *out <<
"Tag containing meta information is corrupt (couldn't read last_docid)." << endl;
126 }
else if (!
unpack_uint(&data, end, &doclen_lbound)) {
128 *out <<
"Tag containing meta information is corrupt (couldn't read doclen_lbound)." << endl;
130 }
else if (!
unpack_uint(&data, end, &wdf_ubound)) {
132 *out <<
"Tag containing meta information is corrupt (couldn't read wdf_ubound)." << endl;
134 }
else if (!
unpack_uint(&data, end, &doclen_ubound)) {
136 *out <<
"Tag containing meta information is corrupt (couldn't read doclen_ubound)." << endl;
140 *out <<
"Tag containing meta information is corrupt (couldn't read total_doclen)." << endl;
142 }
else if (data != end) {
144 *out <<
"Tag containing meta information is corrupt (junk at end)." << endl;
151 bool seen_doclen_initial_chunk =
false;
152 for ( ; !cursor->after_end(); cursor->next()) {
153 string & key = cursor->current_key;
159 if (cursor->current_tag.empty()) {
161 *out <<
"User metadata item is empty" << endl;
167 if (!have_metainfo_key) {
168 have_metainfo_key =
true;
170 *out <<
"METAINFO key missing from postlist table" << endl;
174 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xe0') {
176 const char * pos, * end;
178 if (key.size() > 2) {
180 if (!seen_doclen_initial_chunk) {
182 *out <<
"Doclen initial chunk missing" << endl;
186 end = pos + key.size();
190 *out <<
"Error unpacking docid from doclen key" << endl;
194 if (did <= lastdid) {
196 *out <<
"First did in this chunk is <= last in " 197 "prev chunk" << endl;
203 pos = cursor->current_tag.data();
204 end = pos + cursor->current_tag.size();
205 if (key.size() == 2) {
207 seen_doclen_initial_chunk =
true;
208 if (end - pos < 2 || pos[0] || pos[1]) {
210 *out <<
"Initial doclen chunk has nonzero dummy fields" << endl;
217 *out <<
"Failed to unpack firstdid for doclen" << endl;
227 *out <<
"Failed to unpack last chunk flag for doclen" << endl;
234 *out <<
"Failed to unpack increase to last" << endl;
244 *out <<
"Failed to unpack doclen" << endl;
252 if (did > db_last_docid) {
254 *out <<
"document id " << did <<
" in doclen " 255 "stream is larger than get_last_docid() " 256 << db_last_docid << endl;
260 if (!doclens.empty()) {
264 if (did < doclens.size())
265 termlist_doclen = doclens[did];
267 if (doclen != termlist_doclen) {
269 *out <<
"document id " << did <<
": length " 270 << doclen <<
" doesn't match " 271 << termlist_doclen <<
" in the termlist " 277 if (pos == end)
break;
282 *out <<
"Failed to unpack docid increase" << endl;
291 *out <<
"docid " << did <<
" > last docid " 300 if (did != lastdid) {
302 *out <<
"lastdid " << lastdid <<
" != last did " 311 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xd0') {
313 const char * p = key.data();
314 const char * end = p + key.length();
319 *out <<
"Bad valuestats key (no slot)" << endl;
325 p = cursor->current_tag.data();
326 end = p + cursor->current_tag.size();
328 VStats & v = valuestats[slot];
332 *out <<
"Incomplete stats item in value table";
334 *out <<
"Frequency statistic in value table is too large";
344 *out <<
"Incomplete stats item in value table";
346 *out <<
"Lower bound statistic in value table is too large";
353 size_t len = end - p;
363 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xd8') {
365 const char * p = key.data();
366 const char * end = p + key.length();
371 *out <<
"Bad value chunk key (no slot)" << endl;
378 *out <<
"Bad value chunk key (no docid)" << endl;
384 *out <<
"Bad value chunk key (trailing junk)" << endl;
389 VStats & v = valuestats[slot];
392 p = cursor->current_tag.data();
393 end = p + cursor->current_tag.size();
399 *out <<
"Failed to unpack value from chunk" << endl;
414 *out <<
"Value slot " << slot <<
" has value " 415 "below lower bound: '" << value <<
"' < '" 420 *out <<
"Value slot " << slot <<
" has value " 421 "above upper bound: '" << value <<
"' > '" 430 *out <<
"Failed to unpack docid delta from chunk" 436 if (new_did <= did) {
438 *out <<
"docid overflowed in value chunk" << endl;
444 if (did > db_last_docid) {
446 *out <<
"document id " << did <<
" in value chunk " 447 "is larger than get_last_docid() " 448 << db_last_docid << endl;
455 const char * pos, * end;
459 end = pos + key.size();
465 *out <<
"Error unpacking termname from key" << endl;
469 if (!current_term.empty() && term != current_term) {
473 *out <<
"No last chunk for term '" << current_term
475 current_term.resize(0);
478 *out <<
"Mismatch in follow-on chunk in posting list " 479 "for term '" << current_term <<
"' (got '" 480 << term <<
"')" << endl;
489 if (term == current_term) {
492 *out <<
"First posting list chunk for term '" << term
493 <<
"' follows previous chunk for the same term" 502 pos = cursor->current_tag.data();
503 end = pos + cursor->current_tag.size();
506 *out <<
"Failed to unpack termfreq for term '" << term
513 *out <<
"Failed to unpack collfreq for term '" << term
520 *out <<
"Failed to unpack firstdid for term '" << term
528 if (current_term.empty()) {
530 *out <<
"First chunk for term '" << current_term
531 <<
"' is a continuation chunk" << endl;
538 *out <<
"Failed to unpack did from key" << endl;
542 if (did <= lastdid) {
544 *out <<
"First did in this chunk is <= last in " 545 "prev chunk" << endl;
549 pos = cursor->current_tag.data();
550 end = pos + cursor->current_tag.size();
556 *out <<
"Failed to unpack last chunk flag" << endl;
563 *out <<
"Failed to unpack increase to last" << endl;
573 *out <<
"Failed to unpack wdf" << endl;
581 if (pos == end)
break;
586 *out <<
"Failed to unpack docid increase" << endl;
595 *out <<
"docid " << did <<
" > last docid " << lastdid
604 if (tf != termfreq) {
606 *out <<
"termfreq " << termfreq <<
" != # of entries " 610 if (cf != collfreq) {
612 *out <<
"collfreq " << collfreq <<
" != sum wdf " << cf
616 if (did != lastdid) {
618 *out <<
"lastdid " << lastdid <<
" != last did " << did
622 current_term.resize(0);
625 if (!current_term.empty()) {
627 *out <<
"Last term '" << current_term <<
"' has no last chunk" 634 *out <<
"Document length list has " << num_doclens
635 <<
" entries, should be " << doccount << endl;
639 map<Xapian::valueno, VStats>::const_iterator i;
640 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
641 if (i->second.freq != i->second.freq_real) {
643 *out <<
"Value stats frequency for slot " << i->first
644 <<
" is " << i->second.freq <<
" but recounting " 645 "gives " << i->second.freq_real << endl;
649 }
else if (strcmp(tablename,
"record") == 0) {
654 <<
") != get_doccount() (" << doccount <<
")" << endl;
660 for ( ; !cursor->after_end(); cursor->next()) {
661 string & key = cursor->current_key;
664 const char * pos = key.data();
665 const char * end = pos + key.size();
670 *out <<
"Error unpacking docid from key" << endl;
672 }
else if (pos != end) {
674 *out <<
"Extra junk in key" << endl;
677 if (did > db_last_docid) {
679 *out <<
"document id " << did <<
" in docdata table " 680 "is larger than get_last_docid() " 681 << db_last_docid << endl;
686 }
else if (strcmp(tablename,
"termlist") == 0) {
690 for ( ; !cursor->after_end(); cursor->next()) {
691 string & key = cursor->current_key;
694 const char * pos = key.data();
695 const char * end = pos + key.size();
700 *out <<
"Error unpacking docid from key" << endl;
705 if (did > db_last_docid) {
707 *out <<
"document id " << did <<
" in termlist table " 708 "is larger than get_last_docid() " 709 << db_last_docid << endl;
713 if (end - pos == 1 && *pos ==
'\0') {
715 ++num_slotsused_entries;
718 pos = cursor->current_tag.data();
719 end = pos + cursor->current_tag.size();
723 *out <<
"Empty value slots used tag" << endl;
731 *out <<
"Value slot encoding corrupt" << endl;
740 *out <<
"Value slot encoding corrupt" << endl;
744 slot += prev_slot + 1;
745 if (slot <= prev_slot) {
747 *out <<
"Value slot number overflowed (" 748 << prev_slot <<
" -> " << slot <<
")" << endl;
758 *out <<
"Extra junk in key" << endl;
766 pos = cursor->current_tag.data();
767 end = pos + cursor->current_tag.size();
780 *out <<
"doclen out of range";
782 *out <<
"Unexpected end of data when reading doclen";
794 *out <<
"termlist_size out of range";
796 *out <<
"Unexpected end of data when reading " 806 string current_tname;
811 bool got_wdf =
false;
813 if (!current_tname.empty()) {
814 string::size_type len =
static_cast<unsigned char>(*pos++);
815 if (len > current_tname.length()) {
817 current_wdf = len / (current_tname.length() + 1) - 1;
818 len %= (current_tname.length() + 1);
821 current_tname.resize(len);
825 string::size_type len =
static_cast<unsigned char>(*pos++);
826 current_tname.append(pos, len);
834 *out <<
"Unexpected end of data when reading " 835 "termlist current_wdf";
837 *out <<
"Size of wdf out of range in termlist";
847 ++actual_termlist_size;
848 actual_doclen += current_wdf;
854 if (termlist_size != actual_termlist_size) {
856 *out <<
"termlist_size != # of entries in termlist" << endl;
859 if (doclen != actual_doclen) {
861 *out <<
"doclen != sum(wdf)" << endl;
866 if (doclens.size() <= did) doclens.resize(did + 1);
867 doclens[did] = actual_doclen;
872 *out <<
"Number of termlists (" << num_termlists
873 <<
") != get_doccount() (" << doccount <<
")" << endl;
879 if (num_slotsused_entries > doccount &&
882 *out <<
"More slots-used entries (" << num_slotsused_entries
883 <<
") then documents (" << doccount <<
")" << endl;
886 }
else if (strcmp(tablename,
"position") == 0) {
888 for ( ; !cursor->after_end(); cursor->next()) {
889 string & key = cursor->current_key;
892 const char * pos = key.data();
893 const char * end = pos + key.size();
898 *out <<
"Error unpacking docid from key" << endl;
903 if (did > db_last_docid) {
905 *out <<
"document id " << did <<
" in position table " 906 "is larger than get_last_docid() " 907 << db_last_docid << endl;
909 }
else if (!doclens.empty()) {
913 if (did >= doclens.size() || doclens[did] == 0) {
915 *out <<
"Position list entry for document " << did
916 <<
" which doesn't exist or has no terms" << endl;
923 *out <<
"No termname in key" << endl;
930 const string & data = cursor->current_tag;
932 end = pos + data.size();
937 *out << tablename <<
" table: Position list data corrupt" 949 rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
952 while (p != pos_last) {
954 p = rd.decode_interpolative_next();
957 *out << tablename <<
" table: Positions not " 958 "strictly monotonically increasing" << endl;
964 if (ok && !rd.check_all_gone()) {
966 *out << tablename <<
" table: Junk after position data" 974 *out << tablename <<
" table: Don't know how to check structure\n" 981 *out << tablename <<
" table structure checked OK\n";
983 *out << tablename <<
" table errors found: " << errors <<
"\n";
Class to hold statistics for a given slot.
chert_tablesize_t get_entry_count() const
Return a count of the number of entries in the table.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Class managing a Btree table in a Chert database.
void open()
Open the btree at the latest revision.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Types used by chert backend and the Btree manager.
unsigned int chert_revision_number_t
A type used to store a revision number for a table.
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
std::string lower_bound
A lower bound on the values stored in the given value slot.
Xapian::doccount freq_real
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
bool C_unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Interface to Btree cursors.
Public interfaces for the Xapian library.
static bool is_user_metadata_key(const string &key)
Read a stream created by BitWriter.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
ChertCursor * cursor_get() const
Get a cursor for reading from the table.
Classes to encode/decode a bitstream.
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
std::string get_description() const
Return a string describing this object.
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
size_t check_chert_table(const char *tablename, const string &dir, chert_revision_number_t *rev_ptr, int opts, vector< Xapian::termcount > &doclens, Xapian::doccount doccount, Xapian::docid db_last_docid, ostream *out)
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
static void check(const char *tablename, const std::string &path, chert_revision_number_t *rev_ptr, int opts, std::ostream *out)
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
DatabaseError indicates some sort of database related error.
Wrapper around standard unique_ptr template.