50 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xc0';
63 vector<Xapian::termcount>& doclens, ostream* out)
67 *out << tablename <<
":\n";
69 if (strcmp(tablename,
"postlist") != 0) {
71 string filename(db_dir);
73 filename += tablename;
77 if (strcmp(tablename,
"termlist") == 0) {
78 *out <<
"Not present.\n";
80 *out <<
"Lazily created, and not yet used.\n";
90 unique_ptr<GlassTableCheck> table(
92 version_file,
opts, out));
95 unique_ptr<GlassCursor> cursor(table->cursor_get());
102 if (strcmp(tablename,
"postlist") == 0) {
104 map<Xapian::valueno, VStats> valuestats;
111 for ( ; !cursor->after_end(); cursor->next()) {
112 string & key = cursor->current_key;
118 if (cursor->current_tag.empty()) {
120 *out <<
"User metadata item is empty" << endl;
126 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xe0') {
128 const char *
pos, * end;
130 if (key.size() > 2) {
133 end =
pos + key.size();
137 *out <<
"Error unpacking docid from doclen key" << endl;
141 if (did <= lastdid) {
143 *out <<
"First did in this doclen chunk is <= last in "
144 "prev chunk" << endl;
150 pos = cursor->current_tag.data();
151 end =
pos + cursor->current_tag.size();
152 if (key.size() == 2) {
156 *out <<
"Initial doclen chunk has nonzero dummy fields" << endl;
163 *out <<
"Failed to unpack firstdid for doclen" << endl;
173 *out <<
"Failed to unpack last chunk flag for doclen" << endl;
180 *out <<
"Failed to unpack increase to last" << endl;
190 *out <<
"Failed to unpack doclen" << endl;
198 if (did > db_last_docid) {
200 *out <<
"document id " << did <<
" in doclen "
201 "stream is larger than get_last_docid() "
202 << db_last_docid << endl;
206 if (!doclens.empty()) {
210 if (did < doclens.size())
211 termlist_doclen = doclens[did];
213 if (doclen != termlist_doclen) {
215 *out <<
"document id " << did <<
": length "
216 << doclen <<
" doesn't match "
217 << termlist_doclen <<
" in the termlist "
223 if (
pos == end)
break;
228 *out <<
"Failed to unpack docid increase" << endl;
237 *out <<
"docid " << did <<
" > last docid "
246 if (did != lastdid) {
248 *out <<
"lastdid " << lastdid <<
" != last did "
257 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xd0') {
259 const char *
p = key.data();
260 const char * end =
p + key.length();
265 *out <<
"Bad valuestats key (no slot)" << endl;
271 p = cursor->current_tag.data();
272 end =
p + cursor->current_tag.size();
274 VStats & v = valuestats[slot];
278 *out <<
"Incomplete stats item in value table";
280 *out <<
"Frequency statistic in value table is too large";
290 *out <<
"Incomplete stats item in value table";
292 *out <<
"Lower bound statistic in value table is too large";
299 size_t len = end -
p;
309 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xd8') {
311 const char *
p = key.data();
312 const char * end =
p + key.length();
317 *out <<
"Bad value chunk key (no slot)" << endl;
324 *out <<
"Bad value chunk key (no docid)" << endl;
330 *out <<
"Bad value chunk key (trailing junk)" << endl;
335 VStats & v = valuestats[slot];
338 p = cursor->current_tag.data();
339 end =
p + cursor->current_tag.size();
345 *out <<
"Failed to unpack value from chunk" << endl;
360 *out <<
"Value slot " << slot <<
" has value "
361 "below lower bound: '" << value <<
"' < '"
366 *out <<
"Value slot " << slot <<
" has value "
367 "above upper bound: '" << value <<
"' > '"
376 *out <<
"Failed to unpack docid delta from chunk"
382 if (new_did <= did) {
384 *out <<
"docid overflowed in value chunk" << endl;
390 if (did > db_last_docid) {
392 *out <<
"document id " << did <<
" in value chunk "
393 "is larger than get_last_docid() "
394 << db_last_docid << endl;
401 const char *
pos, * end;
405 end =
pos + key.size();
411 *out <<
"Error unpacking termname from key" << endl;
415 if (!current_term.empty() &&
term != current_term) {
419 *out <<
"No last chunk for term '" << current_term
421 current_term.resize(0);
424 *out <<
"Mismatch in follow-on chunk in posting list "
425 "for term '" << current_term <<
"' (got '"
426 <<
term <<
"')" << endl;
435 if (
term == current_term) {
438 *out <<
"First posting list chunk for term '" <<
term
439 <<
"' follows previous chunk for the same term"
448 pos = cursor->current_tag.data();
449 end =
pos + cursor->current_tag.size();
452 *out <<
"Failed to unpack termfreq for term '" <<
term
459 *out <<
"Failed to unpack collfreq for term '" <<
term
466 *out <<
"Failed to unpack firstdid for term '" <<
term
474 if (current_term.empty()) {
476 *out <<
"First chunk for term '" <<
term
477 <<
"' is a continuation chunk" << endl;
484 *out <<
"Failed to unpack did from key" << endl;
488 if (did <= lastdid) {
490 *out <<
"First did in this chunk is <= last in "
491 "prev chunk" << endl;
495 pos = cursor->current_tag.data();
496 end =
pos + cursor->current_tag.size();
502 *out <<
"Failed to unpack last chunk flag" << endl;
509 *out <<
"Failed to unpack increase to last" << endl;
519 *out <<
"Failed to unpack wdf" << endl;
527 if (
pos == end)
break;
532 *out <<
"Failed to unpack docid increase" << endl;
541 *out <<
"docid " << did <<
" > last docid " << lastdid
550 if (tf != termfreq) {
552 *out <<
"termfreq " << termfreq <<
" != # of entries "
556 if (cf != collfreq) {
558 *out <<
"collfreq " << collfreq <<
" != sum wdf " << cf
562 if (did != lastdid) {
564 *out <<
"lastdid " << lastdid <<
" != last did " << did
568 current_term.resize(0);
571 if (!current_term.empty()) {
573 *out <<
"Last term '" << current_term <<
"' has no last chunk"
581 *out <<
"Document length list has " << num_doclens
582 <<
" entries, should be " <<
doccount << endl;
586 map<Xapian::valueno, VStats>::const_iterator i;
587 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
588 if (i->second.freq != i->second.freq_real) {
590 *out <<
"Value stats frequency for slot " << i->first
591 <<
" is " << i->second.freq <<
" but recounting "
592 "gives " << i->second.freq_real << endl;
596 }
else if (strcmp(tablename,
"docdata") == 0) {
601 if (table->get_entry_count() >
doccount) {
603 *out <<
"More document data (" << table->get_entry_count()
604 <<
") then documents (" <<
doccount <<
")" << endl;
609 for ( ; !cursor->after_end(); cursor->next()) {
610 string & key = cursor->current_key;
613 const char *
pos = key.data();
614 const char * end =
pos + key.size();
619 *out <<
"Error unpacking docid from key" << endl;
625 *out <<
"Extra junk in key" << endl;
628 if (did > db_last_docid) {
630 *out <<
"document id " << did <<
" in docdata table "
631 "is larger than get_last_docid() "
632 << db_last_docid << endl;
641 if (cursor->current_tag.empty()) {
644 *out <<
"Empty document data explicitly stored for "
645 "document id " << did << endl;
649 }
else if (strcmp(tablename,
"termlist") == 0) {
653 for ( ; !cursor->after_end(); cursor->next()) {
654 string & key = cursor->current_key;
657 const char *
pos = key.data();
658 const char * end =
pos + key.size();
663 *out <<
"Error unpacking docid from key" << endl;
668 if (did > db_last_docid) {
670 *out <<
"document id " << did <<
" in termlist table "
671 "is larger than get_last_docid() "
672 << db_last_docid << endl;
676 if (end -
pos == 1 && *
pos ==
'\0') {
678 ++num_slotsused_entries;
681 pos = cursor->current_tag.data();
682 end =
pos + cursor->current_tag.size();
686 *out <<
"document id " << did
687 <<
": Empty value slots used tag\n";
696 *out <<
"document id " << did
697 <<
": Value slot encoding corrupt\n";
707 *out <<
"document id " << did
708 <<
": Value slot encoding corrupt\n";
713 slot += prev_slot + 1;
714 if (slot <= prev_slot) {
716 *out <<
"document id " << did
717 <<
": Value slot number overflowed ("
718 << prev_slot <<
" -> " << slot <<
")\n";
729 *out <<
"document id " << did <<
": Extra junk in key\n";
738 pos = cursor->current_tag.data();
739 end =
pos + cursor->current_tag.size();
751 *out <<
"document id " << did;
753 *out <<
": doclen out of range\n";
755 *out <<
": Unexpected end of data when reading "
766 *out <<
"document id " << did
767 <<
": doclen " << doclen <<
" > upper bound "
774 *out <<
"document id " << did
775 <<
": doclen " << doclen <<
" < lower bound "
784 *out <<
"document id " << did;
786 *out <<
": termlist_size out of range\n";
788 *out <<
": Unexpected end of data when reading "
797 string current_tname;
802 bool got_wdf =
false;
804 if (!current_tname.empty()) {
805 string::size_type len =
static_cast<unsigned char>(*
pos++);
806 if (len > current_tname.length()) {
808 current_wdf = len / (current_tname.length() + 1) - 1;
809 len %= (current_tname.length() + 1);
812 current_tname.resize(len);
816 string::size_type len =
static_cast<unsigned char>(*
pos++);
817 current_tname.append(
pos, len);
824 *out <<
"document id " << did;
826 *out <<
": Unexpected end of data when reading "
827 "termlist current_wdf\n";
829 *out <<
": Size of wdf out of range in "
839 ++actual_termlist_size;
840 actual_doclen += current_wdf;
846 if (termlist_size != actual_termlist_size) {
848 *out <<
"document id " << did <<
": termlist_size "
849 << termlist_size <<
" != # of entries in termlist "
850 << actual_termlist_size <<
'\n';
854 if (doclen != actual_doclen) {
856 *out <<
"document id " << did <<
": length " << doclen
857 <<
" != sum(wdf) " << actual_doclen <<
'\n';
863 if (doclens.size() <= did) doclens.resize(did + 1);
864 doclens[did] = actual_doclen;
873 *out <<
"More termlists (" << num_termlists
874 <<
") then documents (" <<
doccount <<
")" << endl;
880 if (num_slotsused_entries >
doccount) {
882 *out <<
"More slots-used entries (" << num_slotsused_entries
883 <<
") then documents (" <<
doccount <<
")" << endl;
886 }
else if (strcmp(tablename,
"position") == 0) {
888 for ( ; !cursor->after_end(); cursor->next()) {
889 string & key = cursor->current_key;
892 const char *
pos = key.data();
893 const char * end =
pos + key.size();
898 *out <<
"Error unpacking term from key" << endl;
906 *out <<
"Error unpacking docid from key" << endl;
913 *out <<
"Extra junk in key with docid " << did << endl;
918 if (did > db_last_docid) {
920 *out <<
"document id " << did <<
" in position table "
921 "is larger than get_last_docid() "
922 << db_last_docid << endl;
924 }
else if (!doclens.empty()) {
928 if (did >= doclens.size() || doclens[did] == 0) {
930 *out <<
"Position list entry for document " << did
931 <<
" which doesn't exist or has no terms" << endl;
938 const string & data = cursor->current_tag;
940 end =
pos + data.size();
945 *out << tablename <<
" table: Position list data corrupt"
960 while (
p != pos_last) {
965 *out << tablename <<
" table: Positions not "
966 "strictly monotonically increasing" << endl;
974 *out << tablename <<
" table: Junk after position data"
982 *out << tablename <<
" table: Full structure check not "
983 "implemented, checking readability\n";
984 for ( ; !cursor->after_end(); cursor->next()) {
991 *out << tablename <<
" table structure checked OK\n";
993 *out << tablename <<
" table errors found: " << errors <<
"\n";
1000 #ifdef DISABLE_GPL_LIBXAPIAN
1001 # error GPL source we cannot relicense included in libxapian
Classes to encode/decode a bitstream.
static GlassTableCheck * check(const char *tablename, std::string_view path, int fd, off_t offset_, const GlassVersion &version_file, int opts, std::ostream *out)
The GlassVersion class manages the revision files.
Xapian::docid get_last_docid() const
Xapian::termcount get_doclength_lower_bound() const
Xapian::doccount get_doccount() const
Xapian::termcount get_doclength_upper_bound() const
Read a stream created by BitWriter.
bool check_all_gone() const
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
void decode_interpolative(int j, int k, Xapian::termpos pos_j, Xapian::termpos pos_k)
Perform interpolative decoding of the elements between j and k.
Xapian::termpos decode_interpolative_next()
Perform on-demand interpolative decoding.
Utility functions for testing files.
bool file_exists(const char *path)
Test if a file exists.
Interface to Btree cursors.
size_t check_glass_table(const char *tablename, string_view db_dir, int fd, off_t offset_, const GlassVersion &version_file, int opts, vector< Xapian::termcount > &doclens, ostream *out)
static bool is_user_metadata_key(const string &key)
Definitions, types, etc for use inside glass.
#define GLASS_TABLE_EXTENSION
Glass table extension.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
unsigned valueno
The number for a value slot in a document.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Xapian::doccount freq_real
Class to hold statistics for a given slot.
std::string lower_bound
A lower bound on the values stored in the given value slot.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
Public interfaces for the Xapian library.