51 return key.size() > 1 && key[0] ==
'\0' && key[1] ==
'\xc0';
64 vector<Xapian::termcount> & doclens, ostream * out)
68 *out << tablename <<
":\n";
70 if (strcmp(tablename,
"postlist") != 0) {
72 string filename(db_dir);
74 filename += tablename;
78 if (strcmp(tablename,
"termlist") == 0) {
79 *out <<
"Not present.\n";
81 *out <<
"Lazily created, and not yet used.\n";
91 AutoPtr<GlassTable> table(
93 version_file, opts, out));
96 AutoPtr<GlassCursor> cursor(table->cursor_get());
100 cursor->find_entry(
string());
103 if (strcmp(tablename,
"postlist") == 0) {
105 map<Xapian::valueno, VStats> valuestats;
112 for ( ; !cursor->after_end(); cursor->next()) {
113 string & key = cursor->current_key;
119 if (cursor->current_tag.empty()) {
121 *out <<
"User metadata item is empty" << endl;
127 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xe0') {
129 const char * pos, * end;
131 if (key.size() > 2) {
134 end = pos + key.size();
138 *out <<
"Error unpacking docid from doclen key" << endl;
142 if (did <= lastdid) {
144 *out <<
"First did in this doclen chunk is <= last in " 145 "prev chunk" << endl;
151 pos = cursor->current_tag.data();
152 end = pos + cursor->current_tag.size();
153 if (key.size() == 2) {
155 if (end - pos < 2 || pos[0] || pos[1]) {
157 *out <<
"Initial doclen chunk has nonzero dummy fields" << endl;
164 *out <<
"Failed to unpack firstdid for doclen" << endl;
174 *out <<
"Failed to unpack last chunk flag for doclen" << endl;
181 *out <<
"Failed to unpack increase to last" << endl;
191 *out <<
"Failed to unpack doclen" << endl;
199 if (did > db_last_docid) {
201 *out <<
"document id " << did <<
" in doclen " 202 "stream is larger than get_last_docid() " 203 << db_last_docid << endl;
207 if (!doclens.empty()) {
211 if (did < doclens.size())
212 termlist_doclen = doclens[did];
214 if (doclen != termlist_doclen) {
216 *out <<
"document id " << did <<
": length " 217 << doclen <<
" doesn't match " 218 << termlist_doclen <<
" in the termlist " 224 if (pos == end)
break;
229 *out <<
"Failed to unpack docid increase" << endl;
238 *out <<
"docid " << did <<
" > last docid " 247 if (did != lastdid) {
249 *out <<
"lastdid " << lastdid <<
" != last did " 258 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xd0') {
260 const char * p = key.data();
261 const char * end = p + key.length();
266 *out <<
"Bad valuestats key (no slot)" << endl;
272 p = cursor->current_tag.data();
273 end = p + cursor->current_tag.size();
275 VStats & v = valuestats[slot];
279 *out <<
"Incomplete stats item in value table";
281 *out <<
"Frequency statistic in value table is too large";
291 *out <<
"Incomplete stats item in value table";
293 *out <<
"Lower bound statistic in value table is too large";
300 size_t len = end - p;
310 if (key.size() >= 2 && key[0] ==
'\0' && key[1] ==
'\xd8') {
312 const char * p = key.data();
313 const char * end = p + key.length();
318 *out <<
"Bad value chunk key (no slot)" << endl;
325 *out <<
"Bad value chunk key (no docid)" << endl;
331 *out <<
"Bad value chunk key (trailing junk)" << endl;
336 VStats & v = valuestats[slot];
339 p = cursor->current_tag.data();
340 end = p + cursor->current_tag.size();
346 *out <<
"Failed to unpack value from chunk" << endl;
361 *out <<
"Value slot " << slot <<
" has value " 362 "below lower bound: '" << value <<
"' < '" 367 *out <<
"Value slot " << slot <<
" has value " 368 "above upper bound: '" << value <<
"' > '" 377 *out <<
"Failed to unpack docid delta from chunk" 383 if (new_did <= did) {
385 *out <<
"docid overflowed in value chunk" << endl;
391 if (did > db_last_docid) {
393 *out <<
"document id " << did <<
" in value chunk " 394 "is larger than get_last_docid() " 395 << db_last_docid << endl;
402 const char * pos, * end;
406 end = pos + key.size();
412 *out <<
"Error unpacking termname from key" << endl;
416 if (!current_term.empty() && term != current_term) {
420 *out <<
"No last chunk for term '" << current_term
422 current_term.resize(0);
425 *out <<
"Mismatch in follow-on chunk in posting list " 426 "for term '" << current_term <<
"' (got '" 427 << term <<
"')" << endl;
436 if (term == current_term) {
439 *out <<
"First posting list chunk for term '" << term
440 <<
"' follows previous chunk for the same term" 449 pos = cursor->current_tag.data();
450 end = pos + cursor->current_tag.size();
453 *out <<
"Failed to unpack termfreq for term '" << term
460 *out <<
"Failed to unpack collfreq for term '" << term
467 *out <<
"Failed to unpack firstdid for term '" << term
475 if (current_term.empty()) {
477 *out <<
"First chunk for term '" << term
478 <<
"' is a continuation chunk" << endl;
485 *out <<
"Failed to unpack did from key" << endl;
489 if (did <= lastdid) {
491 *out <<
"First did in this chunk is <= last in " 492 "prev chunk" << endl;
496 pos = cursor->current_tag.data();
497 end = pos + cursor->current_tag.size();
503 *out <<
"Failed to unpack last chunk flag" << endl;
510 *out <<
"Failed to unpack increase to last" << endl;
520 *out <<
"Failed to unpack wdf" << endl;
528 if (pos == end)
break;
533 *out <<
"Failed to unpack docid increase" << endl;
542 *out <<
"docid " << did <<
" > last docid " << lastdid
551 if (tf != termfreq) {
553 *out <<
"termfreq " << termfreq <<
" != # of entries " 557 if (cf != collfreq) {
559 *out <<
"collfreq " << collfreq <<
" != sum wdf " << cf
563 if (did != lastdid) {
565 *out <<
"lastdid " << lastdid <<
" != last did " << did
569 current_term.resize(0);
572 if (!current_term.empty()) {
574 *out <<
"Last term '" << current_term <<
"' has no last chunk" 580 if (num_doclens != doccount) {
582 *out <<
"Document length list has " << num_doclens
583 <<
" entries, should be " << doccount << endl;
587 map<Xapian::valueno, VStats>::const_iterator i;
588 for (i = valuestats.begin(); i != valuestats.end(); ++i) {
589 if (i->second.freq != i->second.freq_real) {
591 *out <<
"Value stats frequency for slot " << i->first
592 <<
" is " << i->second.freq <<
" but recounting " 593 "gives " << i->second.freq_real << endl;
597 }
else if (strcmp(tablename,
"docdata") == 0) {
602 if (table->get_entry_count() >
doccount) {
604 *out <<
"More document data (" << table->get_entry_count()
605 <<
") then documents (" << doccount <<
")" << endl;
610 for ( ; !cursor->after_end(); cursor->next()) {
611 string & key = cursor->current_key;
614 const char * pos = key.data();
615 const char * end = pos + key.size();
620 *out <<
"Error unpacking docid from key" << endl;
626 *out <<
"Extra junk in key" << endl;
629 if (did > db_last_docid) {
631 *out <<
"document id " << did <<
" in docdata table " 632 "is larger than get_last_docid() " 633 << db_last_docid << endl;
642 if (cursor->current_tag.empty()) {
645 *out <<
"Empty document data explicitly stored for " 646 "document id " << did << endl;
650 }
else if (strcmp(tablename,
"termlist") == 0) {
654 for ( ; !cursor->after_end(); cursor->next()) {
655 string & key = cursor->current_key;
658 const char * pos = key.data();
659 const char * end = pos + key.size();
664 *out <<
"Error unpacking docid from key" << endl;
669 if (did > db_last_docid) {
671 *out <<
"document id " << did <<
" in termlist table " 672 "is larger than get_last_docid() " 673 << db_last_docid << endl;
677 if (end - pos == 1 && *pos ==
'\0') {
679 ++num_slotsused_entries;
682 pos = cursor->current_tag.data();
683 end = pos + cursor->current_tag.size();
687 *out <<
"document id " << did
688 <<
": Empty value slots used tag\n";
697 *out <<
"document id " << did
698 <<
": Value slot encoding corrupt\n";
708 *out <<
"document id " << did
709 <<
": Value slot encoding corrupt\n";
714 slot += prev_slot + 1;
715 if (slot <= prev_slot) {
717 *out <<
"document id " << did
718 <<
": Value slot number overflowed (" 719 << prev_slot <<
" -> " << slot <<
")\n";
730 *out <<
"document id " << did <<
": Extra junk in key\n";
739 pos = cursor->current_tag.data();
740 end = pos + cursor->current_tag.size();
752 *out <<
"document id " << did;
754 *out <<
": doclen out of range\n";
756 *out <<
": Unexpected end of data when reading " 767 *out <<
"document id " << did
768 <<
": doclen " << doclen <<
" > upper bound " 775 *out <<
"document id " << did
776 <<
": doclen " << doclen <<
" < lower bound " 785 *out <<
"document id " << did;
787 *out <<
": termlist_size out of range\n";
789 *out <<
": Unexpected end of data when reading " 798 string current_tname;
803 bool got_wdf =
false;
805 if (!current_tname.empty()) {
806 string::size_type len =
static_cast<unsigned char>(*pos++);
807 if (len > current_tname.length()) {
809 current_wdf = len / (current_tname.length() + 1) - 1;
810 len %= (current_tname.length() + 1);
813 current_tname.resize(len);
817 string::size_type len =
static_cast<unsigned char>(*pos++);
818 current_tname.append(pos, len);
825 *out <<
"document id " << did;
827 *out <<
": Unexpected end of data when reading " 828 "termlist current_wdf\n";
830 *out <<
": Size of wdf out of range in " 840 ++actual_termlist_size;
841 actual_doclen += current_wdf;
847 if (termlist_size != actual_termlist_size) {
849 *out <<
"document id " << did <<
": termlist_size " 850 << termlist_size <<
" != # of entries in termlist " 851 << actual_termlist_size <<
'\n';
855 if (doclen != actual_doclen) {
857 *out <<
"document id " << did <<
": length " << doclen
858 <<
" != sum(wdf) " << actual_doclen <<
'\n';
864 if (doclens.size() <= did) doclens.resize(did + 1);
865 doclens[did] = actual_doclen;
872 if (num_termlists > doccount) {
874 *out <<
"More termlists (" << num_termlists
875 <<
") then documents (" << doccount <<
")" << endl;
881 if (num_slotsused_entries > doccount) {
883 *out <<
"More slots-used entries (" << num_slotsused_entries
884 <<
") then documents (" << doccount <<
")" << endl;
887 }
else if (strcmp(tablename,
"position") == 0) {
889 for ( ; !cursor->after_end(); cursor->next()) {
890 string & key = cursor->current_key;
893 const char * pos = key.data();
894 const char * end = pos + key.size();
899 *out <<
"Error unpacking term from key" << endl;
907 *out <<
"Error unpacking docid from key" << endl;
914 *out <<
"Extra junk in key with docid " << did << endl;
919 if (did > db_last_docid) {
921 *out <<
"document id " << did <<
" in position table " 922 "is larger than get_last_docid() " 923 << db_last_docid << endl;
925 }
else if (!doclens.empty()) {
929 if (did >= doclens.size() || doclens[did] == 0) {
931 *out <<
"Position list entry for document " << did
932 <<
" which doesn't exist or has no terms" << endl;
939 const string & data = cursor->current_tag;
941 end = pos + data.size();
946 *out << tablename <<
" table: Position list data corrupt" 958 rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
961 while (p != pos_last) {
963 p = rd.decode_interpolative_next();
966 *out << tablename <<
" table: Positions not " 967 "strictly monotonically increasing" << endl;
973 if (ok && !rd.check_all_gone()) {
975 *out << tablename <<
" table: Junk after position data" 983 *out << tablename <<
" table: Full structure check not " 984 "implemented, checking readability\n";
985 for ( ; !cursor->after_end(); cursor->next()) {
992 *out << tablename <<
" table structure checked OK\n";
994 *out << tablename <<
" table errors found: " << errors <<
"\n";
Xapian::termcount get_doclength_upper_bound() const
Class to hold statistics for a given slot.
static bool is_user_metadata_key(const string &key)
The GlassVersion class manages the revision files.
Definitions, types, etc for use inside glass.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Utility functions for testing files.
size_t check_glass_table(const char *tablename, const string &db_dir, int fd, off_t offset_, const GlassVersion &version_file, int opts, vector< Xapian::termcount > &doclens, ostream *out)
#define GLASS_TABLE_EXTENSION
Glass table extension.
Xapian::docid get_last_docid() const
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
std::string lower_bound
A lower bound on the values stored in the given value slot.
Xapian::doccount freq_real
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Public interfaces for the Xapian library.
Xapian::termcount get_doclength_lower_bound() const
Read a stream created by BitWriter.
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Classes to encode/decode a bitstream.
static GlassTableCheck * check(const char *tablename, const std::string &path, int fd, off_t offset_, const GlassVersion &version_file, int opts, std::ostream *out)
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Interface to Btree cursors.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
bool file_exists(const char *path)
Test if a file exists.
Wrapper around standard unique_ptr template.
Xapian::doccount get_doccount() const