xapian-core  1.4.25
glass_dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002-2022 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "glass_dbcheck.h"
26 
27 #include "bitstream.h"
28 
29 #include "internaltypes.h"
30 
31 #include "glass_check.h"
32 #include "glass_cursor.h"
33 #include "glass_defs.h"
34 #include "glass_table.h"
35 #include "glass_version.h"
36 #include "pack.h"
37 #include "backends/valuestats.h"
38 
39 #include <xapian.h>
40 
41 #include "filetests.h"
42 #include "autoptr.h"
43 #include <ostream>
44 #include <vector>
45 
46 using namespace std;
47 
/** Test whether a postlist table key is a user metadata key.
 *
 *  Such keys are at least two bytes long and begin with the byte pair
 *  '\0' '\xc0'.
 *
 *  @param key  The raw key to classify.
 *
 *  @return true if @a key carries the user metadata prefix.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2)
        return false;
    const bool starts_with_nul = (key[0] == '\0');
    return starts_with_nul && key[1] == '\xc0';
}
53 
54 struct VStats : public ValueStats {
55  Xapian::doccount freq_real;
56 
57  VStats() : ValueStats(), freq_real(0) {}
58 };
59 
60 size_t
61 check_glass_table(const char * tablename, const string &db_dir, int fd,
62  off_t offset_,
63  const GlassVersion & version_file, int opts,
64  vector<Xapian::termcount> & doclens, ostream * out)
65 {
66  Xapian::docid db_last_docid = version_file.get_last_docid();
67  if (out)
68  *out << tablename << ":\n";
69  if (fd < 0) {
70  if (strcmp(tablename, "postlist") != 0) {
71  // Other filenames are created lazily, so may not exist.
72  string filename(db_dir);
73  filename += '/';
74  filename += tablename;
75  filename += "." GLASS_TABLE_EXTENSION;
76  if (!file_exists(filename)) {
77  if (out) {
78  if (strcmp(tablename, "termlist") == 0) {
79  *out << "Not present.\n";
80  } else {
81  *out << "Lazily created, and not yet used.\n";
82  }
83  *out << endl;
84  }
85  return 0;
86  }
87  }
88  }
89 
90  // Check the btree structure.
91  AutoPtr<GlassTable> table(
92  GlassTableCheck::check(tablename, db_dir, fd, offset_,
93  version_file, opts, out));
94 
95  // Now check the glass structures inside the btree.
96  AutoPtr<GlassCursor> cursor(table->cursor_get());
97 
98  size_t errors = 0;
99 
100  cursor->find_entry(string());
101  cursor->next(); // Skip the empty entry.
102 
103  if (strcmp(tablename, "postlist") == 0) {
104  // Now check the structure of each postlist in the table.
105  map<Xapian::valueno, VStats> valuestats;
106  string current_term;
107  Xapian::docid lastdid = 0;
108  Xapian::termcount termfreq = 0, collfreq = 0;
109  Xapian::termcount tf = 0, cf = 0;
110  Xapian::doccount num_doclens = 0;
111 
112  for ( ; !cursor->after_end(); cursor->next()) {
113  string & key = cursor->current_key;
114 
115  if (is_user_metadata_key(key)) {
116  // User metadata can be anything, so we can't do any particular
117  // checks on it other than to check that the tag isn't empty.
118  cursor->read_tag();
119  if (cursor->current_tag.empty()) {
120  if (out)
121  *out << "User metadata item is empty" << endl;
122  ++errors;
123  }
124  continue;
125  }
126 
127  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
128  // doclen chunk
129  const char * pos, * end;
130  Xapian::docid did = 1;
131  if (key.size() > 2) {
132  // Non-initial chunk.
133  pos = key.data();
134  end = pos + key.size();
135  pos += 2;
136  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
137  if (out)
138  *out << "Error unpacking docid from doclen key" << endl;
139  ++errors;
140  continue;
141  }
142  if (did <= lastdid) {
143  if (out)
144  *out << "First did in this doclen chunk is <= last in "
145  "prev chunk" << endl;
146  ++errors;
147  }
148  }
149 
150  cursor->read_tag();
151  pos = cursor->current_tag.data();
152  end = pos + cursor->current_tag.size();
153  if (key.size() == 2) {
154  // Initial chunk.
155  if (end - pos < 2 || pos[0] || pos[1]) {
156  if (out)
157  *out << "Initial doclen chunk has nonzero dummy fields" << endl;
158  ++errors;
159  continue;
160  }
161  pos += 2;
162  if (!unpack_uint(&pos, end, &did)) {
163  if (out)
164  *out << "Failed to unpack firstdid for doclen" << endl;
165  ++errors;
166  continue;
167  }
168  ++did;
169  }
170 
171  bool is_last_chunk;
172  if (!unpack_bool(&pos, end, &is_last_chunk)) {
173  if (out)
174  *out << "Failed to unpack last chunk flag for doclen" << endl;
175  ++errors;
176  continue;
177  }
178  // Read what the final document ID in this chunk is.
179  if (!unpack_uint(&pos, end, &lastdid)) {
180  if (out)
181  *out << "Failed to unpack increase to last" << endl;
182  ++errors;
183  continue;
184  }
185  lastdid += did;
186  bool bad = false;
187  while (true) {
188  Xapian::termcount doclen;
189  if (!unpack_uint(&pos, end, &doclen)) {
190  if (out)
191  *out << "Failed to unpack doclen" << endl;
192  ++errors;
193  bad = true;
194  break;
195  }
196 
197  ++num_doclens;
198 
199  if (did > db_last_docid) {
200  if (out)
201  *out << "document id " << did << " in doclen "
202  "stream is larger than get_last_docid() "
203  << db_last_docid << endl;
204  ++errors;
205  }
206 
207  if (!doclens.empty()) {
208  // In glass, a document without terms doesn't get a
209  // termlist entry.
210  Xapian::termcount termlist_doclen = 0;
211  if (did < doclens.size())
212  termlist_doclen = doclens[did];
213 
214  if (doclen != termlist_doclen) {
215  if (out)
216  *out << "document id " << did << ": length "
217  << doclen << " doesn't match "
218  << termlist_doclen << " in the termlist "
219  "table" << endl;
220  ++errors;
221  }
222  }
223 
224  if (pos == end) break;
225 
226  Xapian::docid inc;
227  if (!unpack_uint(&pos, end, &inc)) {
228  if (out)
229  *out << "Failed to unpack docid increase" << endl;
230  ++errors;
231  bad = true;
232  break;
233  }
234  ++inc;
235  did += inc;
236  if (did > lastdid) {
237  if (out)
238  *out << "docid " << did << " > last docid "
239  << lastdid << endl;
240  ++errors;
241  }
242  }
243  if (bad) {
244  continue;
245  }
246  if (is_last_chunk) {
247  if (did != lastdid) {
248  if (out)
249  *out << "lastdid " << lastdid << " != last did "
250  << did << endl;
251  ++errors;
252  }
253  }
254 
255  continue;
256  }
257 
258  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
259  // Value stats.
260  const char * p = key.data();
261  const char * end = p + key.length();
262  p += 2;
263  Xapian::valueno slot;
264  if (!unpack_uint_last(&p, end, &slot)) {
265  if (out)
266  *out << "Bad valuestats key (no slot)" << endl;
267  ++errors;
268  continue;
269  }
270 
271  cursor->read_tag();
272  p = cursor->current_tag.data();
273  end = p + cursor->current_tag.size();
274 
275  VStats & v = valuestats[slot];
276  if (!unpack_uint(&p, end, &v.freq)) {
277  if (out) {
278  if (*p == 0) {
279  *out << "Incomplete stats item in value table";
280  } else {
281  *out << "Frequency statistic in value table is too large";
282  }
283  *out << endl;
284  }
285  ++errors;
286  continue;
287  }
288  if (!unpack_string(&p, end, v.lower_bound)) {
289  if (out) {
290  if (*p == 0) {
291  *out << "Incomplete stats item in value table";
292  } else {
293  *out << "Lower bound statistic in value table is too large";
294  }
295  *out << endl;
296  }
297  ++errors;
298  continue;
299  }
300  size_t len = end - p;
301  if (len == 0) {
302  v.upper_bound = v.lower_bound;
303  } else {
304  v.upper_bound.assign(p, len);
305  }
306 
307  continue;
308  }
309 
310  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
311  // Value stream chunk.
312  const char * p = key.data();
313  const char * end = p + key.length();
314  p += 2;
315  Xapian::valueno slot;
316  if (!unpack_uint(&p, end, &slot)) {
317  if (out)
318  *out << "Bad value chunk key (no slot)" << endl;
319  ++errors;
320  continue;
321  }
322  Xapian::docid did;
323  if (!unpack_uint_preserving_sort(&p, end, &did)) {
324  if (out)
325  *out << "Bad value chunk key (no docid)" << endl;
326  ++errors;
327  continue;
328  }
329  if (p != end) {
330  if (out)
331  *out << "Bad value chunk key (trailing junk)" << endl;
332  ++errors;
333  continue;
334  }
335 
336  VStats & v = valuestats[slot];
337 
338  cursor->read_tag();
339  p = cursor->current_tag.data();
340  end = p + cursor->current_tag.size();
341 
342  while (true) {
343  string value;
344  if (!unpack_string(&p, end, value)) {
345  if (out)
346  *out << "Failed to unpack value from chunk" << endl;
347  ++errors;
348  break;
349  }
350 
351  ++v.freq_real;
352 
353  // FIXME: Cross-check that docid did has value slot (and
354  // vice versa - that there's a value here if the slot entry
355  // says so).
356 
357  // FIXME: Check if the bounds are tight? Or is that better
358  // as a separate tool which can also update the bounds?
359  if (value < v.lower_bound) {
360  if (out)
361  *out << "Value slot " << slot << " has value "
362  "below lower bound: '" << value << "' < '"
363  << v.lower_bound << "'" << endl;
364  ++errors;
365  } else if (value > v.upper_bound) {
366  if (out)
367  *out << "Value slot " << slot << " has value "
368  "above upper bound: '" << value << "' > '"
369  << v.upper_bound << "'" << endl;
370  ++errors;
371  }
372 
373  if (p == end) break;
374  Xapian::docid delta;
375  if (!unpack_uint(&p, end, &delta)) {
376  if (out)
377  *out << "Failed to unpack docid delta from chunk"
378  << endl;
379  ++errors;
380  break;
381  }
382  Xapian::docid new_did = did + delta + 1;
383  if (new_did <= did) {
384  if (out)
385  *out << "docid overflowed in value chunk" << endl;
386  ++errors;
387  break;
388  }
389  did = new_did;
390 
391  if (did > db_last_docid) {
392  if (out)
393  *out << "document id " << did << " in value chunk "
394  "is larger than get_last_docid() "
395  << db_last_docid << endl;
396  ++errors;
397  }
398  }
399  continue;
400  }
401 
402  const char * pos, * end;
403 
404  // Get term from key.
405  pos = key.data();
406  end = pos + key.size();
407 
408  string term;
409  Xapian::docid did;
410  if (!unpack_string_preserving_sort(&pos, end, term)) {
411  if (out)
412  *out << "Error unpacking termname from key" << endl;
413  ++errors;
414  continue;
415  }
416  if (!current_term.empty() && term != current_term) {
417  // The term changed unexpectedly.
418  if (pos == end) {
419  if (out)
420  *out << "No last chunk for term '" << current_term
421  << "'" << endl;
422  current_term.resize(0);
423  } else {
424  if (out)
425  *out << "Mismatch in follow-on chunk in posting list "
426  "for term '" << current_term << "' (got '"
427  << term << "')" << endl;
428  current_term = term;
429  tf = cf = 0;
430  lastdid = 0;
431  }
432  ++errors;
433  }
434  if (pos == end) {
435  // First chunk.
436  if (term == current_term) {
437  // This probably isn't possible.
438  if (out)
439  *out << "First posting list chunk for term '" << term
440  << "' follows previous chunk for the same term"
441  << endl;
442  ++errors;
443  }
444  current_term = term;
445  tf = cf = 0;
446 
447  // Unpack extra header from first chunk.
448  cursor->read_tag();
449  pos = cursor->current_tag.data();
450  end = pos + cursor->current_tag.size();
451  if (!unpack_uint(&pos, end, &termfreq)) {
452  if (out)
453  *out << "Failed to unpack termfreq for term '" << term
454  << "'" << endl;
455  ++errors;
456  continue;
457  }
458  if (!unpack_uint(&pos, end, &collfreq)) {
459  if (out)
460  *out << "Failed to unpack collfreq for term '" << term
461  << "'" << endl;
462  ++errors;
463  continue;
464  }
465  if (!unpack_uint(&pos, end, &did)) {
466  if (out)
467  *out << "Failed to unpack firstdid for term '" << term
468  << "'" << endl;
469  ++errors;
470  continue;
471  }
472  ++did;
473  } else {
474  // Continuation chunk.
475  if (current_term.empty()) {
476  if (out)
477  *out << "First chunk for term '" << term
478  << "' is a continuation chunk" << endl;
479  ++errors;
480  current_term = term;
481  }
482  AssertEq(current_term, term);
483  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
484  if (out)
485  *out << "Failed to unpack did from key" << endl;
486  ++errors;
487  continue;
488  }
489  if (did <= lastdid) {
490  if (out)
491  *out << "First did in this chunk is <= last in "
492  "prev chunk" << endl;
493  ++errors;
494  }
495  cursor->read_tag();
496  pos = cursor->current_tag.data();
497  end = pos + cursor->current_tag.size();
498  }
499 
500  bool is_last_chunk;
501  if (!unpack_bool(&pos, end, &is_last_chunk)) {
502  if (out)
503  *out << "Failed to unpack last chunk flag" << endl;
504  ++errors;
505  continue;
506  }
507  // Read what the final document ID in this chunk is.
508  if (!unpack_uint(&pos, end, &lastdid)) {
509  if (out)
510  *out << "Failed to unpack increase to last" << endl;
511  ++errors;
512  continue;
513  }
514  lastdid += did;
515  bool bad = false;
516  while (true) {
517  Xapian::termcount wdf;
518  if (!unpack_uint(&pos, end, &wdf)) {
519  if (out)
520  *out << "Failed to unpack wdf" << endl;
521  ++errors;
522  bad = true;
523  break;
524  }
525  ++tf;
526  cf += wdf;
527 
528  if (pos == end) break;
529 
530  Xapian::docid inc;
531  if (!unpack_uint(&pos, end, &inc)) {
532  if (out)
533  *out << "Failed to unpack docid increase" << endl;
534  ++errors;
535  bad = true;
536  break;
537  }
538  ++inc;
539  did += inc;
540  if (did > lastdid) {
541  if (out)
542  *out << "docid " << did << " > last docid " << lastdid
543  << endl;
544  ++errors;
545  }
546  }
547  if (bad) {
548  continue;
549  }
550  if (is_last_chunk) {
551  if (tf != termfreq) {
552  if (out)
553  *out << "termfreq " << termfreq << " != # of entries "
554  << tf << endl;
555  ++errors;
556  }
557  if (cf != collfreq) {
558  if (out)
559  *out << "collfreq " << collfreq << " != sum wdf " << cf
560  << endl;
561  ++errors;
562  }
563  if (did != lastdid) {
564  if (out)
565  *out << "lastdid " << lastdid << " != last did " << did
566  << endl;
567  ++errors;
568  }
569  current_term.resize(0);
570  }
571  }
572  if (!current_term.empty()) {
573  if (out)
574  *out << "Last term '" << current_term << "' has no last chunk"
575  << endl;
576  ++errors;
577  }
578 
579  Xapian::doccount doccount = version_file.get_doccount();
580  if (num_doclens != doccount) {
581  if (out)
582  *out << "Document length list has " << num_doclens
583  << " entries, should be " << doccount << endl;
584  ++errors;
585  }
586 
587  map<Xapian::valueno, VStats>::const_iterator i;
588  for (i = valuestats.begin(); i != valuestats.end(); ++i) {
589  if (i->second.freq != i->second.freq_real) {
590  if (out)
591  *out << "Value stats frequency for slot " << i->first
592  << " is " << i->second.freq << " but recounting "
593  "gives " << i->second.freq_real << endl;
594  ++errors;
595  }
596  }
597  } else if (strcmp(tablename, "docdata") == 0) {
598  // glass doesn't store a docdata entry if the document data is empty,
599  // so we can only check there aren't more docdata entries than
600  // documents.
601  Xapian::doccount doccount = version_file.get_doccount();
602  if (table->get_entry_count() > doccount) {
603  if (out)
604  *out << "More document data (" << table->get_entry_count()
605  << ") then documents (" << doccount << ")" << endl;
606  ++errors;
607  }
608 
609  // Now check the contents of the docdata table.
610  for ( ; !cursor->after_end(); cursor->next()) {
611  string & key = cursor->current_key;
612 
613  // Get docid from key.
614  const char * pos = key.data();
615  const char * end = pos + key.size();
616 
617  Xapian::docid did;
618  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
619  if (out)
620  *out << "Error unpacking docid from key" << endl;
621  ++errors;
622  continue;
623  }
624  if (pos != end) {
625  if (out)
626  *out << "Extra junk in key" << endl;
627  ++errors;
628  } else {
629  if (did > db_last_docid) {
630  if (out)
631  *out << "document id " << did << " in docdata table "
632  "is larger than get_last_docid() "
633  << db_last_docid << endl;
634  ++errors;
635  }
636  }
637 
638  // Fetch and decompress the document data to catch problems with
639  // the splitting into multiple items, corruption of the compressed
640  // data, etc.
641  cursor->read_tag();
642  if (cursor->current_tag.empty()) {
643  // We shouldn't store empty document data.
644  if (out)
645  *out << "Empty document data explicitly stored for "
646  "document id " << did << endl;
647  ++errors;
648  }
649  }
650  } else if (strcmp(tablename, "termlist") == 0) {
651  // Now check the contents of the termlist table.
652  Xapian::doccount num_termlists = 0;
653  Xapian::doccount num_slotsused_entries = 0;
654  for ( ; !cursor->after_end(); cursor->next()) {
655  string & key = cursor->current_key;
656 
657  // Get docid from key.
658  const char * pos = key.data();
659  const char * end = pos + key.size();
660 
661  Xapian::docid did;
662  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
663  if (out)
664  *out << "Error unpacking docid from key" << endl;
665  ++errors;
666  continue;
667  }
668 
669  if (did > db_last_docid) {
670  if (out)
671  *out << "document id " << did << " in termlist table "
672  "is larger than get_last_docid() "
673  << db_last_docid << endl;
674  ++errors;
675  }
676 
677  if (end - pos == 1 && *pos == '\0') {
678  // Value slots used entry.
679  ++num_slotsused_entries;
680  cursor->read_tag();
681 
682  pos = cursor->current_tag.data();
683  end = pos + cursor->current_tag.size();
684 
685  if (pos == end) {
686  if (out) {
687  *out << "document id " << did
688  << ": Empty value slots used tag\n";
689  }
690  ++errors;
691  continue;
692  }
693 
694  Xapian::valueno prev_slot;
695  if (!unpack_uint(&pos, end, &prev_slot)) {
696  if (out) {
697  *out << "document id " << did
698  << ": Value slot encoding corrupt\n";
699  }
700  ++errors;
701  continue;
702  }
703 
704  while (pos != end) {
705  Xapian::valueno slot;
706  if (!unpack_uint(&pos, end, &slot)) {
707  if (out) {
708  *out << "document id " << did
709  << ": Value slot encoding corrupt\n";
710  }
711  ++errors;
712  break;
713  }
714  slot += prev_slot + 1;
715  if (slot <= prev_slot) {
716  if (out) {
717  *out << "document id " << did
718  << ": Value slot number overflowed ("
719  << prev_slot << " -> " << slot << ")\n";
720  }
721  ++errors;
722  }
723  prev_slot = slot;
724  }
725  continue;
726  }
727 
728  if (pos != end) {
729  if (out) {
730  *out << "document id " << did << ": Extra junk in key\n";
731  }
732  ++errors;
733  continue;
734  }
735 
736  ++num_termlists;
737  cursor->read_tag();
738 
739  pos = cursor->current_tag.data();
740  end = pos + cursor->current_tag.size();
741 
742  if (pos == end) {
743  // Empty termlist.
744  continue;
745  }
746 
747  Xapian::termcount doclen, termlist_size;
748 
749  // Read doclen
750  if (!unpack_uint(&pos, end, &doclen)) {
751  if (out) {
752  *out << "document id " << did;
753  if (pos != 0) {
754  *out << ": doclen out of range\n";
755  } else {
756  *out << ": Unexpected end of data when reading "
757  "doclen\n";
758  }
759  }
760  ++errors;
761  continue;
762  }
763 
764  // Check doclen with doclen lower and upper bounds
765  if (doclen > version_file.get_doclength_upper_bound()) {
766  if (out) {
767  *out << "document id " << did
768  << ": doclen " << doclen << " > upper bound "
769  << version_file.get_doclength_upper_bound() << '\n';
770  }
771  ++errors;
772  } else if (doclen < version_file.get_doclength_lower_bound() &&
773  doclen != 0) {
774  if (out) {
775  *out << "document id " << did
776  << ": doclen " << doclen << " < lower bound "
777  << version_file.get_doclength_lower_bound() << '\n';
778  }
779  ++errors;
780  }
781 
782  // Read termlist_size
783  if (!unpack_uint(&pos, end, &termlist_size)) {
784  if (out) {
785  *out << "document id " << did;
786  if (pos != 0) {
787  *out << ": termlist_size out of range\n";
788  } else {
789  *out << ": Unexpected end of data when reading "
790  "termlist_size\n";
791  }
792  }
793  ++errors;
794  continue;
795  }
796 
797  Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
798  string current_tname;
799 
800  bool bad = false;
801  while (pos != end) {
802  Xapian::doccount current_wdf = 0;
803  bool got_wdf = false;
804  // If there was a previous term, how much to reuse.
805  if (!current_tname.empty()) {
806  string::size_type len = static_cast<unsigned char>(*pos++);
807  if (len > current_tname.length()) {
808  // The wdf was squeezed into the same byte.
809  current_wdf = len / (current_tname.length() + 1) - 1;
810  len %= (current_tname.length() + 1);
811  got_wdf = true;
812  }
813  current_tname.resize(len);
814  }
815  // What to append (note len must be positive, since just truncating
816  // always takes us backwards in the sort order)
817  string::size_type len = static_cast<unsigned char>(*pos++);
818  current_tname.append(pos, len);
819  pos += len;
820 
821  if (!got_wdf) {
822  // Read wdf
823  if (!unpack_uint(&pos, end, &current_wdf)) {
824  if (out) {
825  *out << "document id " << did;
826  if (pos == 0) {
827  *out << ": Unexpected end of data when reading "
828  "termlist current_wdf\n";
829  } else {
830  *out << ": Size of wdf out of range in "
831  "termlist\n";
832  }
833  }
834  ++errors;
835  bad = true;
836  break;
837  }
838  }
839 
840  ++actual_termlist_size;
841  actual_doclen += current_wdf;
842  }
843  if (bad) {
844  continue;
845  }
846 
847  if (termlist_size != actual_termlist_size) {
848  if (out) {
849  *out << "document id " << did << ": termlist_size "
850  << termlist_size << " != # of entries in termlist "
851  << actual_termlist_size << '\n';
852  }
853  ++errors;
854  }
855  if (doclen != actual_doclen) {
856  if (out) {
857  *out << "document id " << did << ": length " << doclen
858  << " != sum(wdf) " << actual_doclen << '\n';
859  }
860  ++errors;
861  }
862 
863  // + 1 so that did is a valid subscript.
864  if (doclens.size() <= did) doclens.resize(did + 1);
865  doclens[did] = actual_doclen;
866  }
867 
868  Xapian::doccount doccount = version_file.get_doccount();
869 
870  // glass doesn't store a termlist entry if there are no terms, so we
871  // can only check there aren't more termlists than documents.
872  if (num_termlists > doccount) {
873  if (out)
874  *out << "More termlists (" << num_termlists
875  << ") then documents (" << doccount << ")" << endl;
876  ++errors;
877  }
878 
879  // glass doesn't store a valueslots used entry if there are no terms,
880  // so we can only check there aren't more such entries than documents.
881  if (num_slotsused_entries > doccount) {
882  if (out)
883  *out << "More slots-used entries (" << num_slotsused_entries
884  << ") then documents (" << doccount << ")" << endl;
885  ++errors;
886  }
887  } else if (strcmp(tablename, "position") == 0) {
888  // Now check the contents of the position table.
889  for ( ; !cursor->after_end(); cursor->next()) {
890  string & key = cursor->current_key;
891 
892  // Get docid from key.
893  const char * pos = key.data();
894  const char * end = pos + key.size();
895 
896  string term;
897  if (!unpack_string_preserving_sort(&pos, end, term)) {
898  if (out)
899  *out << "Error unpacking term from key" << endl;
900  ++errors;
901  continue;
902  }
903 
904  Xapian::docid did;
905  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
906  if (out)
907  *out << "Error unpacking docid from key" << endl;
908  ++errors;
909  continue;
910  }
911 
912  if (pos != end) {
913  if (out)
914  *out << "Extra junk in key with docid " << did << endl;
915  ++errors;
916  continue;
917  }
918 
919  if (did > db_last_docid) {
920  if (out)
921  *out << "document id " << did << " in position table "
922  "is larger than get_last_docid() "
923  << db_last_docid << endl;
924  ++errors;
925  } else if (!doclens.empty()) {
926  // In glass, a document without terms doesn't get a
927  // termlist entry, so we can't tell the difference
928  // easily.
929  if (did >= doclens.size() || doclens[did] == 0) {
930  if (out)
931  *out << "Position list entry for document " << did
932  << " which doesn't exist or has no terms" << endl;
933  ++errors;
934  }
935  }
936 
937  cursor->read_tag();
938 
939  const string & data = cursor->current_tag;
940  pos = data.data();
941  end = pos + data.size();
942 
943  Xapian::termpos pos_last;
944  if (!unpack_uint(&pos, end, &pos_last)) {
945  if (out)
946  *out << tablename << " table: Position list data corrupt"
947  << endl;
948  ++errors;
949  continue;
950  }
951  if (pos == end) {
952  // Special case for single entry position list.
953  } else {
954  // Skip the header we just read.
955  BitReader rd(data, pos - data.data());
956  Xapian::termpos pos_first = rd.decode(pos_last);
957  Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
958  rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
959  Xapian::termpos p = rd.decode_interpolative_next();
960  bool ok = true;
961  while (p != pos_last) {
962  Xapian::termpos pos_prev = p;
963  p = rd.decode_interpolative_next();
964  if (p <= pos_prev) {
965  if (out)
966  *out << tablename << " table: Positions not "
967  "strictly monotonically increasing" << endl;
968  ++errors;
969  ok = false;
970  break;
971  }
972  }
973  if (ok && !rd.check_all_gone()) {
974  if (out)
975  *out << tablename << " table: Junk after position data"
976  << endl;
977  ++errors;
978  }
979  }
980  }
981  } else {
982  if (out)
983  *out << tablename << " table: Full structure check not "
984  "implemented, checking readability\n";
985  for ( ; !cursor->after_end(); cursor->next()) {
986  cursor->read_tag();
987  }
988  }
989 
990  if (out) {
991  if (!errors)
992  *out << tablename << " table structure checked OK\n";
993  else
994  *out << tablename << " table errors found: " << errors << "\n";
995  *out << endl;
996  }
997 
998  return errors;
999 }
Xapian::termcount get_doclength_upper_bound() const
GlassVersion class.
Class to hold statistics for a given slot.
Definition: valuestats.h:29
Statistics about values.
#define AssertEq(A, B)
Definition: omassert.h:124
static bool is_user_metadata_key(const string &key)
The GlassVersion class manages the revision files.
Definition: glass_version.h:94
static const char * opts
STL namespace.
Definitions, types, etc for use inside glass.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Definition: valuestats.h:41
Utility functions for testing files.
size_t check_glass_table(const char *tablename, const string &db_dir, int fd, off_t offset_, const GlassVersion &version_file, int opts, vector< Xapian::termcount > &doclens, ostream *out)
#define GLASS_TABLE_EXTENSION
Glass table extension.
Definition: glass_defs.h:27
Xapian::docid get_last_docid() const
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
Definition: valuestats.h:33
std::string lower_bound
A lower bound on the values stored in the given value slot.
Definition: valuestats.h:37
Xapian::doccount freq_real
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Btree checking.
Public interfaces for the Xapian library.
Xapian::termcount get_doclength_lower_bound() const
Read a stream created by BitWriter.
Definition: bitstream.h:64
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:562
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:318
Btree implementation.
Classes to encode/decode a bitstream.
static GlassTableCheck * check(const char *tablename, const std::string &path, int fd, off_t offset_, const GlassVersion &version_file, int opts, std::ostream *out)
Definition: glass_check.cc:263
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
Definition: bitstream.cc:176
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:69
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Interface to Btree cursors.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:413
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:504
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Check a glass table.
bool file_exists(const char *path)
Test if a file exists.
Definition: filetests.h:39
Types used internally.
Wrapper around standard unique_ptr template.
Xapian::doccount get_doccount() const