xapian-core  1.4.21
glass_dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002-2022 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "glass_dbcheck.h"
26 
27 #include "bitstream.h"
28 
29 #include "internaltypes.h"
30 
31 #include "glass_check.h"
32 #include "glass_cursor.h"
33 #include "glass_defs.h"
34 #include "glass_table.h"
35 #include "glass_version.h"
36 #include "pack.h"
37 #include "backends/valuestats.h"
38 
39 #include <xapian.h>
40 
41 #include "filetests.h"
42 #include "autoptr.h"
43 #include <ostream>
44 #include <vector>
45 
46 using namespace std;
47 
/** Report whether a postlist table key belongs to a user metadata item.
 *
 *  Such keys are at least two bytes long and start with the byte pair
 *  '\0', '\xc0'.
 *
 *  @param key	The raw key to test.
 *
 *  @return true if @a key starts with the user metadata prefix.
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
53 
54 struct VStats : public ValueStats {
55  Xapian::doccount freq_real;
56 
57  VStats() : ValueStats(), freq_real(0) {}
58 };
59 
60 size_t
61 check_glass_table(const char * tablename, const string &db_dir, int fd,
62  off_t offset_,
63  const GlassVersion & version_file, int opts,
64  vector<Xapian::termcount> & doclens, ostream * out)
65 {
66  Xapian::docid db_last_docid = version_file.get_last_docid();
67  if (out)
68  *out << tablename << ":\n";
69  if (fd < 0) {
70  if (strcmp(tablename, "postlist") != 0) {
71  // Other filenames are created lazily, so may not exist.
72  string filename(db_dir);
73  filename += '/';
74  filename += tablename;
75  filename += "." GLASS_TABLE_EXTENSION;
76  if (!file_exists(filename)) {
77  if (out) {
78  if (strcmp(tablename, "termlist") == 0) {
79  *out << "Not present.\n";
80  } else {
81  *out << "Lazily created, and not yet used.\n";
82  }
83  *out << endl;
84  }
85  return 0;
86  }
87  }
88  }
89 
90  // Check the btree structure.
91  AutoPtr<GlassTable> table(
92  GlassTableCheck::check(tablename, db_dir, fd, offset_,
93  version_file, opts, out));
94 
95  // Now check the glass structures inside the btree.
96  AutoPtr<GlassCursor> cursor(table->cursor_get());
97 
98  size_t errors = 0;
99 
100  cursor->find_entry(string());
101  cursor->next(); // Skip the empty entry.
102 
103  if (strcmp(tablename, "postlist") == 0) {
104  // Now check the structure of each postlist in the table.
105  map<Xapian::valueno, VStats> valuestats;
106  string current_term;
107  Xapian::docid lastdid = 0;
108  Xapian::termcount termfreq = 0, collfreq = 0;
109  Xapian::termcount tf = 0, cf = 0;
110  Xapian::doccount num_doclens = 0;
111 
112  for ( ; !cursor->after_end(); cursor->next()) {
113  string & key = cursor->current_key;
114 
115  if (is_user_metadata_key(key)) {
116  // User metadata can be anything, so we can't do any particular
117  // checks on it other than to check that the tag isn't empty.
118  cursor->read_tag();
119  if (cursor->current_tag.empty()) {
120  if (out)
121  *out << "User metadata item is empty" << endl;
122  ++errors;
123  }
124  continue;
125  }
126 
127  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
128  // doclen chunk
129  const char * pos, * end;
130  Xapian::docid did = 1;
131  if (key.size() > 2) {
132  // Non-initial chunk.
133  pos = key.data();
134  end = pos + key.size();
135  pos += 2;
136  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
137  if (out)
138  *out << "Error unpacking docid from doclen key" << endl;
139  ++errors;
140  continue;
141  }
142  if (did <= lastdid) {
143  if (out)
144  *out << "First did in this doclen chunk is <= last in "
145  "prev chunk" << endl;
146  ++errors;
147  }
148  }
149 
150  cursor->read_tag();
151  pos = cursor->current_tag.data();
152  end = pos + cursor->current_tag.size();
153  if (key.size() == 2) {
154  // Initial chunk.
155  if (end - pos < 2 || pos[0] || pos[1]) {
156  if (out)
157  *out << "Initial doclen chunk has nonzero dummy fields" << endl;
158  ++errors;
159  continue;
160  }
161  pos += 2;
162  if (!unpack_uint(&pos, end, &did)) {
163  if (out)
164  *out << "Failed to unpack firstdid for doclen" << endl;
165  ++errors;
166  continue;
167  }
168  ++did;
169  }
170 
171  bool is_last_chunk;
172  if (!unpack_bool(&pos, end, &is_last_chunk)) {
173  if (out)
174  *out << "Failed to unpack last chunk flag for doclen" << endl;
175  ++errors;
176  continue;
177  }
178  // Read what the final document ID in this chunk is.
179  if (!unpack_uint(&pos, end, &lastdid)) {
180  if (out)
181  *out << "Failed to unpack increase to last" << endl;
182  ++errors;
183  continue;
184  }
185  lastdid += did;
186  bool bad = false;
187  while (true) {
188  Xapian::termcount doclen;
189  if (!unpack_uint(&pos, end, &doclen)) {
190  if (out)
191  *out << "Failed to unpack doclen" << endl;
192  ++errors;
193  bad = true;
194  break;
195  }
196 
197  ++num_doclens;
198 
199  if (did > db_last_docid) {
200  if (out)
201  *out << "document id " << did << " in doclen "
202  "stream is larger than get_last_docid() "
203  << db_last_docid << endl;
204  ++errors;
205  }
206 
207  if (!doclens.empty()) {
208  // In glass, a document without terms doesn't get a
209  // termlist entry.
210  Xapian::termcount termlist_doclen = 0;
211  if (did < doclens.size())
212  termlist_doclen = doclens[did];
213 
214  if (doclen != termlist_doclen) {
215  if (out)
216  *out << "document id " << did << ": length "
217  << doclen << " doesn't match "
218  << termlist_doclen << " in the termlist "
219  "table" << endl;
220  ++errors;
221  }
222  }
223 
224  if (pos == end) break;
225 
226  Xapian::docid inc;
227  if (!unpack_uint(&pos, end, &inc)) {
228  if (out)
229  *out << "Failed to unpack docid increase" << endl;
230  ++errors;
231  bad = true;
232  break;
233  }
234  ++inc;
235  did += inc;
236  if (did > lastdid) {
237  if (out)
238  *out << "docid " << did << " > last docid "
239  << lastdid << endl;
240  ++errors;
241  }
242  }
243  if (bad) {
244  continue;
245  }
246  if (is_last_chunk) {
247  if (did != lastdid) {
248  if (out)
249  *out << "lastdid " << lastdid << " != last did "
250  << did << endl;
251  ++errors;
252  }
253  }
254 
255  continue;
256  }
257 
258  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
259  // Value stats.
260  const char * p = key.data();
261  const char * end = p + key.length();
262  p += 2;
263  Xapian::valueno slot;
264  if (!unpack_uint_last(&p, end, &slot)) {
265  if (out)
266  *out << "Bad valuestats key (no slot)" << endl;
267  ++errors;
268  continue;
269  }
270 
271  cursor->read_tag();
272  p = cursor->current_tag.data();
273  end = p + cursor->current_tag.size();
274 
275  VStats & v = valuestats[slot];
276  if (!unpack_uint(&p, end, &v.freq)) {
277  if (out) {
278  if (*p == 0) {
279  *out << "Incomplete stats item in value table";
280  } else {
281  *out << "Frequency statistic in value table is too large";
282  }
283  *out << endl;
284  }
285  ++errors;
286  continue;
287  }
288  if (!unpack_string(&p, end, v.lower_bound)) {
289  if (out) {
290  if (*p == 0) {
291  *out << "Incomplete stats item in value table";
292  } else {
293  *out << "Lower bound statistic in value table is too large";
294  }
295  *out << endl;
296  }
297  ++errors;
298  continue;
299  }
300  size_t len = end - p;
301  if (len == 0) {
302  v.upper_bound = v.lower_bound;
303  } else {
304  v.upper_bound.assign(p, len);
305  }
306 
307  continue;
308  }
309 
310  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
311  // Value stream chunk.
312  const char * p = key.data();
313  const char * end = p + key.length();
314  p += 2;
315  Xapian::valueno slot;
316  if (!unpack_uint(&p, end, &slot)) {
317  if (out)
318  *out << "Bad value chunk key (no slot)" << endl;
319  ++errors;
320  continue;
321  }
322  Xapian::docid did;
323  if (!unpack_uint_preserving_sort(&p, end, &did)) {
324  if (out)
325  *out << "Bad value chunk key (no docid)" << endl;
326  ++errors;
327  continue;
328  }
329  if (p != end) {
330  if (out)
331  *out << "Bad value chunk key (trailing junk)" << endl;
332  ++errors;
333  continue;
334  }
335 
336  VStats & v = valuestats[slot];
337 
338  cursor->read_tag();
339  p = cursor->current_tag.data();
340  end = p + cursor->current_tag.size();
341 
342  while (true) {
343  string value;
344  if (!unpack_string(&p, end, value)) {
345  if (out)
346  *out << "Failed to unpack value from chunk" << endl;
347  ++errors;
348  break;
349  }
350 
351  ++v.freq_real;
352 
353  // FIXME: Cross-check that docid did has value slot (and
354  // vice versa - that there's a value here if the slot entry
355  // says so).
356 
357  // FIXME: Check if the bounds are tight? Or is that better
358  // as a separate tool which can also update the bounds?
359  if (value < v.lower_bound) {
360  if (out)
361  *out << "Value slot " << slot << " has value "
362  "below lower bound: '" << value << "' < '"
363  << v.lower_bound << "'" << endl;
364  ++errors;
365  } else if (value > v.upper_bound) {
366  if (out)
367  *out << "Value slot " << slot << " has value "
368  "above upper bound: '" << value << "' > '"
369  << v.upper_bound << "'" << endl;
370  ++errors;
371  }
372 
373  if (p == end) break;
374  Xapian::docid delta;
375  if (!unpack_uint(&p, end, &delta)) {
376  if (out)
377  *out << "Failed to unpack docid delta from chunk"
378  << endl;
379  ++errors;
380  break;
381  }
382  Xapian::docid new_did = did + delta + 1;
383  if (new_did <= did) {
384  if (out)
385  *out << "docid overflowed in value chunk" << endl;
386  ++errors;
387  break;
388  }
389  did = new_did;
390 
391  if (did > db_last_docid) {
392  if (out)
393  *out << "document id " << did << " in value chunk "
394  "is larger than get_last_docid() "
395  << db_last_docid << endl;
396  ++errors;
397  }
398  }
399  continue;
400  }
401 
402  const char * pos, * end;
403 
404  // Get term from key.
405  pos = key.data();
406  end = pos + key.size();
407 
408  string term;
409  Xapian::docid did;
410  if (!unpack_string_preserving_sort(&pos, end, term)) {
411  if (out)
412  *out << "Error unpacking termname from key" << endl;
413  ++errors;
414  continue;
415  }
416  if (!current_term.empty() && term != current_term) {
417  // The term changed unexpectedly.
418  if (pos == end) {
419  if (out)
420  *out << "No last chunk for term '" << current_term
421  << "'" << endl;
422  current_term.resize(0);
423  } else {
424  if (out)
425  *out << "Mismatch in follow-on chunk in posting list "
426  "for term '" << current_term << "' (got '"
427  << term << "')" << endl;
428  current_term = term;
429  tf = cf = 0;
430  lastdid = 0;
431  }
432  ++errors;
433  }
434  if (pos == end) {
435  // First chunk.
436  if (term == current_term) {
437  // This probably isn't possible.
438  if (out)
439  *out << "First posting list chunk for term '" << term
440  << "' follows previous chunk for the same term"
441  << endl;
442  ++errors;
443  }
444  current_term = term;
445  tf = cf = 0;
446 
447  // Unpack extra header from first chunk.
448  cursor->read_tag();
449  pos = cursor->current_tag.data();
450  end = pos + cursor->current_tag.size();
451  if (!unpack_uint(&pos, end, &termfreq)) {
452  if (out)
453  *out << "Failed to unpack termfreq for term '" << term
454  << "'" << endl;
455  ++errors;
456  continue;
457  }
458  if (!unpack_uint(&pos, end, &collfreq)) {
459  if (out)
460  *out << "Failed to unpack collfreq for term '" << term
461  << "'" << endl;
462  ++errors;
463  continue;
464  }
465  if (!unpack_uint(&pos, end, &did)) {
466  if (out)
467  *out << "Failed to unpack firstdid for term '" << term
468  << "'" << endl;
469  ++errors;
470  continue;
471  }
472  ++did;
473  } else {
474  // Continuation chunk.
475  if (current_term.empty()) {
476  if (out)
477  *out << "First chunk for term '" << current_term
478  << "' is a continuation chunk" << endl;
479  ++errors;
480  current_term = term;
481  }
482  AssertEq(current_term, term);
483  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
484  if (out)
485  *out << "Failed to unpack did from key" << endl;
486  ++errors;
487  continue;
488  }
489  if (did <= lastdid) {
490  if (out)
491  *out << "First did in this chunk is <= last in "
492  "prev chunk" << endl;
493  ++errors;
494  }
495  cursor->read_tag();
496  pos = cursor->current_tag.data();
497  end = pos + cursor->current_tag.size();
498  }
499 
500  bool is_last_chunk;
501  if (!unpack_bool(&pos, end, &is_last_chunk)) {
502  if (out)
503  *out << "Failed to unpack last chunk flag" << endl;
504  ++errors;
505  continue;
506  }
507  // Read what the final document ID in this chunk is.
508  if (!unpack_uint(&pos, end, &lastdid)) {
509  if (out)
510  *out << "Failed to unpack increase to last" << endl;
511  ++errors;
512  continue;
513  }
514  lastdid += did;
515  bool bad = false;
516  while (true) {
517  Xapian::termcount wdf;
518  if (!unpack_uint(&pos, end, &wdf)) {
519  if (out)
520  *out << "Failed to unpack wdf" << endl;
521  ++errors;
522  bad = true;
523  break;
524  }
525  ++tf;
526  cf += wdf;
527 
528  if (pos == end) break;
529 
530  Xapian::docid inc;
531  if (!unpack_uint(&pos, end, &inc)) {
532  if (out)
533  *out << "Failed to unpack docid increase" << endl;
534  ++errors;
535  bad = true;
536  break;
537  }
538  ++inc;
539  did += inc;
540  if (did > lastdid) {
541  if (out)
542  *out << "docid " << did << " > last docid " << lastdid
543  << endl;
544  ++errors;
545  }
546  }
547  if (bad) {
548  continue;
549  }
550  if (is_last_chunk) {
551  if (tf != termfreq) {
552  if (out)
553  *out << "termfreq " << termfreq << " != # of entries "
554  << tf << endl;
555  ++errors;
556  }
557  if (cf != collfreq) {
558  if (out)
559  *out << "collfreq " << collfreq << " != sum wdf " << cf
560  << endl;
561  ++errors;
562  }
563  if (did != lastdid) {
564  if (out)
565  *out << "lastdid " << lastdid << " != last did " << did
566  << endl;
567  ++errors;
568  }
569  current_term.resize(0);
570  }
571  }
572  if (!current_term.empty()) {
573  if (out)
574  *out << "Last term '" << current_term << "' has no last chunk"
575  << endl;
576  ++errors;
577  }
578 
579  Xapian::doccount doccount = version_file.get_doccount();
580  if (num_doclens != doccount) {
581  if (out)
582  *out << "Document length list has " << num_doclens
583  << " entries, should be " << doccount << endl;
584  ++errors;
585  }
586 
587  map<Xapian::valueno, VStats>::const_iterator i;
588  for (i = valuestats.begin(); i != valuestats.end(); ++i) {
589  if (i->second.freq != i->second.freq_real) {
590  if (out)
591  *out << "Value stats frequency for slot " << i->first
592  << " is " << i->second.freq << " but recounting "
593  "gives " << i->second.freq_real << endl;
594  ++errors;
595  }
596  }
597  } else if (strcmp(tablename, "docdata") == 0) {
598  // glass doesn't store a docdata entry if the document data is empty,
599  // so we can only check there aren't more docdata entries than
600  // documents.
601  Xapian::doccount doccount = version_file.get_doccount();
602  if (table->get_entry_count() > doccount) {
603  if (out)
604  *out << "More document data (" << table->get_entry_count()
605  << ") then documents (" << doccount << ")" << endl;
606  ++errors;
607  }
608 
609  // Now check the contents of the docdata table.
610  for ( ; !cursor->after_end(); cursor->next()) {
611  string & key = cursor->current_key;
612 
613  // Get docid from key.
614  const char * pos = key.data();
615  const char * end = pos + key.size();
616 
617  Xapian::docid did;
618  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
619  if (out)
620  *out << "Error unpacking docid from key" << endl;
621  ++errors;
622  continue;
623  }
624  if (pos != end) {
625  if (out)
626  *out << "Extra junk in key" << endl;
627  ++errors;
628  } else {
629  if (did > db_last_docid) {
630  if (out)
631  *out << "document id " << did << " in docdata table "
632  "is larger than get_last_docid() "
633  << db_last_docid << endl;
634  ++errors;
635  }
636  }
637 
638  // Fetch and decompress the document data to catch problems with
639  // the splitting into multiple items, corruption of the compressed
640  // data, etc.
641  cursor->read_tag();
642  if (cursor->current_tag.empty()) {
643  // We shouldn't store empty document data.
644  if (out)
645  *out << "Empty document data explicitly stored for "
646  "document id " << did << endl;
647  ++errors;
648  }
649  }
650  } else if (strcmp(tablename, "termlist") == 0) {
651  // Now check the contents of the termlist table.
652  Xapian::doccount num_termlists = 0;
653  Xapian::doccount num_slotsused_entries = 0;
654  for ( ; !cursor->after_end(); cursor->next()) {
655  string & key = cursor->current_key;
656 
657  // Get docid from key.
658  const char * pos = key.data();
659  const char * end = pos + key.size();
660 
661  Xapian::docid did;
662  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
663  if (out)
664  *out << "Error unpacking docid from key" << endl;
665  ++errors;
666  continue;
667  }
668 
669  if (did > db_last_docid) {
670  if (out)
671  *out << "document id " << did << " in termlist table "
672  "is larger than get_last_docid() "
673  << db_last_docid << endl;
674  ++errors;
675  }
676 
677  if (end - pos == 1 && *pos == '\0') {
678  // Value slots used entry.
679  ++num_slotsused_entries;
680  cursor->read_tag();
681 
682  pos = cursor->current_tag.data();
683  end = pos + cursor->current_tag.size();
684 
685  if (pos == end) {
686  if (out)
687  *out << "Empty value slots used tag" << endl;
688  ++errors;
689  continue;
690  }
691 
692  Xapian::valueno prev_slot;
693  if (!unpack_uint(&pos, end, &prev_slot)) {
694  if (out)
695  *out << "Value slot encoding corrupt" << endl;
696  ++errors;
697  continue;
698  }
699 
700  while (pos != end) {
701  Xapian::valueno slot;
702  if (!unpack_uint(&pos, end, &slot)) {
703  if (out)
704  *out << "Value slot encoding corrupt" << endl;
705  ++errors;
706  break;
707  }
708  slot += prev_slot + 1;
709  if (slot <= prev_slot) {
710  if (out)
711  *out << "Value slot number overflowed ("
712  << prev_slot << " -> " << slot << ")" << endl;
713  ++errors;
714  }
715  prev_slot = slot;
716  }
717  continue;
718  }
719 
720  if (pos != end) {
721  if (out)
722  *out << "Extra junk in key" << endl;
723  ++errors;
724  continue;
725  }
726 
727  ++num_termlists;
728  cursor->read_tag();
729 
730  pos = cursor->current_tag.data();
731  end = pos + cursor->current_tag.size();
732 
733  if (pos == end) {
734  // Empty termlist.
735  continue;
736  }
737 
738  Xapian::termcount doclen, termlist_size;
739 
740  // Read doclen
741  if (!unpack_uint(&pos, end, &doclen)) {
742  if (out) {
743  if (pos != 0) {
744  *out << "doclen out of range";
745  } else {
746  *out << "Unexpected end of data when reading doclen";
747  }
748  *out << endl;
749  }
750  ++errors;
751  continue;
752  }
753 
754  // Check doclen with doclen lower and upper bounds
755  if (doclen > version_file.get_doclength_upper_bound()) {
756  if (out)
757  *out << "doclen " << doclen << " > upper bound "
758  << version_file.get_doclength_upper_bound() << endl;
759  ++errors;
760  } else if (doclen < version_file.get_doclength_lower_bound() &&
761  doclen != 0) {
762  if (out)
763  *out << "doclen " << doclen << " < lower bound "
764  << version_file.get_doclength_lower_bound() << endl;
765  ++errors;
766  }
767 
768  // Read termlist_size
769  if (!unpack_uint(&pos, end, &termlist_size)) {
770  if (out) {
771  if (pos != 0) {
772  *out << "termlist_size out of range";
773  } else {
774  *out << "Unexpected end of data when reading "
775  "termlist_size";
776  }
777  *out << endl;
778  }
779  ++errors;
780  continue;
781  }
782 
783  Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
784  string current_tname;
785 
786  bool bad = false;
787  while (pos != end) {
788  Xapian::doccount current_wdf = 0;
789  bool got_wdf = false;
790  // If there was a previous term, how much to reuse.
791  if (!current_tname.empty()) {
792  string::size_type len = static_cast<unsigned char>(*pos++);
793  if (len > current_tname.length()) {
794  // The wdf was squeezed into the same byte.
795  current_wdf = len / (current_tname.length() + 1) - 1;
796  len %= (current_tname.length() + 1);
797  got_wdf = true;
798  }
799  current_tname.resize(len);
800  }
801  // What to append (note len must be positive, since just truncating
802  // always takes us backwards in the sort order)
803  string::size_type len = static_cast<unsigned char>(*pos++);
804  current_tname.append(pos, len);
805  pos += len;
806 
807  if (!got_wdf) {
808  // Read wdf
809  if (!unpack_uint(&pos, end, &current_wdf)) {
810  if (out) {
811  if (pos == 0) {
812  *out << "Unexpected end of data when reading "
813  "termlist current_wdf";
814  } else {
815  *out << "Size of wdf out of range in termlist";
816  }
817  *out << endl;
818  }
819  ++errors;
820  bad = true;
821  break;
822  }
823  }
824 
825  ++actual_termlist_size;
826  actual_doclen += current_wdf;
827  }
828  if (bad) {
829  continue;
830  }
831 
832  if (termlist_size != actual_termlist_size) {
833  if (out)
834  *out << "termlist_size != # of entries in termlist" << endl;
835  ++errors;
836  }
837  if (doclen != actual_doclen) {
838  if (out)
839  *out << "doclen != sum(wdf)" << endl;
840  ++errors;
841  }
842 
843  // + 1 so that did is a valid subscript.
844  if (doclens.size() <= did) doclens.resize(did + 1);
845  doclens[did] = actual_doclen;
846  }
847 
848  Xapian::doccount doccount = version_file.get_doccount();
849 
850  // glass doesn't store a termlist entry if there are no terms, so we
851  // can only check there aren't more termlists than documents.
852  if (num_termlists > doccount) {
853  if (out)
854  *out << "More termlists (" << num_termlists
855  << ") then documents (" << doccount << ")" << endl;
856  ++errors;
857  }
858 
859  // glass doesn't store a valueslots used entry if there are no terms,
860  // so we can only check there aren't more such entries than documents.
861  if (num_slotsused_entries > doccount) {
862  if (out)
863  *out << "More slots-used entries (" << num_slotsused_entries
864  << ") then documents (" << doccount << ")" << endl;
865  ++errors;
866  }
867  } else if (strcmp(tablename, "position") == 0) {
868  // Now check the contents of the position table.
869  for ( ; !cursor->after_end(); cursor->next()) {
870  string & key = cursor->current_key;
871 
872  // Get docid from key.
873  const char * pos = key.data();
874  const char * end = pos + key.size();
875 
876  string term;
877  if (!unpack_string_preserving_sort(&pos, end, term)) {
878  if (out)
879  *out << "Error unpacking term from key" << endl;
880  ++errors;
881  continue;
882  }
883 
884  Xapian::docid did;
885  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
886  if (out)
887  *out << "Error unpacking docid from key" << endl;
888  ++errors;
889  continue;
890  }
891 
892  if (pos != end) {
893  if (out)
894  *out << "Extra junk in key with docid " << did << endl;
895  ++errors;
896  continue;
897  }
898 
899  if (did > db_last_docid) {
900  if (out)
901  *out << "document id " << did << " in position table "
902  "is larger than get_last_docid() "
903  << db_last_docid << endl;
904  ++errors;
905  } else if (!doclens.empty()) {
906  // In glass, a document without terms doesn't get a
907  // termlist entry, so we can't tell the difference
908  // easily.
909  if (did >= doclens.size() || doclens[did] == 0) {
910  if (out)
911  *out << "Position list entry for document " << did
912  << " which doesn't exist or has no terms" << endl;
913  ++errors;
914  }
915  }
916 
917  cursor->read_tag();
918 
919  const string & data = cursor->current_tag;
920  pos = data.data();
921  end = pos + data.size();
922 
923  Xapian::termpos pos_last;
924  if (!unpack_uint(&pos, end, &pos_last)) {
925  if (out)
926  *out << tablename << " table: Position list data corrupt"
927  << endl;
928  ++errors;
929  continue;
930  }
931  if (pos == end) {
932  // Special case for single entry position list.
933  } else {
934  // Skip the header we just read.
935  BitReader rd(data, pos - data.data());
936  Xapian::termpos pos_first = rd.decode(pos_last);
937  Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
938  rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
939  Xapian::termpos p = rd.decode_interpolative_next();
940  bool ok = true;
941  while (p != pos_last) {
942  Xapian::termpos pos_prev = p;
943  p = rd.decode_interpolative_next();
944  if (p <= pos_prev) {
945  if (out)
946  *out << tablename << " table: Positions not "
947  "strictly monotonically increasing" << endl;
948  ++errors;
949  ok = false;
950  break;
951  }
952  }
953  if (ok && !rd.check_all_gone()) {
954  if (out)
955  *out << tablename << " table: Junk after position data"
956  << endl;
957  ++errors;
958  }
959  }
960  }
961  } else {
962  if (out)
963  *out << tablename << " table: Full structure check not "
964  "implemented, checking readability\n";
965  for ( ; !cursor->after_end(); cursor->next()) {
966  cursor->read_tag();
967  }
968  }
969 
970  if (out) {
971  if (!errors)
972  *out << tablename << " table structure checked OK\n";
973  else
974  *out << tablename << " table errors found: " << errors << "\n";
975  *out << endl;
976  }
977 
978  return errors;
979 }
Xapian::termcount get_doclength_upper_bound() const
GlassVersion class.
Class to hold statistics for a given slot.
Definition: valuestats.h:29
Statistics about values.
#define AssertEq(A, B)
Definition: omassert.h:124
static bool is_user_metadata_key(const string &key)
The GlassVersion class manages the revision files.
Definition: glass_version.h:94
static const char * opts
STL namespace.
Definitions, types, etc for use inside glass.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Definition: valuestats.h:41
Utility functions for testing files.
size_t check_glass_table(const char *tablename, const string &db_dir, int fd, off_t offset_, const GlassVersion &version_file, int opts, vector< Xapian::termcount > &doclens, ostream *out)
#define GLASS_TABLE_EXTENSION
Glass table extension.
Definition: glass_defs.h:27
Xapian::docid get_last_docid() const
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
Definition: valuestats.h:33
std::string lower_bound
A lower bound on the values stored in the given value slot.
Definition: valuestats.h:37
Xapian::doccount freq_real
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Btree checking.
Public interfaces for the Xapian library.
Xapian::termcount get_doclength_lower_bound() const
Read a stream created by BitWriter.
Definition: bitstream.h:64
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:562
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode an "sort preserved" unsigned integer from a string.
Definition: pack.h:318
Btree implementation.
Classes to encode/decode a bitstream.
static GlassTableCheck * check(const char *tablename, const std::string &path, int fd, off_t offset_, const GlassVersion &version_file, int opts, std::ostream *out)
Definition: glass_check.cc:263
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
Definition: bitstream.cc:174
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:69
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Interface to Btree cursors.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:413
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:504
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Check a glass table.
bool file_exists(const char *path)
Test if a file exists.
Definition: filetests.h:39
Types used internally.
Wrapper around standard unique_ptr template.
Xapian::doccount get_doccount() const