xapian-core  1.4.25
chert_dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2016 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "chert_dbcheck.h"
26 
27 #include "bitstream.h"
28 
29 #include "internaltypes.h"
30 
31 #include "chert_check.h"
32 #include "chert_cursor.h"
33 #include "chert_table.h"
34 #include "chert_types.h"
35 #include "pack.h"
36 #include "backends/valuestats.h"
37 
38 #include <xapian.h>
39 
40 #include "autoptr.h"
41 #include <ostream>
42 #include <vector>
43 
44 using namespace std;
45 
/** Test whether a postlist-table key is a user metadata key.
 *
 *  @param key  The raw key to classify.
 *
 *  @return true if @a key is at least two bytes long and begins with the
 *	    byte pair 0x00 0xC0; false otherwise.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to carry the two-byte prefix.
    if (key.size() < 2) return false;
    // The first byte of the prefix must be a zero byte.
    if (key[0] != '\0') return false;
    return key[1] == '\xc0';
}
51 
52 struct VStats : public ValueStats {
54 
55  VStats() : ValueStats(), freq_real(0) {}
56 };
57 
58 size_t
59 check_chert_table(const char * tablename, const string& dir,
60  chert_revision_number_t * rev_ptr, int opts,
61  vector<Xapian::termcount> & doclens,
63  ostream * out)
64 {
65  string filename = dir;
66  filename += '/';
67  filename += tablename;
68  filename += '.';
69 
70  try {
71  // Check the btree structure.
72  ChertTableCheck::check(tablename, filename, rev_ptr, opts, out);
73  } catch (const Xapian::DatabaseError & e) {
74  if (out)
75  *out << "Failed to check B-tree: " << e.get_description() << endl;
76  return 1;
77  }
78 
79  // Now check the chert structures inside the btree.
80  ChertTable table(tablename, filename, true);
81  if (rev_ptr && *rev_ptr) {
82  if (!table.open(*rev_ptr)) {
83  if (out)
84  *out << "Failed to reopen table after it checked OK" << endl;
85  return 1;
86  }
87  } else {
88  table.open();
89  }
90  AutoPtr<ChertCursor> cursor(table.cursor_get());
91 
92  size_t errors = 0;
93 
94  cursor->find_entry(string());
95  cursor->next(); // Skip the empty entry.
96 
97  if (strcmp(tablename, "postlist") == 0) {
98  // Now check the structure of each postlist in the table.
99  map<Xapian::valueno, VStats> valuestats;
100  string current_term;
101  Xapian::docid lastdid = 0;
102  Xapian::termcount termfreq = 0, collfreq = 0;
103  Xapian::termcount tf = 0, cf = 0;
104  Xapian::doccount num_doclens = 0;
105  bool have_metainfo_key = false;
106 
107  // The first key/tag pair should be the METAINFO - though this may be
108  // missing if the table only contains user-metadata.
109  if (!cursor->after_end()) {
110  if (cursor->current_key == string("", 1)) {
111  have_metainfo_key = true;
112  cursor->read_tag();
113  // Check format of the METAINFO key.
114  Xapian::totallength total_doclen;
115  Xapian::docid last_docid;
116  Xapian::termcount doclen_lbound;
117  Xapian::termcount doclen_ubound;
118  Xapian::termcount wdf_ubound;
119 
120  const char * data = cursor->current_tag.data();
121  const char * end = data + cursor->current_tag.size();
122  if (!unpack_uint(&data, end, &last_docid)) {
123  if (out)
124  *out << "Tag containing meta information is corrupt (couldn't read last_docid)." << endl;
125  ++errors;
126  } else if (!unpack_uint(&data, end, &doclen_lbound)) {
127  if (out)
128  *out << "Tag containing meta information is corrupt (couldn't read doclen_lbound)." << endl;
129  ++errors;
130  } else if (!unpack_uint(&data, end, &wdf_ubound)) {
131  if (out)
132  *out << "Tag containing meta information is corrupt (couldn't read wdf_ubound)." << endl;
133  ++errors;
134  } else if (!unpack_uint(&data, end, &doclen_ubound)) {
135  if (out)
136  *out << "Tag containing meta information is corrupt (couldn't read doclen_ubound)." << endl;
137  ++errors;
138  } else if (!unpack_uint_last(&data, end, &total_doclen)) {
139  if (out)
140  *out << "Tag containing meta information is corrupt (couldn't read total_doclen)." << endl;
141  ++errors;
142  } else if (data != end) {
143  if (out)
144  *out << "Tag containing meta information is corrupt (junk at end)." << endl;
145  ++errors;
146  }
147  cursor->next();
148  }
149  }
150 
151  bool seen_doclen_initial_chunk = false;
152  for ( ; !cursor->after_end(); cursor->next()) {
153  string & key = cursor->current_key;
154 
155  if (is_user_metadata_key(key)) {
156  // User metadata can be anything, so we can't do any particular
157  // checks on it other than to check that the tag isn't empty.
158  cursor->read_tag();
159  if (cursor->current_tag.empty()) {
160  if (out)
161  *out << "User metadata item is empty" << endl;
162  ++errors;
163  }
164  continue;
165  }
166 
167  if (!have_metainfo_key) {
168  have_metainfo_key = true;
169  if (out)
170  *out << "METAINFO key missing from postlist table" << endl;
171  ++errors;
172  }
173 
174  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
175  // doclen chunk
176  const char * pos, * end;
177  Xapian::docid did = 1;
178  if (key.size() > 2) {
179  // Non-initial chunk.
180  if (!seen_doclen_initial_chunk) {
181  if (out)
182  *out << "Doclen initial chunk missing" << endl;
183  ++errors;
184  }
185  pos = key.data();
186  end = pos + key.size();
187  pos += 2;
188  if (!C_unpack_uint_preserving_sort(&pos, end, &did)) {
189  if (out)
190  *out << "Error unpacking docid from doclen key" << endl;
191  ++errors;
192  continue;
193  }
194  if (did <= lastdid) {
195  if (out)
196  *out << "First did in this chunk is <= last in "
197  "prev chunk" << endl;
198  ++errors;
199  }
200  }
201 
202  cursor->read_tag();
203  pos = cursor->current_tag.data();
204  end = pos + cursor->current_tag.size();
205  if (key.size() == 2) {
206  // Initial chunk.
207  seen_doclen_initial_chunk = true;
208  if (end - pos < 2 || pos[0] || pos[1]) {
209  if (out)
210  *out << "Initial doclen chunk has nonzero dummy fields" << endl;
211  ++errors;
212  continue;
213  }
214  pos += 2;
215  if (!unpack_uint(&pos, end, &did)) {
216  if (out)
217  *out << "Failed to unpack firstdid for doclen" << endl;
218  ++errors;
219  continue;
220  }
221  ++did;
222  }
223 
224  bool is_last_chunk;
225  if (!unpack_bool(&pos, end, &is_last_chunk)) {
226  if (out)
227  *out << "Failed to unpack last chunk flag for doclen" << endl;
228  ++errors;
229  continue;
230  }
231  // Read what the final document ID in this chunk is.
232  if (!unpack_uint(&pos, end, &lastdid)) {
233  if (out)
234  *out << "Failed to unpack increase to last" << endl;
235  ++errors;
236  continue;
237  }
238  lastdid += did;
239  bool bad = false;
240  while (true) {
241  Xapian::termcount doclen;
242  if (!unpack_uint(&pos, end, &doclen)) {
243  if (out)
244  *out << "Failed to unpack doclen" << endl;
245  ++errors;
246  bad = true;
247  break;
248  }
249 
250  ++num_doclens;
251 
252  if (did > db_last_docid) {
253  if (out)
254  *out << "document id " << did << " in doclen "
255  "stream is larger than get_last_docid() "
256  << db_last_docid << endl;
257  ++errors;
258  }
259 
260  if (!doclens.empty()) {
261  // In chert, a document without terms doesn't get a
262  // termlist entry.
263  Xapian::termcount termlist_doclen = 0;
264  if (did < doclens.size())
265  termlist_doclen = doclens[did];
266 
267  if (doclen != termlist_doclen) {
268  if (out)
269  *out << "document id " << did << ": length "
270  << doclen << " doesn't match "
271  << termlist_doclen << " in the termlist "
272  "table" << endl;
273  ++errors;
274  }
275  }
276 
277  if (pos == end) break;
278 
279  Xapian::docid inc;
280  if (!unpack_uint(&pos, end, &inc)) {
281  if (out)
282  *out << "Failed to unpack docid increase" << endl;
283  ++errors;
284  bad = true;
285  break;
286  }
287  ++inc;
288  did += inc;
289  if (did > lastdid) {
290  if (out)
291  *out << "docid " << did << " > last docid "
292  << lastdid << endl;
293  ++errors;
294  }
295  }
296  if (bad) {
297  continue;
298  }
299  if (is_last_chunk) {
300  if (did != lastdid) {
301  if (out)
302  *out << "lastdid " << lastdid << " != last did "
303  << did << endl;
304  ++errors;
305  }
306  }
307 
308  continue;
309  }
310 
311  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
312  // Value stats.
313  const char * p = key.data();
314  const char * end = p + key.length();
315  p += 2;
316  Xapian::valueno slot;
317  if (!unpack_uint_last(&p, end, &slot)) {
318  if (out)
319  *out << "Bad valuestats key (no slot)" << endl;
320  ++errors;
321  continue;
322  }
323 
324  cursor->read_tag();
325  p = cursor->current_tag.data();
326  end = p + cursor->current_tag.size();
327 
328  VStats & v = valuestats[slot];
329  if (!unpack_uint(&p, end, &v.freq)) {
330  if (out) {
331  if (*p == 0) {
332  *out << "Incomplete stats item in value table";
333  } else {
334  *out << "Frequency statistic in value table is too large";
335  }
336  *out << endl;
337  }
338  ++errors;
339  continue;
340  }
341  if (!unpack_string(&p, end, v.lower_bound)) {
342  if (out) {
343  if (*p == 0) {
344  *out << "Incomplete stats item in value table";
345  } else {
346  *out << "Lower bound statistic in value table is too large";
347  }
348  *out << endl;
349  }
350  ++errors;
351  continue;
352  }
353  size_t len = end - p;
354  if (len == 0) {
355  v.upper_bound = v.lower_bound;
356  } else {
357  v.upper_bound.assign(p, len);
358  }
359 
360  continue;
361  }
362 
363  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
364  // Value stream chunk.
365  const char * p = key.data();
366  const char * end = p + key.length();
367  p += 2;
368  Xapian::valueno slot;
369  if (!unpack_uint(&p, end, &slot)) {
370  if (out)
371  *out << "Bad value chunk key (no slot)" << endl;
372  ++errors;
373  continue;
374  }
375  Xapian::docid did;
376  if (!C_unpack_uint_preserving_sort(&p, end, &did)) {
377  if (out)
378  *out << "Bad value chunk key (no docid)" << endl;
379  ++errors;
380  continue;
381  }
382  if (p != end) {
383  if (out)
384  *out << "Bad value chunk key (trailing junk)" << endl;
385  ++errors;
386  continue;
387  }
388 
389  VStats & v = valuestats[slot];
390 
391  cursor->read_tag();
392  p = cursor->current_tag.data();
393  end = p + cursor->current_tag.size();
394 
395  while (true) {
396  string value;
397  if (!unpack_string(&p, end, value)) {
398  if (out)
399  *out << "Failed to unpack value from chunk" << endl;
400  ++errors;
401  break;
402  }
403 
404  ++v.freq_real;
405 
406  // FIXME: Cross-check that docid did has value slot (and
407  // vice versa - that there's a value here if the slot entry
408  // says so).
409 
410  // FIXME: Check if the bounds are tight? Or is that better
411  // as a separate tool which can also update the bounds?
412  if (value < v.lower_bound) {
413  if (out)
414  *out << "Value slot " << slot << " has value "
415  "below lower bound: '" << value << "' < '"
416  << v.lower_bound << "'" << endl;
417  ++errors;
418  } else if (value > v.upper_bound) {
419  if (out)
420  *out << "Value slot " << slot << " has value "
421  "above upper bound: '" << value << "' > '"
422  << v.upper_bound << "'" << endl;
423  ++errors;
424  }
425 
426  if (p == end) break;
427  Xapian::docid delta;
428  if (!unpack_uint(&p, end, &delta)) {
429  if (out)
430  *out << "Failed to unpack docid delta from chunk"
431  << endl;
432  ++errors;
433  break;
434  }
435  Xapian::docid new_did = did + delta + 1;
436  if (new_did <= did) {
437  if (out)
438  *out << "docid overflowed in value chunk" << endl;
439  ++errors;
440  break;
441  }
442  did = new_did;
443 
444  if (did > db_last_docid) {
445  if (out)
446  *out << "document id " << did << " in value chunk "
447  "is larger than get_last_docid() "
448  << db_last_docid << endl;
449  ++errors;
450  }
451  }
452  continue;
453  }
454 
455  const char * pos, * end;
456 
457  // Get term from key.
458  pos = key.data();
459  end = pos + key.size();
460 
461  string term;
462  Xapian::docid did;
463  if (!unpack_string_preserving_sort(&pos, end, term)) {
464  if (out)
465  *out << "Error unpacking termname from key" << endl;
466  ++errors;
467  continue;
468  }
469  if (!current_term.empty() && term != current_term) {
470  // The term changed unexpectedly.
471  if (pos == end) {
472  if (out)
473  *out << "No last chunk for term '" << current_term
474  << "'" << endl;
475  current_term.resize(0);
476  } else {
477  if (out)
478  *out << "Mismatch in follow-on chunk in posting list "
479  "for term '" << current_term << "' (got '"
480  << term << "')" << endl;
481  current_term = term;
482  tf = cf = 0;
483  lastdid = 0;
484  }
485  ++errors;
486  }
487  if (pos == end) {
488  // First chunk.
489  if (term == current_term) {
490  // This probably isn't possible.
491  if (out)
492  *out << "First posting list chunk for term '" << term
493  << "' follows previous chunk for the same term"
494  << endl;
495  ++errors;
496  }
497  current_term = term;
498  tf = cf = 0;
499 
500  // Unpack extra header from first chunk.
501  cursor->read_tag();
502  pos = cursor->current_tag.data();
503  end = pos + cursor->current_tag.size();
504  if (!unpack_uint(&pos, end, &termfreq)) {
505  if (out)
506  *out << "Failed to unpack termfreq for term '" << term
507  << "'" << endl;
508  ++errors;
509  continue;
510  }
511  if (!unpack_uint(&pos, end, &collfreq)) {
512  if (out)
513  *out << "Failed to unpack collfreq for term '" << term
514  << "'" << endl;
515  ++errors;
516  continue;
517  }
518  if (!unpack_uint(&pos, end, &did)) {
519  if (out)
520  *out << "Failed to unpack firstdid for term '" << term
521  << "'" << endl;
522  ++errors;
523  continue;
524  }
525  ++did;
526  } else {
527  // Continuation chunk.
528  if (current_term.empty()) {
529  if (out)
530  *out << "First chunk for term '" << current_term
531  << "' is a continuation chunk" << endl;
532  ++errors;
533  current_term = term;
534  }
535  AssertEq(current_term, term);
536  if (!C_unpack_uint_preserving_sort(&pos, end, &did)) {
537  if (out)
538  *out << "Failed to unpack did from key" << endl;
539  ++errors;
540  continue;
541  }
542  if (did <= lastdid) {
543  if (out)
544  *out << "First did in this chunk is <= last in "
545  "prev chunk" << endl;
546  ++errors;
547  }
548  cursor->read_tag();
549  pos = cursor->current_tag.data();
550  end = pos + cursor->current_tag.size();
551  }
552 
553  bool is_last_chunk;
554  if (!unpack_bool(&pos, end, &is_last_chunk)) {
555  if (out)
556  *out << "Failed to unpack last chunk flag" << endl;
557  ++errors;
558  continue;
559  }
560  // Read what the final document ID in this chunk is.
561  if (!unpack_uint(&pos, end, &lastdid)) {
562  if (out)
563  *out << "Failed to unpack increase to last" << endl;
564  ++errors;
565  continue;
566  }
567  lastdid += did;
568  bool bad = false;
569  while (true) {
570  Xapian::termcount wdf;
571  if (!unpack_uint(&pos, end, &wdf)) {
572  if (out)
573  *out << "Failed to unpack wdf" << endl;
574  ++errors;
575  bad = true;
576  break;
577  }
578  ++tf;
579  cf += wdf;
580 
581  if (pos == end) break;
582 
583  Xapian::docid inc;
584  if (!unpack_uint(&pos, end, &inc)) {
585  if (out)
586  *out << "Failed to unpack docid increase" << endl;
587  ++errors;
588  bad = true;
589  break;
590  }
591  ++inc;
592  did += inc;
593  if (did > lastdid) {
594  if (out)
595  *out << "docid " << did << " > last docid " << lastdid
596  << endl;
597  ++errors;
598  }
599  }
600  if (bad) {
601  continue;
602  }
603  if (is_last_chunk) {
604  if (tf != termfreq) {
605  if (out)
606  *out << "termfreq " << termfreq << " != # of entries "
607  << tf << endl;
608  ++errors;
609  }
610  if (cf != collfreq) {
611  if (out)
612  *out << "collfreq " << collfreq << " != sum wdf " << cf
613  << endl;
614  ++errors;
615  }
616  if (did != lastdid) {
617  if (out)
618  *out << "lastdid " << lastdid << " != last did " << did
619  << endl;
620  ++errors;
621  }
622  current_term.resize(0);
623  }
624  }
625  if (!current_term.empty()) {
626  if (out)
627  *out << "Last term '" << current_term << "' has no last chunk"
628  << endl;
629  ++errors;
630  }
631 
632  if (num_doclens != doccount && doccount != Xapian::doccount(-1)) {
633  if (out)
634  *out << "Document length list has " << num_doclens
635  << " entries, should be " << doccount << endl;
636  ++errors;
637  }
638 
639  map<Xapian::valueno, VStats>::const_iterator i;
640  for (i = valuestats.begin(); i != valuestats.end(); ++i) {
641  if (i->second.freq != i->second.freq_real) {
642  if (out)
643  *out << "Value stats frequency for slot " << i->first
644  << " is " << i->second.freq << " but recounting "
645  "gives " << i->second.freq_real << endl;
646  ++errors;
647  }
648  }
649  } else if (strcmp(tablename, "record") == 0) {
650  if (table.get_entry_count() != doccount &&
651  doccount != Xapian::doccount(-1)) {
652  if (out)
653  *out << "Document data entry count (" << table.get_entry_count()
654  << ") != get_doccount() (" << doccount << ")" << endl;
655  ++errors;
656  }
657 
658  // Now check the contents of the record table. Any data is valid as
659  // the tag so we don't check the tags.
660  for ( ; !cursor->after_end(); cursor->next()) {
661  string & key = cursor->current_key;
662 
663  // Get docid from key.
664  const char * pos = key.data();
665  const char * end = pos + key.size();
666 
667  Xapian::docid did;
668  if (!C_unpack_uint_preserving_sort(&pos, end, &did)) {
669  if (out)
670  *out << "Error unpacking docid from key" << endl;
671  ++errors;
672  } else if (pos != end) {
673  if (out)
674  *out << "Extra junk in key" << endl;
675  ++errors;
676  } else {
677  if (did > db_last_docid) {
678  if (out)
679  *out << "document id " << did << " in docdata table "
680  "is larger than get_last_docid() "
681  << db_last_docid << endl;
682  ++errors;
683  }
684  }
685  }
686  } else if (strcmp(tablename, "termlist") == 0) {
687  // Now check the contents of the termlist table.
688  Xapian::doccount num_termlists = 0;
689  Xapian::doccount num_slotsused_entries = 0;
690  for ( ; !cursor->after_end(); cursor->next()) {
691  string & key = cursor->current_key;
692 
693  // Get docid from key.
694  const char * pos = key.data();
695  const char * end = pos + key.size();
696 
697  Xapian::docid did;
698  if (!C_unpack_uint_preserving_sort(&pos, end, &did)) {
699  if (out)
700  *out << "Error unpacking docid from key" << endl;
701  ++errors;
702  continue;
703  }
704 
705  if (did > db_last_docid) {
706  if (out)
707  *out << "document id " << did << " in termlist table "
708  "is larger than get_last_docid() "
709  << db_last_docid << endl;
710  ++errors;
711  }
712 
713  if (end - pos == 1 && *pos == '\0') {
714  // Value slots used entry.
715  ++num_slotsused_entries;
716  cursor->read_tag();
717 
718  pos = cursor->current_tag.data();
719  end = pos + cursor->current_tag.size();
720 
721  if (pos == end) {
722  if (out)
723  *out << "Empty value slots used tag" << endl;
724  ++errors;
725  continue;
726  }
727 
728  Xapian::valueno prev_slot;
729  if (!unpack_uint(&pos, end, &prev_slot)) {
730  if (out)
731  *out << "Value slot encoding corrupt" << endl;
732  ++errors;
733  continue;
734  }
735 
736  while (pos != end) {
737  Xapian::valueno slot;
738  if (!unpack_uint(&pos, end, &slot)) {
739  if (out)
740  *out << "Value slot encoding corrupt" << endl;
741  ++errors;
742  break;
743  }
744  slot += prev_slot + 1;
745  if (slot <= prev_slot) {
746  if (out)
747  *out << "Value slot number overflowed ("
748  << prev_slot << " -> " << slot << ")" << endl;
749  ++errors;
750  }
751  prev_slot = slot;
752  }
753  continue;
754  }
755 
756  if (pos != end) {
757  if (out)
758  *out << "Extra junk in key" << endl;
759  ++errors;
760  continue;
761  }
762 
763  ++num_termlists;
764  cursor->read_tag();
765 
766  pos = cursor->current_tag.data();
767  end = pos + cursor->current_tag.size();
768 
769  if (pos == end) {
770  // Empty termlist.
771  continue;
772  }
773 
774  Xapian::termcount doclen, termlist_size;
775 
776  // Read doclen
777  if (!unpack_uint(&pos, end, &doclen)) {
778  if (out) {
779  if (pos != 0) {
780  *out << "doclen out of range";
781  } else {
782  *out << "Unexpected end of data when reading doclen";
783  }
784  *out << endl;
785  }
786  ++errors;
787  continue;
788  }
789 
790  // Read termlist_size
791  if (!unpack_uint(&pos, end, &termlist_size)) {
792  if (out) {
793  if (pos != 0) {
794  *out << "termlist_size out of range";
795  } else {
796  *out << "Unexpected end of data when reading "
797  "termlist_size";
798  }
799  *out << endl;
800  }
801  ++errors;
802  continue;
803  }
804 
805  Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
806  string current_tname;
807 
808  bool bad = false;
809  while (pos != end) {
810  Xapian::doccount current_wdf = 0;
811  bool got_wdf = false;
812  // If there was a previous term, how much to reuse.
813  if (!current_tname.empty()) {
814  string::size_type len = static_cast<unsigned char>(*pos++);
815  if (len > current_tname.length()) {
816  // The wdf was squeezed into the same byte.
817  current_wdf = len / (current_tname.length() + 1) - 1;
818  len %= (current_tname.length() + 1);
819  got_wdf = true;
820  }
821  current_tname.resize(len);
822  }
823  // What to append (note len must be positive, since just truncating
824  // always takes us backwards in the sort order)
825  string::size_type len = static_cast<unsigned char>(*pos++);
826  current_tname.append(pos, len);
827  pos += len;
828 
829  if (!got_wdf) {
830  // Read wdf
831  if (!unpack_uint(&pos, end, &current_wdf)) {
832  if (out) {
833  if (pos == 0) {
834  *out << "Unexpected end of data when reading "
835  "termlist current_wdf";
836  } else {
837  *out << "Size of wdf out of range in termlist";
838  }
839  *out << endl;
840  }
841  ++errors;
842  bad = true;
843  break;
844  }
845  }
846 
847  ++actual_termlist_size;
848  actual_doclen += current_wdf;
849  }
850  if (bad) {
851  continue;
852  }
853 
854  if (termlist_size != actual_termlist_size) {
855  if (out)
856  *out << "termlist_size != # of entries in termlist" << endl;
857  ++errors;
858  }
859  if (doclen != actual_doclen) {
860  if (out)
861  *out << "doclen != sum(wdf)" << endl;
862  ++errors;
863  }
864 
865  // + 1 so that did is a valid subscript.
866  if (doclens.size() <= did) doclens.resize(did + 1);
867  doclens[did] = actual_doclen;
868  }
869 
870  if (num_termlists != doccount && doccount != Xapian::doccount(-1)) {
871  if (out)
872  *out << "Number of termlists (" << num_termlists
873  << ") != get_doccount() (" << doccount << ")" << endl;
874  ++errors;
875  }
876 
877  // chert doesn't store a valueslots used entry if there are no terms,
878  // so we can only check there aren't more such entries than documents.
879  if (num_slotsused_entries > doccount &&
880  doccount != Xapian::doccount(-1)) {
881  if (out)
882  *out << "More slots-used entries (" << num_slotsused_entries
883  << ") then documents (" << doccount << ")" << endl;
884  ++errors;
885  }
886  } else if (strcmp(tablename, "position") == 0) {
887  // Now check the contents of the position table.
888  for ( ; !cursor->after_end(); cursor->next()) {
889  string & key = cursor->current_key;
890 
891  // Get docid from key.
892  const char * pos = key.data();
893  const char * end = pos + key.size();
894 
895  Xapian::docid did;
896  if (!C_unpack_uint_preserving_sort(&pos, end, &did)) {
897  if (out)
898  *out << "Error unpacking docid from key" << endl;
899  ++errors;
900  continue;
901  }
902 
903  if (did > db_last_docid) {
904  if (out)
905  *out << "document id " << did << " in position table "
906  "is larger than get_last_docid() "
907  << db_last_docid << endl;
908  ++errors;
909  } else if (!doclens.empty()) {
910  // In chert, a document without terms doesn't get a
911  // termlist entry, so we can't tell the difference
912  // easily.
913  if (did >= doclens.size() || doclens[did] == 0) {
914  if (out)
915  *out << "Position list entry for document " << did
916  << " which doesn't exist or has no terms" << endl;
917  ++errors;
918  }
919  }
920 
921  if (pos == end) {
922  if (out)
923  *out << "No termname in key" << endl;
924  ++errors;
925  continue;
926  }
927 
928  cursor->read_tag();
929 
930  const string & data = cursor->current_tag;
931  pos = data.data();
932  end = pos + data.size();
933 
934  Xapian::termpos pos_last;
935  if (!unpack_uint(&pos, end, &pos_last)) {
936  if (out)
937  *out << tablename << " table: Position list data corrupt"
938  << endl;
939  ++errors;
940  continue;
941  }
942  if (pos == end) {
943  // Special case for single entry position list.
944  } else {
945  // Skip the header we just read.
946  BitReader rd(data, pos - data.data());
947  Xapian::termpos pos_first = rd.decode(pos_last);
948  Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
949  rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
950  Xapian::termpos p = rd.decode_interpolative_next();
951  bool ok = true;
952  while (p != pos_last) {
953  Xapian::termpos pos_prev = p;
954  p = rd.decode_interpolative_next();
955  if (p <= pos_prev) {
956  if (out)
957  *out << tablename << " table: Positions not "
958  "strictly monotonically increasing" << endl;
959  ++errors;
960  ok = false;
961  break;
962  }
963  }
964  if (ok && !rd.check_all_gone()) {
965  if (out)
966  *out << tablename << " table: Junk after position data"
967  << endl;
968  ++errors ;
969  }
970  }
971  }
972  } else {
973  if (out)
974  *out << tablename << " table: Don't know how to check structure\n"
975  << endl;
976  return errors;
977  }
978 
979  if (out) {
980  if (!errors)
981  *out << tablename << " table structure checked OK\n";
982  else
983  *out << tablename << " table errors found: " << errors << "\n";
984  *out << endl;
985  }
986 
987  return errors;
988 }
Class to hold statistics for a given slot.
Definition: valuestats.h:29
Statistics about values.
#define AssertEq(A, B)
Definition: omassert.h:124
chert_tablesize_t get_entry_count() const
Return a count of the number of entries in the table.
Definition: chert_table.h:623
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
static const char * opts
Class managing a Btree table in a Chert database.
Definition: chert_table.h:347
Btree checking.
STL namespace.
void open()
Open the btree at the latest revision.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Definition: valuestats.h:41
Types used by chert backend and the Btree manager.
unsigned int chert_revision_number_t
A type used to store a revision number for a table.
Definition: chert_types.h:40
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
Definition: valuestats.h:33
std::string lower_bound
A lower bound on the values stored in the given value slot.
Definition: valuestats.h:37
Xapian::doccount freq_real
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
bool C_unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:185
Interface to Btree cursors.
Public interfaces for the Xapian library.
static bool is_user_metadata_key(const string &key)
Read a stream created by BitWriter.
Definition: bitstream.h:64
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:562
ChertCursor * cursor_get() const
Get a cursor for reading from the table.
Classes to encode/decode a bitstream.
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
Definition: bitstream.cc:176
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:69
size_t check_chert_table(const char *tablename, const string &dir, chert_revision_number_t *rev_ptr, int opts, vector< Xapian::termcount > &doclens, Xapian::doccount doccount, Xapian::docid db_last_docid, ostream *out)
Btree implementation.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Check a chert table.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:413
static void check(const char *tablename, const std::string &path, chert_revision_number_t *rev_ptr, int opts, std::ostream *out)
Definition: chert_check.cc:235
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:504
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
DatabaseError indicates some sort of database related error.
Definition: error.h:367
Types used internally.
Wrapper around standard unique_ptr template.