xapian-core  1.4.19
glass_dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2018 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "glass_dbcheck.h"
26 
27 #include "bitstream.h"
28 
29 #include "internaltypes.h"
30 
31 #include "glass_check.h"
32 #include "glass_cursor.h"
33 #include "glass_defs.h"
34 #include "glass_table.h"
35 #include "glass_version.h"
36 #include "pack.h"
37 #include "backends/valuestats.h"
38 
39 #include <xapian.h>
40 
41 #include "filetests.h"
42 #include "autoptr.h"
43 #include <ostream>
44 #include <vector>
45 
46 using namespace std;
47 
/** Test whether a postlist table key belongs to a user metadata item.
 *
 *  Such keys are at least two bytes long and start with the byte pair
 *  '\0', '\xc0'.
 *
 *  @param key  The raw key to classify.
 *
 *  @return true if @a key carries the user metadata prefix.
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    // Too short to hold the two-byte prefix.
    if (key.size() < 2)
        return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
53 
54 struct VStats : public ValueStats {
55  Xapian::doccount freq_real;
56 
57  VStats() : ValueStats(), freq_real(0) {}
58 };
59 
60 size_t
61 check_glass_table(const char * tablename, const string &db_dir, int fd,
62  off_t offset_,
63  const GlassVersion & version_file, int opts,
64  vector<Xapian::termcount> & doclens, ostream * out)
65 {
66  Xapian::docid db_last_docid = version_file.get_last_docid();
67  if (out)
68  *out << tablename << ":\n";
69  if (fd < 0) {
70  if (strcmp(tablename, "postlist") != 0) {
71  // Other filenames are created lazily, so may not exist.
72  string filename(db_dir);
73  filename += '/';
74  filename += tablename;
75  filename += "." GLASS_TABLE_EXTENSION;
76  if (!file_exists(filename)) {
77  if (out) {
78  if (strcmp(tablename, "termlist") == 0) {
79  *out << "Not present.\n";
80  } else {
81  *out << "Lazily created, and not yet used.\n";
82  }
83  *out << endl;
84  }
85  return 0;
86  }
87  }
88  }
89 
90  // Check the btree structure.
91  AutoPtr<GlassTable> table(
92  GlassTableCheck::check(tablename, db_dir, fd, offset_,
93  version_file, opts, out));
94 
95  // Now check the glass structures inside the btree.
96  AutoPtr<GlassCursor> cursor(table->cursor_get());
97 
98  size_t errors = 0;
99 
100  cursor->find_entry(string());
101  cursor->next(); // Skip the empty entry.
102 
103  if (strcmp(tablename, "postlist") == 0) {
104  // Now check the structure of each postlist in the table.
105  map<Xapian::valueno, VStats> valuestats;
106  string current_term;
107  Xapian::docid lastdid = 0;
108  Xapian::termcount termfreq = 0, collfreq = 0;
109  Xapian::termcount tf = 0, cf = 0;
110  Xapian::doccount num_doclens = 0;
111 
112  for ( ; !cursor->after_end(); cursor->next()) {
113  string & key = cursor->current_key;
114 
115  if (is_user_metadata_key(key)) {
116  // User metadata can be anything, so we can't do any particular
117  // checks on it other than to check that the tag isn't empty.
118  cursor->read_tag();
119  if (cursor->current_tag.empty()) {
120  if (out)
121  *out << "User metadata item is empty" << endl;
122  ++errors;
123  }
124  continue;
125  }
126 
127  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
128  // doclen chunk
129  const char * pos, * end;
130  Xapian::docid did = 1;
131  if (key.size() > 2) {
132  // Non-initial chunk.
133  pos = key.data();
134  end = pos + key.size();
135  pos += 2;
136  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
137  if (out)
138  *out << "Error unpacking docid from doclen key" << endl;
139  ++errors;
140  continue;
141  }
142  if (did <= lastdid) {
143  if (out)
144  *out << "First did in this doclen chunk is <= last in "
145  "prev chunk" << endl;
146  ++errors;
147  }
148  }
149 
150  cursor->read_tag();
151  pos = cursor->current_tag.data();
152  end = pos + cursor->current_tag.size();
153  if (key.size() == 2) {
154  // Initial chunk.
155  if (end - pos < 2 || pos[0] || pos[1]) {
156  if (out)
157  *out << "Initial doclen chunk has nonzero dummy fields" << endl;
158  ++errors;
159  continue;
160  }
161  pos += 2;
162  if (!unpack_uint(&pos, end, &did)) {
163  if (out)
164  *out << "Failed to unpack firstdid for doclen" << endl;
165  ++errors;
166  continue;
167  }
168  ++did;
169  }
170 
171  bool is_last_chunk;
172  if (!unpack_bool(&pos, end, &is_last_chunk)) {
173  if (out)
174  *out << "Failed to unpack last chunk flag for doclen" << endl;
175  ++errors;
176  continue;
177  }
178  // Read what the final document ID in this chunk is.
179  if (!unpack_uint(&pos, end, &lastdid)) {
180  if (out)
181  *out << "Failed to unpack increase to last" << endl;
182  ++errors;
183  continue;
184  }
185  lastdid += did;
186  bool bad = false;
187  while (true) {
188  Xapian::termcount doclen;
189  if (!unpack_uint(&pos, end, &doclen)) {
190  if (out)
191  *out << "Failed to unpack doclen" << endl;
192  ++errors;
193  bad = true;
194  break;
195  }
196 
197  ++num_doclens;
198 
199  if (did > db_last_docid) {
200  if (out)
201  *out << "document id " << did << " in doclen "
202  "stream is larger than get_last_docid() "
203  << db_last_docid << endl;
204  ++errors;
205  }
206 
207  if (!doclens.empty()) {
208  // In glass, a document without terms doesn't get a
209  // termlist entry.
210  Xapian::termcount termlist_doclen = 0;
211  if (did < doclens.size())
212  termlist_doclen = doclens[did];
213 
214  if (doclen != termlist_doclen) {
215  if (out)
216  *out << "document id " << did << ": length "
217  << doclen << " doesn't match "
218  << termlist_doclen << " in the termlist "
219  "table" << endl;
220  ++errors;
221  }
222  }
223 
224  if (pos == end) break;
225 
226  Xapian::docid inc;
227  if (!unpack_uint(&pos, end, &inc)) {
228  if (out)
229  *out << "Failed to unpack docid increase" << endl;
230  ++errors;
231  bad = true;
232  break;
233  }
234  ++inc;
235  did += inc;
236  if (did > lastdid) {
237  if (out)
238  *out << "docid " << did << " > last docid "
239  << lastdid << endl;
240  ++errors;
241  }
242  }
243  if (bad) {
244  continue;
245  }
246  if (is_last_chunk) {
247  if (did != lastdid) {
248  if (out)
249  *out << "lastdid " << lastdid << " != last did "
250  << did << endl;
251  ++errors;
252  }
253  }
254 
255  continue;
256  }
257 
258  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
259  // Value stats.
260  const char * p = key.data();
261  const char * end = p + key.length();
262  p += 2;
263  Xapian::valueno slot;
264  if (!unpack_uint_last(&p, end, &slot)) {
265  if (out)
266  *out << "Bad valuestats key (no slot)" << endl;
267  ++errors;
268  continue;
269  }
270 
271  cursor->read_tag();
272  p = cursor->current_tag.data();
273  end = p + cursor->current_tag.size();
274 
275  VStats & v = valuestats[slot];
276  if (!unpack_uint(&p, end, &v.freq)) {
277  if (out) {
278  if (*p == 0) {
279  *out << "Incomplete stats item in value table";
280  } else {
281  *out << "Frequency statistic in value table is too large";
282  }
283  *out << endl;
284  }
285  ++errors;
286  continue;
287  }
288  if (!unpack_string(&p, end, v.lower_bound)) {
289  if (out) {
290  if (*p == 0) {
291  *out << "Incomplete stats item in value table";
292  } else {
293  *out << "Lower bound statistic in value table is too large";
294  }
295  *out << endl;
296  }
297  ++errors;
298  continue;
299  }
300  size_t len = end - p;
301  if (len == 0) {
302  v.upper_bound = v.lower_bound;
303  } else {
304  v.upper_bound.assign(p, len);
305  }
306 
307  continue;
308  }
309 
310  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
311  // Value stream chunk.
312  const char * p = key.data();
313  const char * end = p + key.length();
314  p += 2;
315  Xapian::valueno slot;
316  if (!unpack_uint(&p, end, &slot)) {
317  if (out)
318  *out << "Bad value chunk key (no slot)" << endl;
319  ++errors;
320  continue;
321  }
322  Xapian::docid did;
323  if (!unpack_uint_preserving_sort(&p, end, &did)) {
324  if (out)
325  *out << "Bad value chunk key (no docid)" << endl;
326  ++errors;
327  continue;
328  }
329  if (p != end) {
330  if (out)
331  *out << "Bad value chunk key (trailing junk)" << endl;
332  ++errors;
333  continue;
334  }
335 
336  VStats & v = valuestats[slot];
337 
338  cursor->read_tag();
339  p = cursor->current_tag.data();
340  end = p + cursor->current_tag.size();
341 
342  while (true) {
343  string value;
344  if (!unpack_string(&p, end, value)) {
345  if (out)
346  *out << "Failed to unpack value from chunk" << endl;
347  ++errors;
348  break;
349  }
350 
351  ++v.freq_real;
352 
353  // FIXME: Cross-check that docid did has value slot (and
354  // vice versa - that there's a value here if the slot entry
355  // says so).
356 
357  // FIXME: Check if the bounds are tight? Or is that better
358  // as a separate tool which can also update the bounds?
359  if (value < v.lower_bound) {
360  if (out)
361  *out << "Value slot " << slot << " has value "
362  "below lower bound: '" << value << "' < '"
363  << v.lower_bound << "'" << endl;
364  ++errors;
365  } else if (value > v.upper_bound) {
366  if (out)
367  *out << "Value slot " << slot << " has value "
368  "above upper bound: '" << value << "' > '"
369  << v.upper_bound << "'" << endl;
370  ++errors;
371  }
372 
373  if (p == end) break;
374  Xapian::docid delta;
375  if (!unpack_uint(&p, end, &delta)) {
376  if (out)
377  *out << "Failed to unpack docid delta from chunk"
378  << endl;
379  ++errors;
380  break;
381  }
382  Xapian::docid new_did = did + delta + 1;
383  if (new_did <= did) {
384  if (out)
385  *out << "docid overflowed in value chunk" << endl;
386  ++errors;
387  break;
388  }
389  did = new_did;
390 
391  if (did > db_last_docid) {
392  if (out)
393  *out << "document id " << did << " in value chunk "
394  "is larger than get_last_docid() "
395  << db_last_docid << endl;
396  ++errors;
397  }
398  }
399  continue;
400  }
401 
402  const char * pos, * end;
403 
404  // Get term from key.
405  pos = key.data();
406  end = pos + key.size();
407 
408  string term;
409  Xapian::docid did;
410  if (!unpack_string_preserving_sort(&pos, end, term)) {
411  if (out)
412  *out << "Error unpacking termname from key" << endl;
413  ++errors;
414  continue;
415  }
416  if (!current_term.empty() && term != current_term) {
417  // The term changed unexpectedly.
418  if (pos == end) {
419  if (out)
420  *out << "No last chunk for term '" << current_term
421  << "'" << endl;
422  current_term.resize(0);
423  } else {
424  if (out)
425  *out << "Mismatch in follow-on chunk in posting list "
426  "for term '" << current_term << "' (got '"
427  << term << "')" << endl;
428  current_term = term;
429  tf = cf = 0;
430  lastdid = 0;
431  }
432  ++errors;
433  }
434  if (pos == end) {
435  // First chunk.
436  if (term == current_term) {
437  // This probably isn't possible.
438  if (out)
439  *out << "First posting list chunk for term '" << term
440  << "' follows previous chunk for the same term"
441  << endl;
442  ++errors;
443  }
444  current_term = term;
445  tf = cf = 0;
446 
447  // Unpack extra header from first chunk.
448  cursor->read_tag();
449  pos = cursor->current_tag.data();
450  end = pos + cursor->current_tag.size();
451  if (!unpack_uint(&pos, end, &termfreq)) {
452  if (out)
453  *out << "Failed to unpack termfreq for term '" << term
454  << "'" << endl;
455  ++errors;
456  continue;
457  }
458  if (!unpack_uint(&pos, end, &collfreq)) {
459  if (out)
460  *out << "Failed to unpack collfreq for term '" << term
461  << "'" << endl;
462  ++errors;
463  continue;
464  }
465  if (!unpack_uint(&pos, end, &did)) {
466  if (out)
467  *out << "Failed to unpack firstdid for term '" << term
468  << "'" << endl;
469  ++errors;
470  continue;
471  }
472  ++did;
473  } else {
474  // Continuation chunk.
475  if (current_term.empty()) {
476  if (out)
477  *out << "First chunk for term '" << current_term
478  << "' is a continuation chunk" << endl;
479  ++errors;
480  current_term = term;
481  }
482  AssertEq(current_term, term);
483  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
484  if (out)
485  *out << "Failed to unpack did from key" << endl;
486  ++errors;
487  continue;
488  }
489  if (did <= lastdid) {
490  if (out)
491  *out << "First did in this chunk is <= last in "
492  "prev chunk" << endl;
493  ++errors;
494  }
495  cursor->read_tag();
496  pos = cursor->current_tag.data();
497  end = pos + cursor->current_tag.size();
498  }
499 
500  bool is_last_chunk;
501  if (!unpack_bool(&pos, end, &is_last_chunk)) {
502  if (out)
503  *out << "Failed to unpack last chunk flag" << endl;
504  ++errors;
505  continue;
506  }
507  // Read what the final document ID in this chunk is.
508  if (!unpack_uint(&pos, end, &lastdid)) {
509  if (out)
510  *out << "Failed to unpack increase to last" << endl;
511  ++errors;
512  continue;
513  }
514  lastdid += did;
515  bool bad = false;
516  while (true) {
517  Xapian::termcount wdf;
518  if (!unpack_uint(&pos, end, &wdf)) {
519  if (out)
520  *out << "Failed to unpack wdf" << endl;
521  ++errors;
522  bad = true;
523  break;
524  }
525  ++tf;
526  cf += wdf;
527 
528  if (pos == end) break;
529 
530  Xapian::docid inc;
531  if (!unpack_uint(&pos, end, &inc)) {
532  if (out)
533  *out << "Failed to unpack docid increase" << endl;
534  ++errors;
535  bad = true;
536  break;
537  }
538  ++inc;
539  did += inc;
540  if (did > lastdid) {
541  if (out)
542  *out << "docid " << did << " > last docid " << lastdid
543  << endl;
544  ++errors;
545  }
546  }
547  if (bad) {
548  continue;
549  }
550  if (is_last_chunk) {
551  if (tf != termfreq) {
552  if (out)
553  *out << "termfreq " << termfreq << " != # of entries "
554  << tf << endl;
555  ++errors;
556  }
557  if (cf != collfreq) {
558  if (out)
559  *out << "collfreq " << collfreq << " != sum wdf " << cf
560  << endl;
561  ++errors;
562  }
563  if (did != lastdid) {
564  if (out)
565  *out << "lastdid " << lastdid << " != last did " << did
566  << endl;
567  ++errors;
568  }
569  current_term.resize(0);
570  }
571  }
572  if (!current_term.empty()) {
573  if (out)
574  *out << "Last term '" << current_term << "' has no last chunk"
575  << endl;
576  ++errors;
577  }
578 
579  Xapian::doccount doccount = version_file.get_doccount();
580  if (num_doclens != doccount) {
581  if (out)
582  *out << "Document length list has " << num_doclens
583  << " entries, should be " << doccount << endl;
584  ++errors;
585  }
586 
587  map<Xapian::valueno, VStats>::const_iterator i;
588  for (i = valuestats.begin(); i != valuestats.end(); ++i) {
589  if (i->second.freq != i->second.freq_real) {
590  if (out)
591  *out << "Value stats frequency for slot " << i->first
592  << " is " << i->second.freq << " but recounting "
593  "gives " << i->second.freq_real << endl;
594  ++errors;
595  }
596  }
597  } else if (strcmp(tablename, "docdata") == 0) {
598  // glass doesn't store a docdata entry if the document data is empty,
599  // so we can only check there aren't more docdata entries than
600  // documents.
601  Xapian::doccount doccount = version_file.get_doccount();
602  if (table->get_entry_count() > doccount) {
603  if (out)
604  *out << "More document data (" << table->get_entry_count()
605  << ") then documents (" << doccount << ")" << endl;
606  ++errors;
607  }
608 
609  // Now check the contents of the docdata table.
610  for ( ; !cursor->after_end(); cursor->next()) {
611  string & key = cursor->current_key;
612 
613  // Get docid from key.
614  const char * pos = key.data();
615  const char * end = pos + key.size();
616 
617  Xapian::docid did;
618  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
619  if (out)
620  *out << "Error unpacking docid from key" << endl;
621  ++errors;
622  } else if (pos != end) {
623  if (out)
624  *out << "Extra junk in key" << endl;
625  ++errors;
626  } else {
627  if (did > db_last_docid) {
628  if (out)
629  *out << "document id " << did << " in docdata table "
630  "is larger than get_last_docid() "
631  << db_last_docid << endl;
632  ++errors;
633  }
634  }
635 
636  // Fetch and decompress the document data to catch problems with
637  // the splitting into multiple items, corruption of the compressed
638  // data, etc.
639  cursor->read_tag();
640  if (cursor->current_tag.empty()) {
641  // We shouldn't store empty document data.
642  if (out)
643  *out << "Empty document data explicitly stored for "
644  "document id " << did << endl;
645  ++errors;
646  }
647  }
648  } else if (strcmp(tablename, "termlist") == 0) {
649  // Now check the contents of the termlist table.
650  Xapian::doccount num_termlists = 0;
651  Xapian::doccount num_slotsused_entries = 0;
652  for ( ; !cursor->after_end(); cursor->next()) {
653  string & key = cursor->current_key;
654 
655  // Get docid from key.
656  const char * pos = key.data();
657  const char * end = pos + key.size();
658 
659  Xapian::docid did;
660  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
661  if (out)
662  *out << "Error unpacking docid from key" << endl;
663  ++errors;
664  continue;
665  }
666 
667  if (did > db_last_docid) {
668  if (out)
669  *out << "document id " << did << " in termlist table "
670  "is larger than get_last_docid() "
671  << db_last_docid << endl;
672  ++errors;
673  }
674 
675  if (end - pos == 1 && *pos == '\0') {
676  // Value slots used entry.
677  ++num_slotsused_entries;
678  cursor->read_tag();
679 
680  pos = cursor->current_tag.data();
681  end = pos + cursor->current_tag.size();
682 
683  if (pos == end) {
684  if (out)
685  *out << "Empty value slots used tag" << endl;
686  ++errors;
687  continue;
688  }
689 
690  Xapian::valueno prev_slot;
691  if (!unpack_uint(&pos, end, &prev_slot)) {
692  if (out)
693  *out << "Value slot encoding corrupt" << endl;
694  ++errors;
695  continue;
696  }
697 
698  while (pos != end) {
699  Xapian::valueno slot;
700  if (!unpack_uint(&pos, end, &slot)) {
701  if (out)
702  *out << "Value slot encoding corrupt" << endl;
703  ++errors;
704  break;
705  }
706  slot += prev_slot + 1;
707  if (slot <= prev_slot) {
708  if (out)
709  *out << "Value slot number overflowed ("
710  << prev_slot << " -> " << slot << ")" << endl;
711  ++errors;
712  }
713  prev_slot = slot;
714  }
715  continue;
716  }
717 
718  if (pos != end) {
719  if (out)
720  *out << "Extra junk in key" << endl;
721  ++errors;
722  continue;
723  }
724 
725  ++num_termlists;
726  cursor->read_tag();
727 
728  pos = cursor->current_tag.data();
729  end = pos + cursor->current_tag.size();
730 
731  if (pos == end) {
732  // Empty termlist.
733  continue;
734  }
735 
736  Xapian::termcount doclen, termlist_size;
737 
738  // Read doclen
739  if (!unpack_uint(&pos, end, &doclen)) {
740  if (out) {
741  if (pos != 0) {
742  *out << "doclen out of range";
743  } else {
744  *out << "Unexpected end of data when reading doclen";
745  }
746  *out << endl;
747  }
748  ++errors;
749  continue;
750  }
751 
752  // Check doclen with doclen lower and upper bounds
753  if (doclen > version_file.get_doclength_upper_bound()) {
754  if (out)
755  *out << "doclen " << doclen << " > upper bound "
756  << version_file.get_doclength_upper_bound() << endl;
757  ++errors;
758  } else if (doclen < version_file.get_doclength_lower_bound() &&
759  doclen != 0) {
760  if (out)
761  *out << "doclen " << doclen << " < lower bound "
762  << version_file.get_doclength_lower_bound() << endl;
763  ++errors;
764  }
765 
766  // Read termlist_size
767  if (!unpack_uint(&pos, end, &termlist_size)) {
768  if (out) {
769  if (pos != 0) {
770  *out << "termlist_size out of range";
771  } else {
772  *out << "Unexpected end of data when reading "
773  "termlist_size";
774  }
775  *out << endl;
776  }
777  ++errors;
778  continue;
779  }
780 
781  Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
782  string current_tname;
783 
784  bool bad = false;
785  while (pos != end) {
786  Xapian::doccount current_wdf = 0;
787  bool got_wdf = false;
788  // If there was a previous term, how much to reuse.
789  if (!current_tname.empty()) {
790  string::size_type len = static_cast<unsigned char>(*pos++);
791  if (len > current_tname.length()) {
792  // The wdf was squeezed into the same byte.
793  current_wdf = len / (current_tname.length() + 1) - 1;
794  len %= (current_tname.length() + 1);
795  got_wdf = true;
796  }
797  current_tname.resize(len);
798  }
799  // What to append (note len must be positive, since just truncating
800  // always takes us backwards in the sort order)
801  string::size_type len = static_cast<unsigned char>(*pos++);
802  current_tname.append(pos, len);
803  pos += len;
804 
805  if (!got_wdf) {
806  // Read wdf
807  if (!unpack_uint(&pos, end, &current_wdf)) {
808  if (out) {
809  if (pos == 0) {
810  *out << "Unexpected end of data when reading "
811  "termlist current_wdf";
812  } else {
813  *out << "Size of wdf out of range in termlist";
814  }
815  *out << endl;
816  }
817  ++errors;
818  bad = true;
819  break;
820  }
821  }
822 
823  ++actual_termlist_size;
824  actual_doclen += current_wdf;
825  }
826  if (bad) {
827  continue;
828  }
829 
830  if (termlist_size != actual_termlist_size) {
831  if (out)
832  *out << "termlist_size != # of entries in termlist" << endl;
833  ++errors;
834  }
835  if (doclen != actual_doclen) {
836  if (out)
837  *out << "doclen != sum(wdf)" << endl;
838  ++errors;
839  }
840 
841  // + 1 so that did is a valid subscript.
842  if (doclens.size() <= did) doclens.resize(did + 1);
843  doclens[did] = actual_doclen;
844  }
845 
846  Xapian::doccount doccount = version_file.get_doccount();
847 
848  // glass doesn't store a termlist entry if there are no terms, so we
849  // can only check there aren't more termlists than documents.
850  if (num_termlists > doccount) {
851  if (out)
852  *out << "More termlists (" << num_termlists
853  << ") then documents (" << doccount << ")" << endl;
854  ++errors;
855  }
856 
857  // glass doesn't store a valueslots used entry if there are no terms,
858  // so we can only check there aren't more such entries than documents.
859  if (num_slotsused_entries > doccount) {
860  if (out)
861  *out << "More slots-used entries (" << num_slotsused_entries
862  << ") then documents (" << doccount << ")" << endl;
863  ++errors;
864  }
865  } else if (strcmp(tablename, "position") == 0) {
866  // Now check the contents of the position table.
867  for ( ; !cursor->after_end(); cursor->next()) {
868  string & key = cursor->current_key;
869 
870  // Get docid from key.
871  const char * pos = key.data();
872  const char * end = pos + key.size();
873 
874  string term;
875  if (!unpack_string_preserving_sort(&pos, end, term)) {
876  if (out)
877  *out << "Error unpacking term from key" << endl;
878  ++errors;
879  continue;
880  }
881 
882  Xapian::docid did;
883  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
884  if (out)
885  *out << "Error unpacking docid from key" << endl;
886  ++errors;
887  continue;
888  }
889 
890  if (pos != end) {
891  if (out)
892  *out << "Extra junk in key with docid " << did << endl;
893  ++errors;
894  continue;
895  }
896 
897  if (did > db_last_docid) {
898  if (out)
899  *out << "document id " << did << " in position table "
900  "is larger than get_last_docid() "
901  << db_last_docid << endl;
902  ++errors;
903  } else if (!doclens.empty()) {
904  // In glass, a document without terms doesn't get a
905  // termlist entry, so we can't tell the difference
906  // easily.
907  if (did >= doclens.size() || doclens[did] == 0) {
908  if (out)
909  *out << "Position list entry for document " << did
910  << " which doesn't exist or has no terms" << endl;
911  ++errors;
912  }
913  }
914 
915  cursor->read_tag();
916 
917  const string & data = cursor->current_tag;
918  pos = data.data();
919  end = pos + data.size();
920 
921  Xapian::termpos pos_last;
922  if (!unpack_uint(&pos, end, &pos_last)) {
923  if (out)
924  *out << tablename << " table: Position list data corrupt"
925  << endl;
926  ++errors;
927  continue;
928  }
929  if (pos == end) {
930  // Special case for single entry position list.
931  } else {
932  // Skip the header we just read.
933  BitReader rd(data, pos - data.data());
934  Xapian::termpos pos_first = rd.decode(pos_last);
935  Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
936  rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
937  Xapian::termpos p = rd.decode_interpolative_next();
938  bool ok = true;
939  while (p != pos_last) {
940  Xapian::termpos pos_prev = p;
941  p = rd.decode_interpolative_next();
942  if (p <= pos_prev) {
943  if (out)
944  *out << tablename << " table: Positions not "
945  "strictly monotonically increasing" << endl;
946  ++errors;
947  ok = false;
948  break;
949  }
950  }
951  if (ok && !rd.check_all_gone()) {
952  if (out)
953  *out << tablename << " table: Junk after position data"
954  << endl;
955  ++errors;
956  }
957  }
958  }
959  } else {
960  if (out)
961  *out << tablename << " table: Don't know how to check structure\n"
962  << endl;
963  return errors;
964  }
965 
966  if (out) {
967  if (!errors)
968  *out << tablename << " table structure checked OK\n";
969  else
970  *out << tablename << " table errors found: " << errors << "\n";
971  *out << endl;
972  }
973 
974  return errors;
975 }
Xapian::termcount get_doclength_upper_bound() const
GlassVersion class.
Class to hold statistics for a given slot.
Definition: valuestats.h:29
Statistics about values.
#define AssertEq(A, B)
Definition: omassert.h:124
static bool is_user_metadata_key(const string &key)
The GlassVersion class manages the revision files.
Definition: glass_version.h:94
static const char * opts
STL namespace.
Definitions, types, etc for use inside glass.
std::string upper_bound
An upper bound on the values stored in the given value slot.
Definition: valuestats.h:41
Utility functions for testing files.
size_t check_glass_table(const char *tablename, const string &db_dir, int fd, off_t offset_, const GlassVersion &version_file, int opts, vector< Xapian::termcount > &doclens, ostream *out)
#define GLASS_TABLE_EXTENSION
Glass table extension.
Definition: glass_defs.h:27
Xapian::docid get_last_docid() const
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
Definition: valuestats.h:33
std::string lower_bound
A lower bound on the values stored in the given value slot.
Definition: valuestats.h:37
Xapian::doccount freq_real
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Btree checking.
Public interfaces for the Xapian library.
Xapian::termcount get_doclength_lower_bound() const
Read a stream created by BitWriter.
Definition: bitstream.h:64
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:562
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:318
Btree implementation.
Classes to encode/decode a bitstream.
static GlassTableCheck * check(const char *tablename, const std::string &path, int fd, off_t offset_, const GlassVersion &version_file, int opts, std::ostream *out)
Definition: glass_check.cc:263
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
Definition: bitstream.cc:174
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:69
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Interface to Btree cursors.
Pack types into strings and unpack them again.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:413
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:504
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Check a glass table.
bool file_exists(const char *path)
Test if a file exists.
Definition: filetests.h:39
Types used internally.
Wrapper around standard unique_ptr template.
Xapian::doccount get_doccount() const