xapian-core  2.0.0
glass_dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002-2024 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "glass_dbcheck.h"
25 
26 #include "bitstream.h"
27 
28 #include "internaltypes.h"
29 
30 #include "glass_check.h"
31 #include "glass_cursor.h"
32 #include "glass_defs.h"
33 #include "glass_table.h"
34 #include "glass_version.h"
35 #include "pack.h"
36 #include "backends/valuestats.h"
37 
38 #include <xapian.h>
39 
40 #include "filetests.h"
41 #include <memory>
42 #include <ostream>
43 #include <vector>
44 
45 using namespace std;
46 
/** Test whether a postlist table key is a user metadata key.
 *
 *  @param key  The raw key to classify.
 *
 *  @return true if @a key is at least two bytes long and starts with the
 *	    bytes 0x00 0xc0; false otherwise.
 */
static inline bool
is_user_metadata_key(const string & key)
{
    // Too short to hold the two byte prefix.
    if (key.size() < 2)
	return false;
    const bool first_is_nul = (key[0] == '\0');
    return first_is_nul && key[1] == '\xc0';
}
52 
53 struct VStats : public ValueStats {
55 
56  VStats() : ValueStats(), freq_real(0) {}
57 };
58 
59 size_t
60 check_glass_table(const char* tablename, string_view db_dir, int fd,
61  off_t offset_,
62  const GlassVersion& version_file, int opts,
63  vector<Xapian::termcount>& doclens, ostream* out)
64 {
65  Xapian::docid db_last_docid = version_file.get_last_docid();
66  if (out)
67  *out << tablename << ":\n";
68  if (fd < 0) {
69  if (strcmp(tablename, "postlist") != 0) {
70  // Other filenames are created lazily, so may not exist.
71  string filename(db_dir);
72  filename += '/';
73  filename += tablename;
74  filename += "." GLASS_TABLE_EXTENSION;
75  if (!file_exists(filename)) {
76  if (out) {
77  if (strcmp(tablename, "termlist") == 0) {
78  *out << "Not present.\n";
79  } else {
80  *out << "Lazily created, and not yet used.\n";
81  }
82  *out << endl;
83  }
84  return 0;
85  }
86  }
87  }
88 
89  // Check the btree structure.
90  unique_ptr<GlassTableCheck> table(
91  GlassTableCheck::check(tablename, db_dir, fd, offset_,
92  version_file, opts, out));
93 
94  // Now check the glass structures inside the btree.
95  unique_ptr<GlassCursor> cursor(table->cursor_get());
96 
97  size_t errors = 0;
98 
99  cursor->rewind();
100  cursor->next(); // Skip the empty entry.
101 
102  if (strcmp(tablename, "postlist") == 0) {
103  // Now check the structure of each postlist in the table.
104  map<Xapian::valueno, VStats> valuestats;
105  string current_term;
106  Xapian::docid lastdid = 0;
107  Xapian::termcount termfreq = 0, collfreq = 0;
108  Xapian::termcount tf = 0, cf = 0;
109  Xapian::doccount num_doclens = 0;
110 
111  for ( ; !cursor->after_end(); cursor->next()) {
112  string & key = cursor->current_key;
113 
114  if (is_user_metadata_key(key)) {
115  // User metadata can be anything, so we can't do any particular
116  // checks on it other than to check that the tag isn't empty.
117  cursor->read_tag();
118  if (cursor->current_tag.empty()) {
119  if (out)
120  *out << "User metadata item is empty" << endl;
121  ++errors;
122  }
123  continue;
124  }
125 
126  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xe0') {
127  // doclen chunk
128  const char * pos, * end;
129  Xapian::docid did = 1;
130  if (key.size() > 2) {
131  // Non-initial chunk.
132  pos = key.data();
133  end = pos + key.size();
134  pos += 2;
135  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
136  if (out)
137  *out << "Error unpacking docid from doclen key" << endl;
138  ++errors;
139  continue;
140  }
141  if (did <= lastdid) {
142  if (out)
143  *out << "First did in this doclen chunk is <= last in "
144  "prev chunk" << endl;
145  ++errors;
146  }
147  }
148 
149  cursor->read_tag();
150  pos = cursor->current_tag.data();
151  end = pos + cursor->current_tag.size();
152  if (key.size() == 2) {
153  // Initial chunk.
154  if (end - pos < 2 || pos[0] || pos[1]) {
155  if (out)
156  *out << "Initial doclen chunk has nonzero dummy fields" << endl;
157  ++errors;
158  continue;
159  }
160  pos += 2;
161  if (!unpack_uint(&pos, end, &did)) {
162  if (out)
163  *out << "Failed to unpack firstdid for doclen" << endl;
164  ++errors;
165  continue;
166  }
167  ++did;
168  }
169 
170  bool is_last_chunk;
171  if (!unpack_bool(&pos, end, &is_last_chunk)) {
172  if (out)
173  *out << "Failed to unpack last chunk flag for doclen" << endl;
174  ++errors;
175  continue;
176  }
177  // Read what the final document ID in this chunk is.
178  if (!unpack_uint(&pos, end, &lastdid)) {
179  if (out)
180  *out << "Failed to unpack increase to last" << endl;
181  ++errors;
182  continue;
183  }
184  lastdid += did;
185  bool bad = false;
186  while (true) {
187  Xapian::termcount doclen;
188  if (!unpack_uint(&pos, end, &doclen)) {
189  if (out)
190  *out << "Failed to unpack doclen" << endl;
191  ++errors;
192  bad = true;
193  break;
194  }
195 
196  ++num_doclens;
197 
198  if (did > db_last_docid) {
199  if (out)
200  *out << "document id " << did << " in doclen "
201  "stream is larger than get_last_docid() "
202  << db_last_docid << endl;
203  ++errors;
204  }
205 
206  if (!doclens.empty()) {
207  // In glass, a document without terms doesn't get a
208  // termlist entry.
209  Xapian::termcount termlist_doclen = 0;
210  if (did < doclens.size())
211  termlist_doclen = doclens[did];
212 
213  if (doclen != termlist_doclen) {
214  if (out)
215  *out << "document id " << did << ": length "
216  << doclen << " doesn't match "
217  << termlist_doclen << " in the termlist "
218  "table" << endl;
219  ++errors;
220  }
221  }
222 
223  if (pos == end) break;
224 
225  Xapian::docid inc;
226  if (!unpack_uint(&pos, end, &inc)) {
227  if (out)
228  *out << "Failed to unpack docid increase" << endl;
229  ++errors;
230  bad = true;
231  break;
232  }
233  ++inc;
234  did += inc;
235  if (did > lastdid) {
236  if (out)
237  *out << "docid " << did << " > last docid "
238  << lastdid << endl;
239  ++errors;
240  }
241  }
242  if (bad) {
243  continue;
244  }
245  if (is_last_chunk) {
246  if (did != lastdid) {
247  if (out)
248  *out << "lastdid " << lastdid << " != last did "
249  << did << endl;
250  ++errors;
251  }
252  }
253 
254  continue;
255  }
256 
257  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd0') {
258  // Value stats.
259  const char * p = key.data();
260  const char * end = p + key.length();
261  p += 2;
262  Xapian::valueno slot;
263  if (!unpack_uint_last(&p, end, &slot)) {
264  if (out)
265  *out << "Bad valuestats key (no slot)" << endl;
266  ++errors;
267  continue;
268  }
269 
270  cursor->read_tag();
271  p = cursor->current_tag.data();
272  end = p + cursor->current_tag.size();
273 
274  VStats & v = valuestats[slot];
275  if (!unpack_uint(&p, end, &v.freq)) {
276  if (out) {
277  if (*p == 0) {
278  *out << "Incomplete stats item in value table";
279  } else {
280  *out << "Frequency statistic in value table is too large";
281  }
282  *out << endl;
283  }
284  ++errors;
285  continue;
286  }
287  if (!unpack_string(&p, end, v.lower_bound)) {
288  if (out) {
289  if (*p == 0) {
290  *out << "Incomplete stats item in value table";
291  } else {
292  *out << "Lower bound statistic in value table is too large";
293  }
294  *out << endl;
295  }
296  ++errors;
297  continue;
298  }
299  size_t len = end - p;
300  if (len == 0) {
301  v.upper_bound = v.lower_bound;
302  } else {
303  v.upper_bound.assign(p, len);
304  }
305 
306  continue;
307  }
308 
309  if (key.size() >= 2 && key[0] == '\0' && key[1] == '\xd8') {
310  // Value stream chunk.
311  const char * p = key.data();
312  const char * end = p + key.length();
313  p += 2;
314  Xapian::valueno slot;
315  if (!unpack_uint(&p, end, &slot)) {
316  if (out)
317  *out << "Bad value chunk key (no slot)" << endl;
318  ++errors;
319  continue;
320  }
321  Xapian::docid did;
322  if (!unpack_uint_preserving_sort(&p, end, &did)) {
323  if (out)
324  *out << "Bad value chunk key (no docid)" << endl;
325  ++errors;
326  continue;
327  }
328  if (p != end) {
329  if (out)
330  *out << "Bad value chunk key (trailing junk)" << endl;
331  ++errors;
332  continue;
333  }
334 
335  VStats & v = valuestats[slot];
336 
337  cursor->read_tag();
338  p = cursor->current_tag.data();
339  end = p + cursor->current_tag.size();
340 
341  while (true) {
342  string value;
343  if (!unpack_string(&p, end, value)) {
344  if (out)
345  *out << "Failed to unpack value from chunk" << endl;
346  ++errors;
347  break;
348  }
349 
350  ++v.freq_real;
351 
352  // FIXME: Cross-check that docid did has value slot (and
353  // vice versa - that there's a value here if the slot entry
354  // says so).
355 
356  // FIXME: Check if the bounds are tight? Or is that better
357  // as a separate tool which can also update the bounds?
358  if (value < v.lower_bound) {
359  if (out)
360  *out << "Value slot " << slot << " has value "
361  "below lower bound: '" << value << "' < '"
362  << v.lower_bound << "'" << endl;
363  ++errors;
364  } else if (value > v.upper_bound) {
365  if (out)
366  *out << "Value slot " << slot << " has value "
367  "above upper bound: '" << value << "' > '"
368  << v.upper_bound << "'" << endl;
369  ++errors;
370  }
371 
372  if (p == end) break;
373  Xapian::docid delta;
374  if (!unpack_uint(&p, end, &delta)) {
375  if (out)
376  *out << "Failed to unpack docid delta from chunk"
377  << endl;
378  ++errors;
379  break;
380  }
381  Xapian::docid new_did = did + delta + 1;
382  if (new_did <= did) {
383  if (out)
384  *out << "docid overflowed in value chunk" << endl;
385  ++errors;
386  break;
387  }
388  did = new_did;
389 
390  if (did > db_last_docid) {
391  if (out)
392  *out << "document id " << did << " in value chunk "
393  "is larger than get_last_docid() "
394  << db_last_docid << endl;
395  ++errors;
396  }
397  }
398  continue;
399  }
400 
401  const char * pos, * end;
402 
403  // Get term from key.
404  pos = key.data();
405  end = pos + key.size();
406 
407  string term;
408  Xapian::docid did;
409  if (!unpack_string_preserving_sort(&pos, end, term)) {
410  if (out)
411  *out << "Error unpacking termname from key" << endl;
412  ++errors;
413  continue;
414  }
415  if (!current_term.empty() && term != current_term) {
416  // The term changed unexpectedly.
417  if (pos == end) {
418  if (out)
419  *out << "No last chunk for term '" << current_term
420  << "'" << endl;
421  current_term.resize(0);
422  } else {
423  if (out)
424  *out << "Mismatch in follow-on chunk in posting list "
425  "for term '" << current_term << "' (got '"
426  << term << "')" << endl;
427  current_term = term;
428  tf = cf = 0;
429  lastdid = 0;
430  }
431  ++errors;
432  }
433  if (pos == end) {
434  // First chunk.
435  if (term == current_term) {
436  // This probably isn't possible.
437  if (out)
438  *out << "First posting list chunk for term '" << term
439  << "' follows previous chunk for the same term"
440  << endl;
441  ++errors;
442  }
443  current_term = term;
444  tf = cf = 0;
445 
446  // Unpack extra header from first chunk.
447  cursor->read_tag();
448  pos = cursor->current_tag.data();
449  end = pos + cursor->current_tag.size();
450  if (!unpack_uint(&pos, end, &termfreq)) {
451  if (out)
452  *out << "Failed to unpack termfreq for term '" << term
453  << "'" << endl;
454  ++errors;
455  continue;
456  }
457  if (!unpack_uint(&pos, end, &collfreq)) {
458  if (out)
459  *out << "Failed to unpack collfreq for term '" << term
460  << "'" << endl;
461  ++errors;
462  continue;
463  }
464  if (!unpack_uint(&pos, end, &did)) {
465  if (out)
466  *out << "Failed to unpack firstdid for term '" << term
467  << "'" << endl;
468  ++errors;
469  continue;
470  }
471  ++did;
472  } else {
473  // Continuation chunk.
474  if (current_term.empty()) {
475  if (out)
476  *out << "First chunk for term '" << term
477  << "' is a continuation chunk" << endl;
478  ++errors;
479  current_term = term;
480  }
481  AssertEq(current_term, term);
482  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
483  if (out)
484  *out << "Failed to unpack did from key" << endl;
485  ++errors;
486  continue;
487  }
488  if (did <= lastdid) {
489  if (out)
490  *out << "First did in this chunk is <= last in "
491  "prev chunk" << endl;
492  ++errors;
493  }
494  cursor->read_tag();
495  pos = cursor->current_tag.data();
496  end = pos + cursor->current_tag.size();
497  }
498 
499  bool is_last_chunk;
500  if (!unpack_bool(&pos, end, &is_last_chunk)) {
501  if (out)
502  *out << "Failed to unpack last chunk flag" << endl;
503  ++errors;
504  continue;
505  }
506  // Read what the final document ID in this chunk is.
507  if (!unpack_uint(&pos, end, &lastdid)) {
508  if (out)
509  *out << "Failed to unpack increase to last" << endl;
510  ++errors;
511  continue;
512  }
513  lastdid += did;
514  bool bad = false;
515  while (true) {
516  Xapian::termcount wdf;
517  if (!unpack_uint(&pos, end, &wdf)) {
518  if (out)
519  *out << "Failed to unpack wdf" << endl;
520  ++errors;
521  bad = true;
522  break;
523  }
524  ++tf;
525  cf += wdf;
526 
527  if (pos == end) break;
528 
529  Xapian::docid inc;
530  if (!unpack_uint(&pos, end, &inc)) {
531  if (out)
532  *out << "Failed to unpack docid increase" << endl;
533  ++errors;
534  bad = true;
535  break;
536  }
537  ++inc;
538  did += inc;
539  if (did > lastdid) {
540  if (out)
541  *out << "docid " << did << " > last docid " << lastdid
542  << endl;
543  ++errors;
544  }
545  }
546  if (bad) {
547  continue;
548  }
549  if (is_last_chunk) {
550  if (tf != termfreq) {
551  if (out)
552  *out << "termfreq " << termfreq << " != # of entries "
553  << tf << endl;
554  ++errors;
555  }
556  if (cf != collfreq) {
557  if (out)
558  *out << "collfreq " << collfreq << " != sum wdf " << cf
559  << endl;
560  ++errors;
561  }
562  if (did != lastdid) {
563  if (out)
564  *out << "lastdid " << lastdid << " != last did " << did
565  << endl;
566  ++errors;
567  }
568  current_term.resize(0);
569  }
570  }
571  if (!current_term.empty()) {
572  if (out)
573  *out << "Last term '" << current_term << "' has no last chunk"
574  << endl;
575  ++errors;
576  }
577 
578  Xapian::doccount doccount = version_file.get_doccount();
579  if (num_doclens != doccount) {
580  if (out)
581  *out << "Document length list has " << num_doclens
582  << " entries, should be " << doccount << endl;
583  ++errors;
584  }
585 
586  map<Xapian::valueno, VStats>::const_iterator i;
587  for (i = valuestats.begin(); i != valuestats.end(); ++i) {
588  if (i->second.freq != i->second.freq_real) {
589  if (out)
590  *out << "Value stats frequency for slot " << i->first
591  << " is " << i->second.freq << " but recounting "
592  "gives " << i->second.freq_real << endl;
593  ++errors;
594  }
595  }
596  } else if (strcmp(tablename, "docdata") == 0) {
597  // glass doesn't store a docdata entry if the document data is empty,
598  // so we can only check there aren't more docdata entries than
599  // documents.
600  Xapian::doccount doccount = version_file.get_doccount();
601  if (table->get_entry_count() > doccount) {
602  if (out)
603  *out << "More document data (" << table->get_entry_count()
604  << ") then documents (" << doccount << ")" << endl;
605  ++errors;
606  }
607 
608  // Now check the contents of the docdata table.
609  for ( ; !cursor->after_end(); cursor->next()) {
610  string & key = cursor->current_key;
611 
612  // Get docid from key.
613  const char * pos = key.data();
614  const char * end = pos + key.size();
615 
616  Xapian::docid did;
617  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
618  if (out)
619  *out << "Error unpacking docid from key" << endl;
620  ++errors;
621  continue;
622  }
623  if (pos != end) {
624  if (out)
625  *out << "Extra junk in key" << endl;
626  ++errors;
627  } else {
628  if (did > db_last_docid) {
629  if (out)
630  *out << "document id " << did << " in docdata table "
631  "is larger than get_last_docid() "
632  << db_last_docid << endl;
633  ++errors;
634  }
635  }
636 
637  // Fetch and decompress the document data to catch problems with
638  // the splitting into multiple items, corruption of the compressed
639  // data, etc.
640  cursor->read_tag();
641  if (cursor->current_tag.empty()) {
642  // We shouldn't store empty document data.
643  if (out)
644  *out << "Empty document data explicitly stored for "
645  "document id " << did << endl;
646  ++errors;
647  }
648  }
649  } else if (strcmp(tablename, "termlist") == 0) {
650  // Now check the contents of the termlist table.
651  Xapian::doccount num_termlists = 0;
652  Xapian::doccount num_slotsused_entries = 0;
653  for ( ; !cursor->after_end(); cursor->next()) {
654  string & key = cursor->current_key;
655 
656  // Get docid from key.
657  const char * pos = key.data();
658  const char * end = pos + key.size();
659 
660  Xapian::docid did;
661  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
662  if (out)
663  *out << "Error unpacking docid from key" << endl;
664  ++errors;
665  continue;
666  }
667 
668  if (did > db_last_docid) {
669  if (out)
670  *out << "document id " << did << " in termlist table "
671  "is larger than get_last_docid() "
672  << db_last_docid << endl;
673  ++errors;
674  }
675 
676  if (end - pos == 1 && *pos == '\0') {
677  // Value slots used entry.
678  ++num_slotsused_entries;
679  cursor->read_tag();
680 
681  pos = cursor->current_tag.data();
682  end = pos + cursor->current_tag.size();
683 
684  if (pos == end) {
685  if (out) {
686  *out << "document id " << did
687  << ": Empty value slots used tag\n";
688  }
689  ++errors;
690  continue;
691  }
692 
693  Xapian::valueno prev_slot;
694  if (!unpack_uint(&pos, end, &prev_slot)) {
695  if (out) {
696  *out << "document id " << did
697  << ": Value slot encoding corrupt\n";
698  }
699  ++errors;
700  continue;
701  }
702 
703  while (pos != end) {
704  Xapian::valueno slot;
705  if (!unpack_uint(&pos, end, &slot)) {
706  if (out) {
707  *out << "document id " << did
708  << ": Value slot encoding corrupt\n";
709  }
710  ++errors;
711  break;
712  }
713  slot += prev_slot + 1;
714  if (slot <= prev_slot) {
715  if (out) {
716  *out << "document id " << did
717  << ": Value slot number overflowed ("
718  << prev_slot << " -> " << slot << ")\n";
719  }
720  ++errors;
721  }
722  prev_slot = slot;
723  }
724  continue;
725  }
726 
727  if (pos != end) {
728  if (out) {
729  *out << "document id " << did << ": Extra junk in key\n";
730  }
731  ++errors;
732  continue;
733  }
734 
735  ++num_termlists;
736  cursor->read_tag();
737 
738  pos = cursor->current_tag.data();
739  end = pos + cursor->current_tag.size();
740 
741  if (pos == end) {
742  // Empty termlist.
743  continue;
744  }
745 
746  Xapian::termcount doclen, termlist_size;
747 
748  // Read doclen
749  if (!unpack_uint(&pos, end, &doclen)) {
750  if (out) {
751  *out << "document id " << did;
752  if (pos != 0) {
753  *out << ": doclen out of range\n";
754  } else {
755  *out << ": Unexpected end of data when reading "
756  "doclen\n";
757  }
758  }
759  ++errors;
760  continue;
761  }
762 
763  // Check doclen with doclen lower and upper bounds
764  if (doclen > version_file.get_doclength_upper_bound()) {
765  if (out) {
766  *out << "document id " << did
767  << ": doclen " << doclen << " > upper bound "
768  << version_file.get_doclength_upper_bound() << '\n';
769  }
770  ++errors;
771  } else if (doclen < version_file.get_doclength_lower_bound() &&
772  doclen != 0) {
773  if (out) {
774  *out << "document id " << did
775  << ": doclen " << doclen << " < lower bound "
776  << version_file.get_doclength_lower_bound() << '\n';
777  }
778  ++errors;
779  }
780 
781  // Read termlist_size
782  if (!unpack_uint(&pos, end, &termlist_size)) {
783  if (out) {
784  *out << "document id " << did;
785  if (pos != 0) {
786  *out << ": termlist_size out of range\n";
787  } else {
788  *out << ": Unexpected end of data when reading "
789  "termlist_size\n";
790  }
791  }
792  ++errors;
793  continue;
794  }
795 
796  Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
797  string current_tname;
798 
799  bool bad = false;
800  while (pos != end) {
801  Xapian::doccount current_wdf = 0;
802  bool got_wdf = false;
803  // If there was a previous term, how much to reuse.
804  if (!current_tname.empty()) {
805  string::size_type len = static_cast<unsigned char>(*pos++);
806  if (len > current_tname.length()) {
807  // The wdf was squeezed into the same byte.
808  current_wdf = len / (current_tname.length() + 1) - 1;
809  len %= (current_tname.length() + 1);
810  got_wdf = true;
811  }
812  current_tname.resize(len);
813  }
814  // What to append (note len must be positive, since just truncating
815  // always takes us backwards in the sort order)
816  string::size_type len = static_cast<unsigned char>(*pos++);
817  current_tname.append(pos, len);
818  pos += len;
819 
820  if (!got_wdf) {
821  // Read wdf
822  if (!unpack_uint(&pos, end, &current_wdf)) {
823  if (out) {
824  *out << "document id " << did;
825  if (pos == 0) {
826  *out << ": Unexpected end of data when reading "
827  "termlist current_wdf\n";
828  } else {
829  *out << ": Size of wdf out of range in "
830  "termlist\n";
831  }
832  }
833  ++errors;
834  bad = true;
835  break;
836  }
837  }
838 
839  ++actual_termlist_size;
840  actual_doclen += current_wdf;
841  }
842  if (bad) {
843  continue;
844  }
845 
846  if (termlist_size != actual_termlist_size) {
847  if (out) {
848  *out << "document id " << did << ": termlist_size "
849  << termlist_size << " != # of entries in termlist "
850  << actual_termlist_size << '\n';
851  }
852  ++errors;
853  }
854  if (doclen != actual_doclen) {
855  if (out) {
856  *out << "document id " << did << ": length " << doclen
857  << " != sum(wdf) " << actual_doclen << '\n';
858  }
859  ++errors;
860  }
861 
862  // + 1 so that did is a valid subscript.
863  if (doclens.size() <= did) doclens.resize(did + 1);
864  doclens[did] = actual_doclen;
865  }
866 
867  Xapian::doccount doccount = version_file.get_doccount();
868 
869  // glass doesn't store a termlist entry if there are no terms, so we
870  // can only check there aren't more termlists than documents.
871  if (num_termlists > doccount) {
872  if (out)
873  *out << "More termlists (" << num_termlists
874  << ") then documents (" << doccount << ")" << endl;
875  ++errors;
876  }
877 
878  // glass doesn't store a valueslots used entry if there are no terms,
879  // so we can only check there aren't more such entries than documents.
880  if (num_slotsused_entries > doccount) {
881  if (out)
882  *out << "More slots-used entries (" << num_slotsused_entries
883  << ") then documents (" << doccount << ")" << endl;
884  ++errors;
885  }
886  } else if (strcmp(tablename, "position") == 0) {
887  // Now check the contents of the position table.
888  for ( ; !cursor->after_end(); cursor->next()) {
889  string & key = cursor->current_key;
890 
891  // Get docid from key.
892  const char * pos = key.data();
893  const char * end = pos + key.size();
894 
895  string term;
896  if (!unpack_string_preserving_sort(&pos, end, term)) {
897  if (out)
898  *out << "Error unpacking term from key" << endl;
899  ++errors;
900  continue;
901  }
902 
903  Xapian::docid did;
904  if (!unpack_uint_preserving_sort(&pos, end, &did)) {
905  if (out)
906  *out << "Error unpacking docid from key" << endl;
907  ++errors;
908  continue;
909  }
910 
911  if (pos != end) {
912  if (out)
913  *out << "Extra junk in key with docid " << did << endl;
914  ++errors;
915  continue;
916  }
917 
918  if (did > db_last_docid) {
919  if (out)
920  *out << "document id " << did << " in position table "
921  "is larger than get_last_docid() "
922  << db_last_docid << endl;
923  ++errors;
924  } else if (!doclens.empty()) {
925  // In glass, a document without terms doesn't get a
926  // termlist entry, so we can't tell the difference
927  // easily.
928  if (did >= doclens.size() || doclens[did] == 0) {
929  if (out)
930  *out << "Position list entry for document " << did
931  << " which doesn't exist or has no terms" << endl;
932  ++errors;
933  }
934  }
935 
936  cursor->read_tag();
937 
938  const string & data = cursor->current_tag;
939  pos = data.data();
940  end = pos + data.size();
941 
942  Xapian::termpos pos_last;
943  if (!unpack_uint(&pos, end, &pos_last)) {
944  if (out)
945  *out << tablename << " table: Position list data corrupt"
946  << endl;
947  ++errors;
948  continue;
949  }
950  if (pos == end) {
951  // Special case for single entry position list.
952  } else {
953  // Skip the header we just read.
954  BitReader rd(pos, end);
955  Xapian::termpos pos_first = rd.decode(pos_last);
956  Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
957  rd.decode_interpolative(0, pos_size - 1, pos_first, pos_last);
959  bool ok = true;
960  while (p != pos_last) {
961  Xapian::termpos pos_prev = p;
963  if (p <= pos_prev) {
964  if (out)
965  *out << tablename << " table: Positions not "
966  "strictly monotonically increasing" << endl;
967  ++errors;
968  ok = false;
969  break;
970  }
971  }
972  if (ok && !rd.check_all_gone()) {
973  if (out)
974  *out << tablename << " table: Junk after position data"
975  << endl;
976  ++errors;
977  }
978  }
979  }
980  } else {
981  if (out)
982  *out << tablename << " table: Full structure check not "
983  "implemented, checking readability\n";
984  for ( ; !cursor->after_end(); cursor->next()) {
985  cursor->read_tag();
986  }
987  }
988 
989  if (out) {
990  if (!errors)
991  *out << tablename << " table structure checked OK\n";
992  else
993  *out << tablename << " table errors found: " << errors << "\n";
994  *out << endl;
995  }
996 
997  return errors;
998 }
999 
1000 #ifdef DISABLE_GPL_LIBXAPIAN
1001 # error GPL source we cannot relicense included in libxapian
1002 #endif
Classes to encode/decode a bitstream.
static GlassTableCheck * check(const char *tablename, std::string_view path, int fd, off_t offset_, const GlassVersion &version_file, int opts, std::ostream *out)
Definition: glass_check.cc:263
The GlassVersion class manages the revision files.
Definition: glass_version.h:96
Xapian::docid get_last_docid() const
Xapian::termcount get_doclength_lower_bound() const
Xapian::doccount get_doccount() const
Xapian::termcount get_doclength_upper_bound() const
Read a stream created by BitWriter.
Definition: bitstream.h:66
bool check_all_gone() const
Definition: bitstream.h:146
Xapian::termpos decode(Xapian::termpos outof, bool force=false)
Definition: bitstream.cc:178
void decode_interpolative(int j, int k, Xapian::termpos pos_j, Xapian::termpos pos_k)
Perform interpolative decoding between elements between j and k.
Definition: bitstream.cc:229
Xapian::termpos decode_interpolative_next()
Perform on-demand interpolative decoding.
Definition: bitstream.cc:239
string term
PositionList * p
Xapian::termpos pos
Utility functions for testing files.
bool file_exists(const char *path)
Test if a file exists.
Definition: filetests.h:40
Btree checking.
Interface to Btree cursors.
size_t check_glass_table(const char *tablename, string_view db_dir, int fd, off_t offset_, const GlassVersion &version_file, int opts, vector< Xapian::termcount > &doclens, ostream *out)
static bool is_user_metadata_key(const string &key)
Check a glass table.
Definitions, types, etc for use inside glass.
#define GLASS_TABLE_EXTENSION
Glass table extension.
Definition: glass_defs.h:27
Btree implementation.
GlassVersion class.
Types used internally.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
#define AssertEq(A, B)
Definition: omassert.h:124
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
bool unpack_string_preserving_sort(const char **p, const char *end, std::string &result)
Decode a "sort preserved" std::string from a string.
Definition: pack.h:551
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
bool unpack_bool(const char **p, const char *end, bool *result)
Decode a bool from a string.
Definition: pack.h:76
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
bool unpack_uint_preserving_sort(const char **p, const char *end, U *result)
Decode a "sort preserved" unsigned integer from a string.
Definition: pack.h:251
Xapian::doccount freq_real
Class to hold statistics for a given slot.
Definition: valuestats.h:28
std::string lower_bound
A lower bound on the values stored in the given value slot.
Definition: valuestats.h:36
std::string upper_bound
An upper bound on the values stored in the given value slot.
Definition: valuestats.h:40
Xapian::doccount freq
The number of documents which have a (non-empty) value stored in the slot.
Definition: valuestats.h:32
Statistics about values.
static const char * opts
Public interfaces for the Xapian library.