xapian-core  2.0.0
dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 2009 Richard Boulton
5  * Copyright 2010,2015 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "dbcheck.h"
25 
26 #include "str.h"
27 #include "testsuite.h"
28 
29 using namespace std;
30 
31 string
33  const Xapian::PositionIterator & end,
34  Xapian::termcount * count)
35 {
36  string result;
37  bool need_comma = false;
38  Xapian::termcount c = 0;
39  while (it != end) {
40  if (need_comma)
41  result += ", ";
42  result += str(*it);
43  need_comma = true;
44  ++it;
45  ++c;
46  }
47  if (count) {
48  *count = c;
49  }
50  return result;
51 }
52 
53 string
54 postlist_to_string(const Xapian::Database & db, const string & tname)
55 {
56  string result;
57  bool need_comma = false;
58 
59  for (Xapian::PostingIterator p = db.postlist_begin(tname);
60  p != db.postlist_end(tname);
61  ++p) {
62  if (need_comma)
63  result += ", ";
64 
65  Xapian::PositionIterator it(p.positionlist_begin());
66  string posrepr = positions_to_string(it, p.positionlist_end());
67  if (!posrepr.empty()) {
68  posrepr = ", pos=[" + posrepr + "]";
69  }
70 
71  result += "(" + str(*p) +
72  ", doclen=" + str(p.get_doclength()) +
73  ", wdf=" + str(p.get_wdf()) +
74  posrepr + ")";
75  need_comma = true;
76  }
77  return result;
78 }
79 
80 string
82 {
83  string result;
84  bool need_comma = false;
85 
86  for (Xapian::TermIterator t = db.termlist_begin(did);
87  t != db.termlist_end(did);
88  ++t) {
89  Xapian::PositionIterator it(t.positionlist_begin());
90  string posrepr = positions_to_string(it, t.positionlist_end());
91  if (!posrepr.empty()) {
92  posrepr = ", pos=[" + posrepr + "]";
93  }
94  if (need_comma)
95  result += ", ";
96  result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr;
97  result += ")";
98  need_comma = true;
99  }
100  return result;
101 }
102 
103 string
105 {
106  string result;
107 
108  result += "len=" + str(db.get_doclength(did));
109 
110  return result;
111 }
112 
113 string
114 termstats_to_string(const Xapian::Database & db, const string & term)
115 {
116  string result;
117 
118  result += "tf=" + str(db.get_termfreq(term));
119  result += ",cf=" + str(db.get_collection_freq(term));
120 
121  return result;
122 }
123 
124 void
126  Xapian::doccount expected_doccount,
127  Xapian::docid expected_lastdocid)
128 {
129  TEST_EQUAL(db.get_doccount(), expected_doccount);
130  TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
131 
132  // Note - may not be a very big type, but we're only expecting to use this
133  // for small databases, so should be fine.
134  unsigned long totlen = 0;
135 
136  // A map from term to a representation of the posting list for that term.
137  // We build this up from the documents, and then check it against the
138  // equivalent built up from the posting lists.
139  map<string, string> posting_reprs;
140  map<Xapian::valueno, string> value_reprs;
141 
142  Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
143  Xapian::termcount doclen_upper_bound = 0;
144 
145  for (Xapian::PostingIterator dociter = db.postlist_begin(string_view());
146  dociter != db.postlist_end(string_view());
147  ++dociter) {
148  Xapian::docid did = *dociter;
149  TEST_EQUAL(dociter.get_wdf(), 1);
150  Xapian::Document doc(db.get_document(did));
151  Xapian::termcount doclen(db.get_doclength(did));
152  Xapian::termcount unique_terms(db.get_unique_terms(did));
153  if (doclen < doclen_lower_bound)
154  doclen_lower_bound = doclen;
155  if (doclen > doclen_upper_bound)
156  doclen_upper_bound = doclen;
157  totlen += doclen;
158 
159  Xapian::termcount found_termcount = 0;
160  Xapian::termcount found_unique_terms = 0;
161  Xapian::termcount wdf_sum = 0;
162  Xapian::TermIterator t, t2;
163  for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
164  t != doc.termlist_end();
165  ++t, ++t2) {
166  TEST(t2 != db.termlist_end(did));
167 
168  ++found_termcount;
169  auto wdf = t.get_wdf();
170  if (wdf) ++found_unique_terms;
171  wdf_sum += wdf;
172 
173  TEST_EQUAL(*t, *t2);
174  TEST_EQUAL(t.get_wdf(), t2.get_wdf());
175  TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
176  TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
177 
178  // Check the position lists are equal.
179  Xapian::termcount tc1, tc2;
181  string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
183  string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
184  TEST_EQUAL(posrepr, posrepr2);
185  TEST_EQUAL(tc1, tc2);
186  TEST_EQUAL(tc1, t.positionlist_count());
187 
188  // Make a representation of the posting.
189  if (!posrepr.empty()) {
190  posrepr = ",[" + posrepr + "]";
191  }
192  string posting_repr = "(" + str(did) + "," +
193  str(t.get_wdf()) + "/" + str(doclen) +
194  posrepr + ")";
195 
196  // Append the representation to the list for the term.
197  map<string, string>::iterator i = posting_reprs.find(*t);
198  if (i == posting_reprs.end()) {
199  posting_reprs[*t] = posting_repr;
200  } else {
201  i->second += "," + posting_repr;
202  }
203  }
204 
205  Xapian::termcount vcount = 0;
206  for (Xapian::ValueIterator v = doc.values_begin();
207  v != doc.values_end();
208  ++v, ++vcount) {
209  TEST((*v).size() != 0);
210  string value_repr = "(" + str(did) + "," + *v + ")";
211 
212  // Append the values to the value lists.
213  map<Xapian::valueno, string>::iterator i;
214  i = value_reprs.find(v.get_valueno());
215  if (i == value_reprs.end()) {
216  value_reprs[v.get_valueno()] = value_repr;
217  } else {
218  i->second += "," + value_repr;
219  }
220  }
221  TEST_EQUAL(vcount, doc.values_count());
222  TEST(t2 == db.termlist_end(did));
223  Xapian::termcount expected_termcount = doc.termlist_count();
224  TEST_EQUAL(expected_termcount, found_termcount);
225  // Ideally this would be equal, but currently we don't store the
226  // unique_terms values but calculate them, and scanning the termlist
227  // of each document would be slow, so instead get_unique_terms(did)
228  // returns min(doclen, termcount) at present.
229  TEST_REL(unique_terms, >=, found_unique_terms);
230  TEST_REL(unique_terms, <=, found_termcount);
231  TEST_REL(unique_terms, <=, doclen);
232  TEST_EQUAL(doclen, wdf_sum);
233  }
234 
235  TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
236  TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
237 
239  map<string, string>::const_iterator i;
240  for (t = db.allterms_begin(), i = posting_reprs.begin();
241  t != db.allterms_end();
242  ++t, ++i) {
243  TEST(db.term_exists(*t));
244  TEST(i != posting_reprs.end());
245  TEST_EQUAL(i->first, *t);
246 
247  Xapian::doccount tf_count = 0;
248  Xapian::termcount cf_count = 0;
249  Xapian::termcount wdf_upper_bound = 0;
250  string posting_repr;
251  bool need_comma = false;
253  p != db.postlist_end(*t);
254  ++p) {
255  if (need_comma) {
256  posting_repr += ",";
257  }
258 
259  ++tf_count;
260  cf_count += p.get_wdf();
261 
262  Xapian::PositionIterator it(p.positionlist_begin());
263  string posrepr = positions_to_string(it, p.positionlist_end());
264  if (!posrepr.empty()) {
265  posrepr = ",[" + posrepr + "]";
266  }
267  posting_repr += "(" + str(*p) + "," +
268  str(p.get_wdf()) + "/" +
269  str(p.get_doclength()) + posrepr + ")";
270  if (wdf_upper_bound < p.get_wdf())
271  wdf_upper_bound = p.get_wdf();
272  need_comma = true;
273  }
274 
275  TEST_EQUAL(posting_repr, i->second);
276  TEST_EQUAL(tf_count, t.get_termfreq());
277  TEST_EQUAL(tf_count, db.get_termfreq(*t));
278  TEST_EQUAL(cf_count, db.get_collection_freq(*t));
279  TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
280  }
281  TEST(i == posting_reprs.end());
282 
283  map<Xapian::valueno, string>::const_iterator j;
284  for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
285  string value_repr;
286  string value_lower_bound;
287  string value_upper_bound;
288  bool first = true;
289  for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
290  v != db.valuestream_end(j->first); ++v) {
291  if (first) {
292  value_lower_bound = *v;
293  value_upper_bound = *v;
294  first = false;
295  } else {
296  value_repr += ",";
297  if (*v > value_upper_bound) {
298  value_upper_bound = *v;
299  }
300  if (*v < value_lower_bound) {
301  value_lower_bound = *v;
302  }
303  }
304  value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
305  }
306  TEST_EQUAL(value_repr, j->second);
307  try {
308  TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
309  TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
310  } catch (const Xapian::UnimplementedError &) {
311  // Skip the checks if the methods to get the bounds aren't
312  // implemented for this backend.
313  }
314  }
315 
316  if (expected_doccount == 0) {
317  TEST_EQUAL(0, db.get_avlength());
318  } else {
319  TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
320  db.get_avlength());
321  }
322 }
An indexed database of documents.
Definition: database.h:75
ValueIterator valuestream_begin(Xapian::valueno slot) const
Return an iterator over the value in slot slot for each document.
Definition: database.cc:335
Xapian::doccount get_termfreq(std::string_view term) const
Get the number of documents indexed by a specified term.
Definition: database.cc:262
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: database.cc:302
PostingIterator postlist_begin(std::string_view term) const
Start iterating the postings of a term.
Definition: database.cc:192
TermIterator termlist_begin(Xapian::docid did) const
Start iterating the terms in a document.
Definition: database.cc:200
double get_avlength() const
Old name for get_average_length() for backward compatibility.
Definition: database.h:322
Xapian::termcount get_wdf_upper_bound(std::string_view term) const
Get an upper bound on the wdf of term term.
Definition: database.cc:314
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
Definition: database.cc:296
Xapian::termcount get_doclength(Xapian::docid did) const
Get the length of a specified document.
Definition: database.cc:341
bool term_exists(std::string_view term) const
Test if a particular term is present in any document.
Definition: database.cc:378
std::string get_value_lower_bound(Xapian::valueno slot) const
Get a lower bound on the values stored in the given value slot.
Definition: database.cc:290
TermIterator allterms_end(std::string_view={}) const noexcept
End iterator corresponding to allterms_begin(prefix).
Definition: database.h:307
Xapian::termcount get_collection_freq(std::string_view term) const
Get the total number of occurrences of a specified term.
Definition: database.cc:273
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: database.cc:233
PostingIterator postlist_end(std::string_view) const noexcept
End iterator corresponding to postlist_begin().
Definition: database.h:258
TermIterator termlist_end(Xapian::docid) const noexcept
End iterator corresponding to termlist_begin().
Definition: database.h:271
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: database.cc:239
TermIterator allterms_begin(std::string_view prefix={}) const
Start iterating all terms in the database with a given prefix.
Definition: database.cc:209
ValueIterator valuestream_end(Xapian::valueno) const noexcept
Return end iterator corresponding to valuestream_begin().
Definition: database.h:421
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: database.cc:308
Xapian::Document get_document(Xapian::docid did, unsigned flags=0) const
Get a document from the database.
Definition: database.cc:368
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in a specified document.
Definition: database.cc:350
Class representing a document.
Definition: document.h:64
Xapian::valueno values_count() const
Count the value slots used in this document.
Definition: document.cc:203
ValueIterator values_begin() const
Start iterating the values in this document.
Definition: document.cc:208
TermIterator termlist_end() const noexcept
End iterator corresponding to termlist_begin().
Definition: document.h:219
Xapian::termcount termlist_count() const
Return the number of distinct terms in this document.
Definition: document.cc:174
TermIterator termlist_begin() const
Start iterating the terms in this document.
Definition: document.cc:179
ValueIterator values_end() const noexcept
End iterator corresponding to values_begin().
Definition: document.h:259
Class for iterating over term positions.
Class for iterating over a list of terms.
Class for iterating over a list of terms.
Definition: termiterator.h:41
PositionIterator positionlist_end() const noexcept
Return an end PositionIterator for the current term.
Definition: termiterator.h:109
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
PositionIterator positionlist_begin() const
Return a PositionIterator for the current term.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Class for iterating over document values.
Definition: valueiterator.h:39
string term
PositionList * p
test database contents and consistency.
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Convert types to std::string.
#define TEST_REL(A, REL, B)
Test that a relation holds, e.g. TEST_REL(a,>,b);
Definition: testmacros.h:35
string docterms_to_string(const Xapian::Database &db, Xapian::docid did)
Convert the list of terms in a document to a string.
Definition: dbcheck.cc:81
void dbcheck(const Xapian::Database &db, Xapian::doccount expected_doccount, Xapian::docid expected_lastdocid)
Check consistency of database and statistics.
Definition: dbcheck.cc:125
string postlist_to_string(const Xapian::Database &db, const string &tname)
Convert the list of postings in a postlist to a string.
Definition: dbcheck.cc:54
string termstats_to_string(const Xapian::Database &db, const string &term)
Convert statistics about a term to a string.
Definition: dbcheck.cc:114
string positions_to_string(Xapian::PositionIterator &it, const Xapian::PositionIterator &end, Xapian::termcount *count)
Convert the list of positions in a positionlist to a string.
Definition: dbcheck.cc:32
string docstats_to_string(const Xapian::Database &db, Xapian::docid did)
Convert statistics about a document to a string.
Definition: dbcheck.cc:104
a generic test suite engine
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:276
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:293
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:273