xapian-core  1.4.27
dbcheck.cc
Go to the documentation of this file.
1 
4 /* Copyright 2009 Richard Boulton
5  * Copyright 2010,2015 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "dbcheck.h"
26 
27 #include "str.h"
28 #include "testsuite.h"
29 
30 using namespace std;
31 
32 string
34  const Xapian::PositionIterator & end,
35  Xapian::termcount * count)
36 {
37  string result;
38  bool need_comma = false;
39  Xapian::termcount c = 0;
40  while (it != end) {
41  if (need_comma)
42  result += ", ";
43  result += str(*it);
44  need_comma = true;
45  ++it;
46  ++c;
47  }
48  if (count) {
49  *count = c;
50  }
51  return result;
52 }
53 
54 string
55 postlist_to_string(const Xapian::Database & db, const string & tname)
56 {
57  string result;
58  bool need_comma = false;
59 
60  for (Xapian::PostingIterator p = db.postlist_begin(tname);
61  p != db.postlist_end(tname);
62  ++p) {
63  if (need_comma)
64  result += ", ";
65 
66  Xapian::PositionIterator it(p.positionlist_begin());
67  string posrepr = positions_to_string(it, p.positionlist_end());
68  if (!posrepr.empty()) {
69  posrepr = ", pos=[" + posrepr + "]";
70  }
71 
72  result += "(" + str(*p) +
73  ", doclen=" + str(p.get_doclength()) +
74  ", wdf=" + str(p.get_wdf()) +
75  posrepr + ")";
76  need_comma = true;
77  }
78  return result;
79 }
80 
81 string
83 {
84  string result;
85  bool need_comma = false;
86 
87  for (Xapian::TermIterator t = db.termlist_begin(did);
88  t != db.termlist_end(did);
89  ++t) {
90  Xapian::PositionIterator it(t.positionlist_begin());
91  string posrepr = positions_to_string(it, t.positionlist_end());
92  if (!posrepr.empty()) {
93  posrepr = ", pos=[" + posrepr + "]";
94  }
95  if (need_comma)
96  result += ", ";
97  result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr;
98  result += ")";
99  need_comma = true;
100  }
101  return result;
102 }
103 
104 string
106 {
107  string result;
108 
109  result += "len=" + str(db.get_doclength(did));
110 
111  return result;
112 }
113 
114 string
115 termstats_to_string(const Xapian::Database & db, const string & term)
116 {
117  string result;
118 
119  result += "tf=" + str(db.get_termfreq(term));
120  result += ",cf=" + str(db.get_collection_freq(term));
121 
122  return result;
123 }
124 
125 void
127  Xapian::doccount expected_doccount,
128  Xapian::docid expected_lastdocid)
129 {
130  TEST_EQUAL(db.get_doccount(), expected_doccount);
131  TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
132 
133  // Note - may not be a very big type, but we're only expecting to use this
134  // for small databases, so should be fine.
135  unsigned long totlen = 0;
136 
137  // A map from term to a representation of the posting list for that term.
138  // We build this up from the documents, and then check it against the
139  // equivalent built up from the posting lists.
140  map<string, string> posting_reprs;
141  map<Xapian::valueno, string> value_reprs;
142 
143  Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
144  Xapian::termcount doclen_upper_bound = 0;
145 
146  for (Xapian::PostingIterator dociter = db.postlist_begin(string());
147  dociter != db.postlist_end(string());
148  ++dociter) {
149  Xapian::docid did = *dociter;
150  TEST_EQUAL(dociter.get_wdf(), 1);
151  Xapian::Document doc(db.get_document(did));
152  Xapian::termcount doclen(db.get_doclength(did));
153  Xapian::termcount unique_terms(db.get_unique_terms(did));
154  if (doclen < doclen_lower_bound)
155  doclen_lower_bound = doclen;
156  if (doclen > doclen_upper_bound)
157  doclen_upper_bound = doclen;
158  totlen += doclen;
159 
160  Xapian::termcount found_termcount = 0;
161  Xapian::termcount found_unique_terms = 0;
162  Xapian::termcount wdf_sum = 0;
163  Xapian::TermIterator t, t2;
164  for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
165  t != doc.termlist_end();
166  ++t, ++t2) {
167  TEST(t2 != db.termlist_end(did));
168 
169  ++found_termcount;
170  auto wdf = t.get_wdf();
171  if (wdf) ++found_unique_terms;
172  wdf_sum += wdf;
173 
174  TEST_EQUAL(*t, *t2);
175  TEST_EQUAL(t.get_wdf(), t2.get_wdf());
176  TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
177  TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
178 
179  // Check the position lists are equal.
180  Xapian::termcount tc1, tc2;
182  string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
184  string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
185  TEST_EQUAL(posrepr, posrepr2);
186  TEST_EQUAL(tc1, tc2);
187  try {
188  TEST_EQUAL(tc1, t.positionlist_count());
189  } catch (const Xapian::UnimplementedError &) {
190  // positionlist_count() isn't implemented for remote databases.
191  }
192 
193  // Make a representation of the posting.
194  if (!posrepr.empty()) {
195  posrepr = ",[" + posrepr + "]";
196  }
197  string posting_repr = "(" + str(did) + "," +
198  str(t.get_wdf()) + "/" + str(doclen) +
199  posrepr + ")";
200 
201  // Append the representation to the list for the term.
202  map<string, string>::iterator i = posting_reprs.find(*t);
203  if (i == posting_reprs.end()) {
204  posting_reprs[*t] = posting_repr;
205  } else {
206  i->second += "," + posting_repr;
207  }
208  }
209 
210  Xapian::termcount vcount = 0;
211  for (Xapian::ValueIterator v = doc.values_begin();
212  v != doc.values_end();
213  ++v, ++vcount) {
214  TEST((*v).size() != 0);
215  string value_repr = "(" + str(did) + "," + *v + ")";
216 
217  // Append the values to the value lists.
218  map<Xapian::valueno, string>::iterator i;
219  i = value_reprs.find(v.get_valueno());
220  if (i == value_reprs.end()) {
221  value_reprs[v.get_valueno()] = value_repr;
222  } else {
223  i->second += "," + value_repr;
224  }
225  }
226  TEST_EQUAL(vcount, doc.values_count());
227  TEST(t2 == db.termlist_end(did));
228  Xapian::termcount expected_termcount = doc.termlist_count();
229  TEST_EQUAL(expected_termcount, found_termcount);
230  // Ideally this would be equal, but currently we don't store the
231  // unique_terms values but calculate them, and scanning the termlist
232  // of each document would be slow, so instead get_unique_terms(did)
233  // returns min(doclen, termcount) at present.
234  TEST_REL(unique_terms, >=, found_unique_terms);
235  TEST_REL(unique_terms, <=, found_termcount);
236  TEST_REL(unique_terms, <=, doclen);
237  TEST_EQUAL(doclen, wdf_sum);
238  }
239 
240  TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
241  TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
242 
244  map<string, string>::const_iterator i;
245  for (t = db.allterms_begin(), i = posting_reprs.begin();
246  t != db.allterms_end();
247  ++t, ++i) {
248  TEST(db.term_exists(*t));
249  TEST(i != posting_reprs.end());
250  TEST_EQUAL(i->first, *t);
251 
252  Xapian::doccount tf_count = 0;
253  Xapian::termcount cf_count = 0;
254  Xapian::termcount wdf_upper_bound = 0;
255  string posting_repr;
256  bool need_comma = false;
257  for (Xapian::PostingIterator p = db.postlist_begin(*t);
258  p != db.postlist_end(*t);
259  ++p) {
260  if (need_comma) {
261  posting_repr += ",";
262  }
263 
264  ++tf_count;
265  cf_count += p.get_wdf();
266 
267  Xapian::PositionIterator it(p.positionlist_begin());
268  string posrepr = positions_to_string(it, p.positionlist_end());
269  if (!posrepr.empty()) {
270  posrepr = ",[" + posrepr + "]";
271  }
272  posting_repr += "(" + str(*p) + "," +
273  str(p.get_wdf()) + "/" +
274  str(p.get_doclength()) + posrepr + ")";
275  if (wdf_upper_bound < p.get_wdf())
276  wdf_upper_bound = p.get_wdf();
277  need_comma = true;
278  }
279 
280  TEST_EQUAL(posting_repr, i->second);
281  TEST_EQUAL(tf_count, t.get_termfreq());
282  TEST_EQUAL(tf_count, db.get_termfreq(*t));
283  TEST_EQUAL(cf_count, db.get_collection_freq(*t));
284  TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
285  }
286  TEST(i == posting_reprs.end());
287 
288  map<Xapian::valueno, string>::const_iterator j;
289  for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
290  string value_repr;
291  string value_lower_bound;
292  string value_upper_bound;
293  bool first = true;
294  for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
295  v != db.valuestream_end(j->first); ++v) {
296  if (first) {
297  value_lower_bound = *v;
298  value_upper_bound = *v;
299  first = false;
300  } else {
301  value_repr += ",";
302  if (*v > value_upper_bound) {
303  value_upper_bound = *v;
304  }
305  if (*v < value_lower_bound) {
306  value_lower_bound = *v;
307  }
308  }
309  value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
310  }
311  TEST_EQUAL(value_repr, j->second);
312  try {
313  TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
314  TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
315  } catch (const Xapian::UnimplementedError &) {
316  // Skip the checks if the methods to get the bounds aren't
317  // implemented for this backend.
318  }
319  }
320 
321  if (expected_doccount == 0) {
322  TEST_EQUAL(0, db.get_avlength());
323  } else {
324  TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
325  db.get_avlength());
326  }
327 }
Xapian::Document get_document(Xapian::docid did) const
Get a document from the database, given its document id.
Definition: omdatabase.cc:490
void dbcheck(const Xapian::Database &db, Xapian::doccount expected_doccount, Xapian::docid expected_lastdocid)
Check consistency of database and statistics.
Definition: dbcheck.cc:126
TermIterator termlist_begin(Xapian::docid did) const
An iterator pointing to the start of the termlist for a given document.
Definition: omdatabase.cc:198
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
This class is used to access a database, or a group of databases.
Definition: database.h:68
test database contents and consistency.
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: omdatabase.cc:401
TermIterator allterms_end(const std::string &=std::string()) const
Corresponding end iterator to allterms_begin(prefix).
Definition: database.h:269
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: omdatabase.cc:279
a generic test suite engine
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Class for iterating over document values.
Definition: valueiterator.h:40
STL namespace.
Convert types to std::string.
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
Definition: omdatabase.cc:386
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
string docstats_to_string(const Xapian::Database &db, Xapian::docid did)
Convert statistics about a document to a string.
Definition: dbcheck.cc:105
Xapian::doclength get_avlength() const
Get the average length of the documents in the database.
Definition: omdatabase.cc:293
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
#define TEST_REL(A, REL, B)
Test a relation holds, e.g. TEST_REL(a,>,b).
Definition: testmacros.h:32
Class for iterating over a list of terms.
PositionIterator positionlist_end() const
Return an end PositionIterator for the current term.
Definition: termiterator.h:110
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: omdatabase.cc:421
string docterms_to_string(const Xapian::Database &db, Xapian::docid did)
Convert the list of terms in a document to a string.
Definition: dbcheck.cc:82
string postlist_to_string(const Xapian::Database &db, const string &tname)
Convert the list of postings in a postlist to a string.
Definition: dbcheck.cc:55
ValueIterator valuestream_end(Xapian::valueno) const
Return end iterator corresponding to valuestream_begin().
Definition: database.h:363
string termstats_to_string(const Xapian::Database &db, const string &term)
Convert statistics about a term to a string.
Definition: dbcheck.cc:115
string str(int value)
Convert int to std::string.
Definition: str.cc:90
Xapian::termcount get_doclength(Xapian::docid did) const
Get the length of a document.
Definition: omdatabase.cc:461
Class for iterating over term positions.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
TermIterator allterms_begin(const std::string &prefix=std::string()) const
An iterator which runs across all terms with a given prefix.
Definition: omdatabase.cc:223
TermIterator termlist_end(Xapian::docid) const
Corresponding end iterator to termlist_begin().
Definition: database.h:240
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:295
ValueIterator valuestream_begin(Xapian::valueno slot) const
Return an iterator over the value in slot slot for each document.
Definition: omdatabase.cc:450
bool term_exists(const std::string &tname) const
Check if a given term exists in the database.
Definition: omdatabase.cc:524
string positions_to_string(Xapian::PositionIterator &it, const Xapian::PositionIterator &end, Xapian::termcount *count)
Convert the list of positions in a positionlist to a string.
Definition: dbcheck.cc:33
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
std::string get_value_lower_bound(Xapian::valueno slot) const
Get a lower bound on the values stored in the given value slot.
Definition: omdatabase.cc:368
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in document.
Definition: omdatabase.cc:476
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Definition: database.h:225
Xapian::doccount get_termfreq(const std::string &tname) const
Get the number of documents in the database indexed by a given term.
Definition: omdatabase.cc:323
PositionIterator positionlist_begin() const
Return a PositionIterator for the current term.
A handle representing a document in a Xapian database.
Definition: document.h:61
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Definition: omdatabase.cc:162
Xapian::termcount get_collection_freq(const std::string &tname) const
Return the total number of occurrences of the given term.
Definition: omdatabase.cc:339
Xapian::termcount get_wdf_upper_bound(const std::string &term) const
Get an upper bound on the wdf of term term.
Definition: omdatabase.cc:435