tests/dbcheck.cc

Go to the documentation of this file.
00001 /* dbcheck.cc: test database contents and consistency.
00002  *
00003  * Copyright 2009 Richard Boulton
00004  * Copyright 2010 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU General Public License as
00008  * published by the Free Software Foundation; either version 2 of the
00009  * License, or (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00019  * USA
00020  */
00021 
00022 #include <config.h>
00023 
00024 #include "dbcheck.h"
00025 
00026 #include "utils.h"
00027 #include "testsuite.h"
00028 
00029 using namespace std;
00030 
00031 string
00032 positions_to_string(Xapian::PositionIterator & it,
00033                     const Xapian::PositionIterator & end,
00034                     Xapian::termcount * count)
00035 {
00036     string result;
00037     bool need_comma = false;
00038     Xapian::termcount c = 0;
00039     while (it != end) {
00040         if (need_comma)
00041             result += ", ";
00042         result += om_tostring(*it);
00043         need_comma = true;
00044         ++it;
00045         ++c;
00046     }
00047     if (count) {
00048         *count = c;
00049     }
00050     return result;
00051 }
00052 
00053 string
00054 postlist_to_string(const Xapian::Database & db, const string & tname)
00055 {
00056     string result;
00057     bool need_comma = false;
00058 
00059     for (Xapian::PostingIterator p = db.postlist_begin(tname);
00060          p != db.postlist_end(tname);
00061          ++p) {
00062         if (need_comma)
00063             result += ", ";
00064 
00065         Xapian::PositionIterator it(p.positionlist_begin());
00066         string posrepr = positions_to_string(it, p.positionlist_end());
00067         if (!posrepr.empty()) {
00068             posrepr = ", pos=[" + posrepr + "]";
00069         }
00070 
00071         result += "(" + om_tostring(*p) +
00072                 ", doclen=" + om_tostring(p.get_doclength()) +
00073                 ", wdf=" + om_tostring(p.get_wdf()) +
00074                 posrepr + ")";
00075         need_comma = true;
00076     }
00077     return result;
00078 }
00079 
00080 string
00081 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
00082 {
00083     string result;
00084     bool need_comma = false;
00085 
00086     for (Xapian::TermIterator t = db.termlist_begin(did);
00087          t != db.termlist_end(did);
00088          ++t) {
00089         Xapian::PositionIterator it(t.positionlist_begin());
00090         string posrepr = positions_to_string(it, t.positionlist_end());
00091         if (!posrepr.empty()) {
00092             posrepr = ", pos=[" + posrepr + "]";
00093         }
00094         if (need_comma)
00095             result += ", ";
00096         result += "Term(" + *t + ", wdf=" + om_tostring(t.get_wdf()) + posrepr + ")";
00097         need_comma = true;
00098     }
00099     return result;
00100 }
00101 
00102 string
00103 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
00104 {
00105     string result;
00106 
00107     result += "len=" + om_tostring(db.get_doclength(did));
00108 
00109     return result;
00110 }
00111 
00112 string
00113 termstats_to_string(const Xapian::Database & db, const string & term)
00114 {
00115     string result;
00116 
00117     result += "tf=" + om_tostring(db.get_termfreq(term));
00118     result += ",cf=" + om_tostring(db.get_collection_freq(term));
00119 
00120     return result;
00121 }
00122 
00123 string
00124 dbstats_to_string(const Xapian::Database & db)
00125 {
00126     string result;
00127 
00128     result += "dc=" + om_tostring(db.get_doccount());
00129     result += ",al=" + om_tostring(db.get_avlength());
00130     result += ",ld=" + om_tostring(db.get_lastdocid());
00131 
00132     return result;
00133 }
00134 
00135 void
00136 dbcheck(const Xapian::Database & db,
00137         Xapian::doccount expected_doccount,
00138         Xapian::docid expected_lastdocid)
00139 {
00140     TEST_EQUAL(db.get_doccount(), expected_doccount);
00141     TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
00142 
00143     // Note - may not be a very big type, but we're only expecting to use this
00144     // for small databases, so should be fine.
00145     unsigned long totlen = 0;
00146 
00147     // A map from term to a representation of the posting list for that term.
00148     // We build this up from the documents, and then check it against the
00149     // equivalent built up from the posting lists.
00150     map<string, string> posting_reprs;
00151 
00152     for (Xapian::PostingIterator dociter = db.postlist_begin(string());
00153          dociter != db.postlist_end(string());
00154          ++dociter) {
00155         Xapian::docid did = *dociter;
00156         TEST_EQUAL(dociter.get_wdf(), 1);
00157         Xapian::Document doc(db.get_document(did));
00158         Xapian::termcount doclen(db.get_doclength(did));
00159         totlen += doclen;
00160 
00161         Xapian::termcount found_termcount = 0;
00162         Xapian::termcount wdf_sum = 0;
00163         Xapian::TermIterator t, t2;
00164         for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
00165              t != doc.termlist_end();
00166              ++t, ++t2) {
00167             TEST(t2 != db.termlist_end(did));
00168 
00169             ++found_termcount;
00170             wdf_sum += t.get_wdf();
00171 
00172             TEST_EQUAL(*t, *t2);
00173             TEST_EQUAL(t.get_wdf(), t2.get_wdf());
00174             TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
00175             TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
00176 
00177             // Check the position lists are equal.
00178             Xapian::termcount tc1, tc2;
00179             Xapian::PositionIterator it1(t.positionlist_begin());
00180             string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
00181             Xapian::PositionIterator it2(t2.positionlist_begin());
00182             string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
00183             TEST_EQUAL(posrepr, posrepr2);
00184             TEST_EQUAL(tc1, tc2);
00185             try {
00186                 TEST_EQUAL(tc1, t.positionlist_count());
00187             } catch (const Xapian::UnimplementedError &) {
00188                 // positionlist_count() isn't implemented for remote databases.
00189             }
00190 
00191             // Make a representation of the posting.
00192             if (!posrepr.empty()) {
00193                 posrepr = ",[" + posrepr + "]";
00194             }
00195             string posting_repr = "(" + om_tostring(did) + "," +
00196                     om_tostring(t.get_wdf()) + "/" + om_tostring(doclen) +
00197                     posrepr + ")";
00198 
00199             // Append the representation to the list for the term.
00200             map<string, string>::iterator i = posting_reprs.find(*t);
00201             if (i == posting_reprs.end()) {
00202                 posting_reprs[*t] = posting_repr;
00203             } else {
00204                 i->second += "," + posting_repr;
00205             }
00206         }
00207         TEST(t2 == db.termlist_end(did));
00208         Xapian::termcount expected_termcount = doc.termlist_count();
00209         TEST_EQUAL(expected_termcount, found_termcount);
00210         TEST_EQUAL(doclen, wdf_sum);
00211     }
00212 
00213     Xapian::TermIterator t;
00214     map<string, string>::const_iterator i;
00215     for (t = db.allterms_begin(), i = posting_reprs.begin();
00216          t != db.allterms_end();
00217          ++t, ++i) {
00218         TEST(db.term_exists(*t));
00219         TEST(i != posting_reprs.end());
00220         TEST_EQUAL(i->first, *t);
00221 
00222         Xapian::doccount tf_count = 0;
00223         Xapian::termcount cf_count = 0;
00224         string posting_repr;
00225         bool need_comma = false;
00226         for (Xapian::PostingIterator p = db.postlist_begin(*t);
00227              p != db.postlist_end(*t);
00228              ++p) {
00229             if (need_comma) {
00230                 posting_repr += ",";
00231             }
00232 
00233             ++tf_count;
00234             cf_count += p.get_wdf();
00235 
00236             Xapian::PositionIterator it(p.positionlist_begin());
00237             string posrepr = positions_to_string(it, p.positionlist_end());
00238             if (!posrepr.empty()) {
00239                 posrepr = ",[" + posrepr + "]";
00240             }
00241             posting_repr += "(" + om_tostring(*p) + "," +
00242                     om_tostring(p.get_wdf()) + "/" + om_tostring(p.get_doclength()) +
00243                     posrepr + ")";
00244             need_comma = true;
00245         }
00246 
00247         TEST_EQUAL(posting_repr, i->second);
00248         TEST_EQUAL(tf_count, t.get_termfreq());
00249         TEST_EQUAL(tf_count, db.get_termfreq(*t));
00250         TEST_EQUAL(cf_count, db.get_collection_freq(*t));
00251     }
00252     TEST(i == posting_reprs.end());
00253 
00254     if (expected_doccount == 0) {
00255         TEST_EQUAL(0, db.get_avlength());
00256     } else {
00257         TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
00258                           db.get_avlength());
00259     }
00260 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.