00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include <xapian.h>
00026
00027 #include <algorithm>
00028 #include <iomanip>
00029 #include <iostream>
00030 #include <vector>
00031
00032 #include "gnu_getopt.h"
00033
00034 #include <cstring>
00035 #include <cstdlib>
00036 #include "safeerrno.h"
00037
00038 using namespace Xapian;
00039 using namespace std;
00040
// Output switches shared by the whole tool.  main() sets them while parsing
// the command line; the show_* helpers only read them.
namespace {

/// Character written in front of every list entry: ' ' unless -1 selects '\n'.
char separator = ' ';

/// Set by -v: print the extra per-entry statistics.
bool verbose = false;

/// Set by -V without a slot number: print all values of each document.
bool showvalues = false;

/// Set by -d: print the document data of each document.
bool showdocdata = false;

/// Set by -z: count zero-length documents in the database summary.
bool count_zero_length_docs = false;

} // namespace
00047
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"

/** Print the command synopsis and the option summary to standard output.
 *
 *  PROG_NAME is separated from the neighbouring string literals by
 *  whitespace on purpose: since C++11 a string literal immediately followed
 *  by an identifier is lexed as a user-defined literal, so writing
 *  "Usage: "PROG_NAME no longer compiles.  The emitted text is unaffected.
 */
static void show_usage() {
    std::cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
"Options:\n"
" -a show all terms in the database\n"
" -r <recno> for term list(s)\n"
" -t <term> for posting list(s)\n"
" -t <term> -r <recno> for position list(s)\n"
" -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
" -1 output one list entry per line\n"
" -V output values for each document referred to\n"
" -V<valueno> output value valueno for each document referred to\n"
" (or each document in the database if no -r options)\n"
" -d output document data for each document referred to\n"
" -z for db, count documents with length 0\n"
" -v extra info (wdf and len for postlist;\n"
" wdf and termfreq for termlist; number of terms for db;\n"
" termfreq when showing all terms)\n"
" --help display this help and exit\n"
" --version output version information and exit" << std::endl;
}
00071
00072 static void
00073 show_db_stats(Database &db)
00074 {
00075
00076 cout << "UUID = " << db.get_uuid() << endl;
00077 cout << "number of documents = " << db.get_doccount() << endl;
00078 cout << "average document length = " << db.get_avlength() << endl;
00079 cout << "document length lower bound = " << db.get_doclength_lower_bound()
00080 << endl;
00081 cout << "document length upper bound = " << db.get_doclength_upper_bound()
00082 << endl;
00083 cout << "highest document id ever used = " << db.get_lastdocid() << endl;
00084 cout << boolalpha;
00085 cout << "has positional information = " << db.has_positions() << endl;
00086
00087 if (count_zero_length_docs) {
00088 Xapian::doccount empty_docs = 0;
00089 if (db.get_avlength() == 0) {
00090
00091 empty_docs = db.get_doccount();
00092 } else {
00093 Xapian::PostingIterator d = db.postlist_begin(string());
00094 while (d != db.postlist_end(string())) {
00095 if (d.get_doclength() == 0)
00096 ++empty_docs;
00097 ++d;
00098 }
00099 }
00100 cout << "number of zero-length documents = " << empty_docs << endl;
00101 }
00102
00103 if (verbose) {
00104
00105
00106
00107 termcount terms = 0;
00108 TermIterator t = db.allterms_begin();
00109 const TermIterator end = db.allterms_end();
00110 while (t != end) {
00111 ++terms;
00112 ++t;
00113 }
00114 cout << "number of distinct terms = " << terms << endl;
00115 }
00116 }
00117
00118 static void
00119 show_values(Database &db, docid docid, char sep)
00120 {
00121 Document doc = db.get_document(docid);
00122 ValueIterator v = doc.values_begin();
00123 while (v != doc.values_end()) {
00124 cout << sep << v.get_valueno() << ':' << *v;
00125 ++v;
00126 }
00127 }
00128
00129 static void
00130 show_values(Database &db,
00131 vector<docid>::const_iterator i,
00132 vector<docid>::const_iterator end)
00133 {
00134 while (i != end) {
00135 cout << "Values for record #" << *i << ':';
00136 show_values(db, *i, separator);
00137 cout << endl;
00138 ++i;
00139 }
00140 }
00141
00142 static void
00143 show_value(Database &db,
00144 vector<docid>::const_iterator i,
00145 vector<docid>::const_iterator end,
00146 Xapian::valueno slot)
00147 {
00148 while (i != end) {
00149 Xapian::docid did = *i;
00150 cout << "Value " << slot << " for record #" << did << ": "
00151 << db.get_document(did).get_value(slot) << endl;
00152 ++i;
00153 }
00154 }
00155
00156 static void
00157 show_docdata(Database &db, docid docid, char sep)
00158 {
00159 cout << sep << "[" << db.get_document(docid).get_data() << ']';
00160 }
00161
00162 static void
00163 show_docdata(Database &db,
00164 vector<docid>::const_iterator i,
00165 vector<docid>::const_iterator end)
00166 {
00167 while (i != end) {
00168 cout << "Data for record #" << *i << ':' << endl;
00169 cout << db.get_document(*i).get_data() << endl;
00170 ++i;
00171 }
00172 }
00173
00174 static void
00175 show_termlist(const Database &db, Xapian::docid did)
00176 {
00177 TermIterator t, tend;
00178 if (did == 0) {
00179 t = db.allterms_begin();
00180 tend = db.allterms_end();
00181 cout << "All terms in database:";
00182 } else {
00183 t = db.termlist_begin(did);
00184 tend = db.termlist_end(did);
00185 cout << "Term List for record #" << did << ':';
00186 }
00187
00188 while (t != tend) {
00189 cout << separator << *t;
00190 if (verbose) {
00191 if (did != 0)
00192 cout << ' ' << t.get_wdf();
00193 cout << ' ' << t.get_termfreq();
00194 }
00195 ++t;
00196 }
00197 cout << endl;
00198 }
00199
00200 static void
00201 show_termlists(Database &db,
00202 vector<docid>::const_iterator i,
00203 vector<docid>::const_iterator end)
00204 {
00205
00206 while (i != end) {
00207 show_termlist(db, *i);
00208 ++i;
00209 }
00210 }
00211
00212 static Stem stemmer;
00213
00214 int
00215 main(int argc, char **argv) try {
00216 if (argc > 1 && argv[1][0] == '-') {
00217 if (strcmp(argv[1], "--help") == 0) {
00218 cout << PROG_NAME" - "PROG_DESC"\n\n";
00219 show_usage();
00220 exit(0);
00221 }
00222 if (strcmp(argv[1], "--version") == 0) {
00223 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00224 exit(0);
00225 }
00226 }
00227
00228 bool all_terms = false;
00229 vector<docid> recnos;
00230 vector<string> terms;
00231 vector<string> dbs;
00232
00233 valueno slot = 0;
00234 bool slot_set = false;
00235
00236 int c;
00237 while ((c = gnu_getopt(argc, argv, "ar:t:s:1vV::dz")) != -1) {
00238 switch (c) {
00239 case 'a':
00240 all_terms = true;
00241 break;
00242 case 'r': {
00243 char * end;
00244 errno = 0;
00245 unsigned long n = strtoul(optarg, &end, 10);
00246 if (optarg == end || *end) {
00247 cout << "Non-numeric document id: " << optarg << endl;
00248 exit(1);
00249 }
00250 Xapian::docid did(n);
00251 if (errno == ERANGE || n == 0 || did != n) {
00252 cout << "Document id out of range: " << optarg << endl;
00253 exit(1);
00254 }
00255 recnos.push_back(did);
00256 break;
00257 }
00258 case 't':
00259 terms.push_back(optarg);
00260 break;
00261 case 's':
00262 stemmer = Stem(optarg);
00263 break;
00264 case '1':
00265 separator = '\n';
00266 break;
00267 case 'V':
00268 if (optarg) {
00269 char * end;
00270 errno = 0;
00271 unsigned long n = strtoul(optarg, &end, 10);
00272 if (optarg == end || *end) {
00273 cout << "Non-numeric value slot: " << optarg << endl;
00274 exit(1);
00275 }
00276 slot = Xapian::valueno(n);
00277 if (errno == ERANGE || slot != n) {
00278 cout << "Value slot out of range: " << optarg << endl;
00279 exit(1);
00280 }
00281 slot_set = true;
00282 } else {
00283 showvalues = true;
00284 }
00285 break;
00286 case 'd':
00287 showdocdata = true;
00288 break;
00289 case 'v':
00290 verbose = true;
00291 break;
00292 case 'z':
00293 count_zero_length_docs = true;
00294 break;
00295 default:
00296 show_usage();
00297 exit(1);
00298 }
00299 }
00300
00301 while (argv[optind]) dbs.push_back(argv[optind++]);
00302
00303 if (dbs.empty()) {
00304 show_usage();
00305 exit(1);
00306 }
00307
00308 std::sort(recnos.begin(), recnos.end());
00309
00310 Database db;
00311 {
00312 vector<string>::const_iterator i;
00313 for (i = dbs.begin(); i != dbs.end(); i++) {
00314 try {
00315 db.add_database(Database(*i));
00316 } catch (const Error &e) {
00317 cout << "Error opening database `" << *i << "': ";
00318 cout << e.get_description() << endl;
00319 return 1;
00320 }
00321 }
00322 }
00323
00324 if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
00325
00326 show_db_stats(db);
00327 return 0;
00328 }
00329
00330 if (all_terms) {
00331 show_termlist(db, 0);
00332 }
00333
00334 if (!recnos.empty()) {
00335 if (showvalues) {
00336 show_values(db, recnos.begin(), recnos.end());
00337 } else if (slot_set) {
00338 show_value(db, recnos.begin(), recnos.end(), slot);
00339 }
00340
00341 if (showdocdata) {
00342 show_docdata(db, recnos.begin(), recnos.end());
00343 }
00344 } else {
00345 if (slot_set) {
00346 cout << "Value " << slot << " for each document:";
00347 ValueIterator it = db.valuestream_begin(slot);
00348 while (it != db.valuestream_end(slot)) {
00349 cout << separator << it.get_docid() << ':' << *it;
00350 ++it;
00351 }
00352 cout << endl;
00353 }
00354 }
00355
00356 if (terms.empty()) {
00357 show_termlists(db, recnos.begin(), recnos.end());
00358 return 0;
00359 }
00360
00361 vector<string>::const_iterator i;
00362 for (i = terms.begin(); i != terms.end(); i++) {
00363 string term = stemmer(*i);
00364 PostingIterator p = db.postlist_begin(term);
00365 PostingIterator pend = db.postlist_end(term);
00366 if (p == pend) {
00367 cout << "term `" << term << "' not in database\n";
00368 continue;
00369 }
00370 if (recnos.empty()) {
00371
00372 cout << "Posting List for term `" << term << "' (termfreq "
00373 << db.get_termfreq(term) << ", collfreq "
00374 << db.get_collection_freq(term) << ", wdf_max "
00375 << db.get_wdf_upper_bound(term) << "):";
00376 while (p != pend) {
00377 cout << separator << *p;
00378 if (verbose) {
00379 cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
00380 }
00381 if (showvalues) show_values(db, *p, ' ');
00382 if (showdocdata) show_docdata(db, *p, ' ');
00383 p++;
00384 }
00385 cout << endl;
00386 } else {
00387
00388 vector<docid>::const_iterator j;
00389 for (j = recnos.begin(); j != recnos.end(); j++) {
00390 p.skip_to(*j);
00391 if (p == pend || *p != *j) {
00392 cout << "term `" << term <<
00393 "' doesn't index document #" << *j << endl;
00394 } else {
00395 cout << "Position List for term `" << term
00396 << "', record #" << *j << ':';
00397 try {
00398 PositionIterator pos = p.positionlist_begin();
00399 PositionIterator posend = p.positionlist_end();
00400 while (pos != posend) {
00401 cout << separator << *pos;
00402 ++pos;
00403 }
00404 cout << endl;
00405 } catch (const Error &e) {
00406 cout << "Error: " << e.get_description() << endl;
00407 }
00408 }
00409 }
00410 }
00411 }
00412 } catch (const Error &e) {
00413 cout << "\nError: " << e.get_description() << endl;
00414 return 1;
00415 }