00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include <xapian.h>
00026
00027 #include <algorithm>
00028 #include <iostream>
00029 #include <vector>
00030
00031 #include "gnu_getopt.h"
00032
00033 #include <string.h>
00034 #include <stdlib.h>
00035 #include "safeerrno.h"
00036
00037 using namespace Xapian;
00038 using namespace std;
00039
// Text written in front of every list entry.  Starts out as a single space;
// the -1 option switches it to a newline so each entry gets its own line.
static char separator = ' ';

// Command-line switches, all off until option parsing turns them on.
static bool verbose = false;      // -v: print extra per-entry statistics
static bool showvalues = false;   // -V / -k: print document values
static bool showdocdata = false;  // -d: print document data

// Program identity used when building the --help and --version banners.
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"
00048
00049 static void show_usage() {
00050 cout << "Usage: "PROG_NAME" [OPTIONS] DATABASE...\n\n"
00051 "Options:\n"
00052 " -a show all terms in the database\n"
00053 " -r <recno> for term list(s)\n"
00054 " -t <term> for posting list(s)\n"
00055 " -t <term> -r <recno> for position list(s)\n"
00056 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
00057 " -1 output one list entry per line\n"
00058 " -V output values for each document referred to\n"
00059 " -V<valueno> output value valueno for each document in the database\n"
00060 " -d output document data for each document referred to\n"
00061 " -v extra info (wdf and len for postlist;\n"
00062 " wdf and termfreq for termlist; number of terms for db;\n"
00063 " termfreq when showing all terms)\n"
00064 " --help display this help and exit\n"
00065 " --version output version information and exit" << endl;
00066 }
00067
00068 static void
00069 show_db_stats(Database &db)
00070 {
00071
00072 cout << "number of documents = " << db.get_doccount() << endl;
00073 cout << "average document length = " << db.get_avlength() << endl;
00074 cout << "highest document id ever used = " << db.get_lastdocid() << endl;
00075
00076 if (verbose) {
00077
00078
00079
00080 termcount terms = 0;
00081 TermIterator t = db.allterms_begin();
00082 const TermIterator end = db.allterms_end();
00083 while (t != end) {
00084 ++terms;
00085 ++t;
00086 }
00087 cout << "number of distinct terms = " << terms << endl;
00088 }
00089 }
00090
00091 static void
00092 show_values(Database &db, docid docid, char sep)
00093 {
00094 Document doc = db.get_document(docid);
00095 ValueIterator v = doc.values_begin();
00096 ValueIterator vend = doc.values_end();
00097 while (v != vend) {
00098 cout << sep << v.get_valueno() << ':' << *v;
00099 ++v;
00100 }
00101 }
00102
00103 static void
00104 show_values(Database &db,
00105 vector<docid>::const_iterator i,
00106 vector<docid>::const_iterator end)
00107 {
00108 while (i != end) {
00109 cout << "Values for record #" << *i << ':';
00110 show_values(db, *i, separator);
00111 cout << endl;
00112 ++i;
00113 }
00114 }
00115
00116 static void
00117 show_docdata(Database &db, docid docid, char sep)
00118 {
00119 cout << sep << "[" << db.get_document(docid).get_data() << ']';
00120 }
00121
00122 static void
00123 show_docdata(Database &db,
00124 vector<docid>::const_iterator i,
00125 vector<docid>::const_iterator end)
00126 {
00127 while (i != end) {
00128 cout << "Data for record #" << *i << ':' << endl;
00129 cout << db.get_document(*i).get_data() << endl;
00130 ++i;
00131 }
00132 }
00133
00134 static void
00135 show_termlist(const Database &db, Xapian::docid did)
00136 {
00137 TermIterator t, tend;
00138 if (did == 0) {
00139 t = db.allterms_begin();
00140 tend = db.allterms_end();
00141 cout << "All terms in database:";
00142 } else {
00143 t = db.termlist_begin(did);
00144 tend = db.termlist_end(did);
00145 cout << "Term List for record #" << did << ':';
00146 }
00147
00148 while (t != tend) {
00149 cout << separator << *t;
00150 if (verbose) {
00151 if (did != 0)
00152 cout << ' ' << t.get_wdf();
00153 cout << ' ' << t.get_termfreq();
00154 }
00155 ++t;
00156 }
00157 cout << endl;
00158 }
00159
00160 static void
00161 show_termlists(Database &db,
00162 vector<docid>::const_iterator i,
00163 vector<docid>::const_iterator end)
00164 {
00165
00166 while (i != end) {
00167 show_termlist(db, *i);
00168 ++i;
00169 }
00170 }
00171
00172 static Stem stemmer;
00173
00174 int
00175 main(int argc, char **argv)
00176 {
00177 if (argc > 1 && argv[1][0] == '-') {
00178 if (strcmp(argv[1], "--help") == 0) {
00179 cout << PROG_NAME" - "PROG_DESC"\n\n";
00180 show_usage();
00181 exit(0);
00182 }
00183 if (strcmp(argv[1], "--version") == 0) {
00184 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00185 exit(0);
00186 }
00187 }
00188
00189 bool all_terms = false;
00190 vector<docid> recnos;
00191 vector<string> terms;
00192 vector<string> dbs;
00193
00194 valueno valno = 0;
00195 bool valno_set = false;
00196
00197 int c;
00198 while ((c = gnu_getopt(argc, argv, "ar:t:s:1vkV::d")) != -1) {
00199 switch (c) {
00200 case 'a':
00201 all_terms = true;
00202 break;
00203 case 'r': {
00204 char * end;
00205 errno = 0;
00206 unsigned long n = strtoul(optarg, &end, 10);
00207 if (optarg == end || *end) {
00208 cout << "Non-numeric document id: " << optarg << endl;
00209 exit(1);
00210 }
00211 Xapian::docid did(n);
00212 if (errno == ERANGE || n == 0 || did != n) {
00213 cout << "Document id out of range: " << optarg << endl;
00214 exit(1);
00215 }
00216 recnos.push_back(did);
00217 break;
00218 }
00219 case 't':
00220 terms.push_back(optarg);
00221 break;
00222 case 's':
00223 stemmer = Stem(optarg);
00224 break;
00225 case '1':
00226 separator = '\n';
00227 break;
00228 case 'V': case 'k':
00229 showvalues = true;
00230 if (optarg) {
00231 char * end;
00232 errno = 0;
00233 unsigned long n = strtoul(optarg, &end, 10);
00234 if (optarg == end || *end) {
00235 cout << "Non-numeric value slot: " << optarg << endl;
00236 exit(1);
00237 }
00238 valno = Xapian::valueno(n);
00239 if (errno == ERANGE || valno != n) {
00240 cout << "Value slot out of range: " << optarg << endl;
00241 exit(1);
00242 }
00243 valno_set = true;
00244 }
00245 break;
00246 case 'd':
00247 showdocdata = true;
00248 break;
00249 case 'v':
00250 verbose = true;
00251 break;
00252 default:
00253 show_usage();
00254 exit(1);
00255 }
00256 }
00257
00258 while (argv[optind]) dbs.push_back(argv[optind++]);
00259
00260 if (dbs.empty()) {
00261 show_usage();
00262 exit(1);
00263 }
00264
00265 std::sort(recnos.begin(), recnos.end());
00266
00267 Database db;
00268 {
00269 vector<string>::const_iterator i;
00270 for (i = dbs.begin(); i != dbs.end(); i++) {
00271 try {
00272 db.add_database(Database(*i));
00273 } catch (const Error &e) {
00274 cout << "Error opening database `" << *i << "': ";
00275 cout << e.get_description() << endl;
00276 return 1;
00277 }
00278 }
00279 }
00280
00281 try {
00282 if (!all_terms && terms.empty() && recnos.empty() && !valno_set) {
00283 show_db_stats(db);
00284 return 0;
00285 }
00286
00287 if (all_terms) {
00288 show_termlist(db, 0);
00289 }
00290
00291 if (!recnos.empty()) {
00292 if (showvalues) {
00293 show_values(db, recnos.begin(), recnos.end());
00294 }
00295
00296 if (showdocdata) {
00297 show_docdata(db, recnos.begin(), recnos.end());
00298 }
00299 }
00300
00301 if (valno_set) {
00302 doccount n = db.get_doccount();
00303 docid did = 0;
00304 docid hwm = db.get_lastdocid();
00305 cout << "Value " << valno << " for each document:";
00306 while (n && did != hwm) {
00307 try {
00308 Document doc = db.get_document(++did);
00309 string val = doc.get_value(valno);
00310 if (!val.empty())
00311 cout << separator << did << ':' << doc.get_value(valno);
00312 --n;
00313 } catch (DocNotFoundError &) {
00314 }
00315 }
00316 cout << endl;
00317 }
00318
00319 if (terms.empty()) {
00320 show_termlists(db, recnos.begin(), recnos.end());
00321 return 0;
00322 }
00323
00324 vector<string>::const_iterator i;
00325 for (i = terms.begin(); i != terms.end(); i++) {
00326 string term = stemmer(*i);
00327 PostingIterator p = db.postlist_begin(term);
00328 PostingIterator pend = db.postlist_end(term);
00329 if (p == pend) {
00330 cout << "term `" << term << "' not in database\n";
00331 continue;
00332 }
00333 if (recnos.empty()) {
00334
00335 cout << "Posting List for term `" << term << "' (termfreq "
00336 << db.get_termfreq(term) << ", collfreq "
00337 << db.get_collection_freq(term) << "):";
00338 while (p != pend) {
00339 cout << separator << *p;
00340 if (verbose) {
00341 cout << ' ' << p.get_wdf()
00342 << ' ' << p.get_doclength();
00343 }
00344 if (showvalues) show_values(db, *p, ' ');
00345 if (showdocdata) show_docdata(db, *p, ' ');
00346 p++;
00347 }
00348 cout << endl;
00349 } else {
00350
00351 vector<docid>::const_iterator j;
00352 for (j = recnos.begin(); j != recnos.end(); j++) {
00353 p.skip_to(*j);
00354 if (p == pend || *p != *j) {
00355 cout << "term `" << term <<
00356 "' doesn't index document #" << *j << endl;
00357 } else {
00358 cout << "Position List for term `" << term
00359 << "', record #" << *j << ':';
00360 try {
00361 PositionIterator pos = p.positionlist_begin();
00362 PositionIterator posend = p.positionlist_end();
00363 while (pos != posend) {
00364 cout << separator << *pos;
00365 ++pos;
00366 }
00367 cout << endl;
00368 } catch (const Error &e) {
00369 cout << "Error: " << e.get_description() << endl;
00370 }
00371 }
00372 }
00373 }
00374 }
00375 } catch (const Error &e) {
00376 cout << "\nError: " << e.get_description() << endl;
00377 return 1;
00378 }
00379 }