examples/delve.cc

Go to the documentation of this file.
00001 /* delve.cc: Allow inspection of the contents of a Xapian database
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2006,2007,2008,2010 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <xapian.h>
00026 
00027 #include <algorithm>
00028 #include <iostream>
00029 #include <vector>
00030 
00031 #include "gnu_getopt.h"
00032 
00033 #include <string.h>
00034 #include <stdlib.h>
00035 #include "safeerrno.h"
00036 
00037 using namespace Xapian;
00038 using namespace std;
00039 
// Character written in front of every entry of a displayed list.  Starts as
// a space; the -1 option switches it to a newline.
static char separator = ' ';

// Output switches set from the command line:
//   verbose      (-v)  print extra per-entry statistics
//   showvalues   (-V)  print document values
//   showdocdata  (-d)  print document data
static bool verbose = false;
static bool showvalues = false;
static bool showdocdata = false;

// Program name and one-line description used in the usage/version output.
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"
00048 
00049 static void show_usage() {
00050     cout << "Usage: "PROG_NAME" [OPTIONS] DATABASE...\n\n"
00051 "Options:\n"
00052 "  -a                    show all terms in the database\n"
00053 "  -r <recno>            for term list(s)\n"
00054 "  -t <term>             for posting list(s)\n"
00055 "  -t <term> -r <recno>  for position list(s)\n"
00056 "  -s, --stemmer=LANG    set the stemming language, the default is 'none'\n"
00057 "  -1                    output one list entry per line\n"
00058 "  -V                    output values for each document referred to\n"
00059 "  -V<valueno>           output value valueno for each document in the database\n"
00060 "  -d                    output document data for each document referred to\n"
00061 "  -v                    extra info (wdf and len for postlist;\n"
00062 "                        wdf and termfreq for termlist; number of terms for db;\n"
00063 "                        termfreq when showing all terms)\n"
00064 "      --help            display this help and exit\n"
00065 "      --version         output version information and exit" << endl;
00066 }
00067 
00068 static void
00069 show_db_stats(Database &db)
00070 {
00071     // Display a few database stats.
00072     cout << "number of documents = " << db.get_doccount() << endl;
00073     cout << "average document length = " << db.get_avlength() << endl;
00074     cout << "highest document id ever used = " << db.get_lastdocid() << endl;
00075 
00076     if (verbose) {
00077         // To find the number of terms, we have to count them!
00078         // This will take a few seconds or minutes, so only do it if -v
00079         // was specified.
00080         termcount terms = 0;
00081         TermIterator t = db.allterms_begin();
00082         const TermIterator end = db.allterms_end();
00083         while (t != end) {
00084             ++terms;
00085             ++t;
00086         }
00087         cout << "number of distinct terms = " << terms << endl;
00088     }
00089 }
00090 
00091 static void
00092 show_values(Database &db, docid docid, char sep)
00093 {
00094     Document doc = db.get_document(docid);
00095     ValueIterator v = doc.values_begin();
00096     ValueIterator vend = doc.values_end();
00097     while (v != vend) {
00098         cout << sep << v.get_valueno() << ':' << *v;
00099         ++v;
00100     }
00101 }
00102 
00103 static void
00104 show_values(Database &db,
00105             vector<docid>::const_iterator i,
00106             vector<docid>::const_iterator end)
00107 {
00108     while (i != end) {
00109         cout << "Values for record #" << *i << ':';
00110         show_values(db, *i, separator);
00111         cout << endl;
00112         ++i;
00113     }
00114 }
00115 
00116 static void
00117 show_docdata(Database &db, docid docid, char sep)
00118 {
00119     cout << sep << "[" << db.get_document(docid).get_data() << ']';
00120 }
00121 
00122 static void
00123 show_docdata(Database &db,
00124              vector<docid>::const_iterator i,
00125              vector<docid>::const_iterator end)
00126 {
00127     while (i != end) {
00128         cout << "Data for record #" << *i << ':' << endl;
00129         cout << db.get_document(*i).get_data() << endl;
00130         ++i;
00131     }
00132 }
00133 
00134 static void
00135 show_termlist(const Database &db, Xapian::docid did)
00136 {
00137     TermIterator t, tend;
00138     if (did == 0) {
00139         t = db.allterms_begin();
00140         tend = db.allterms_end();
00141         cout << "All terms in database:";
00142     } else {
00143         t = db.termlist_begin(did);
00144         tend = db.termlist_end(did);
00145         cout << "Term List for record #" << did << ':';
00146     }
00147 
00148     while (t != tend) {
00149         cout << separator << *t;
00150         if (verbose) {
00151             if (did != 0)
00152                 cout << ' ' << t.get_wdf();
00153             cout << ' ' << t.get_termfreq();
00154         }
00155         ++t;
00156     }
00157     cout << endl;
00158 }
00159 
00160 static void
00161 show_termlists(Database &db,
00162                vector<docid>::const_iterator i,
00163                vector<docid>::const_iterator end)
00164 {
00165     // Display termlists
00166     while (i != end) {
00167         show_termlist(db, *i);
00168         ++i;
00169     }
00170 }
00171 
00172 static Stem stemmer;
00173 
00174 int
00175 main(int argc, char **argv)
00176 {
00177     if (argc > 1 && argv[1][0] == '-') {
00178         if (strcmp(argv[1], "--help") == 0) {
00179             cout << PROG_NAME" - "PROG_DESC"\n\n";
00180             show_usage();
00181             exit(0);
00182         }
00183         if (strcmp(argv[1], "--version") == 0) {
00184             cout << PROG_NAME" - "PACKAGE_STRING << endl;
00185             exit(0);
00186         }
00187     }
00188 
00189     bool all_terms = false;
00190     vector<docid> recnos;
00191     vector<string> terms;
00192     vector<string> dbs;
00193 
00194     valueno valno = 0; // Avoid "may be used uninitialised" warnings.
00195     bool valno_set = false;
00196 
00197     int c;
00198     while ((c = gnu_getopt(argc, argv, "ar:t:s:1vkV::d")) != -1) {
00199         switch (c) {
00200             case 'a':
00201                 all_terms = true;
00202                 break;
00203             case 'r': {
00204                 char * end;
00205                 errno = 0;
00206                 unsigned long n = strtoul(optarg, &end, 10);
00207                 if (optarg == end || *end) {
00208                     cout << "Non-numeric document id: " << optarg << endl;
00209                     exit(1);
00210                 }
00211                 Xapian::docid did(n);
00212                 if (errno == ERANGE || n == 0 || did != n) {
00213                     cout << "Document id out of range: " << optarg << endl;
00214                     exit(1);
00215                 }
00216                 recnos.push_back(did);
00217                 break;
00218             }
00219             case 't':
00220                 terms.push_back(optarg);
00221                 break;
00222             case 's':
00223                 stemmer = Stem(optarg);
00224                 break;
00225             case '1':
00226                 separator = '\n';
00227                 break;
00228             case 'V': case 'k': /* -k for backward compatibility */
00229                 showvalues = true;
00230                 if (optarg) {
00231                     char * end;
00232                     errno = 0;
00233                     unsigned long n = strtoul(optarg, &end, 10);
00234                     if (optarg == end || *end) {
00235                         cout << "Non-numeric value slot: " << optarg << endl;
00236                         exit(1);
00237                     }
00238                     valno = Xapian::valueno(n);
00239                     if (errno == ERANGE || valno != n) {
00240                         cout << "Value slot out of range: " << optarg << endl;
00241                         exit(1);
00242                     }
00243                     valno_set = true;
00244                 }
00245                 break;
00246             case 'd':
00247                 showdocdata = true;
00248                 break;
00249             case 'v':
00250                 verbose = true;
00251                 break;
00252             default:
00253                 show_usage();
00254                 exit(1);
00255         }
00256     }
00257 
00258     while (argv[optind]) dbs.push_back(argv[optind++]);
00259 
00260     if (dbs.empty()) {
00261         show_usage();
00262         exit(1);
00263     }
00264 
00265     std::sort(recnos.begin(), recnos.end());
00266 
00267     Database db;
00268     {
00269         vector<string>::const_iterator i;
00270         for (i = dbs.begin(); i != dbs.end(); i++) {
00271             try {
00272                 db.add_database(Database(*i));
00273             } catch (const Error &e) {
00274                 cout << "Error opening database `" << *i << "': ";
00275                 cout << e.get_description() << endl;
00276                 return 1;
00277             }
00278         }
00279     }
00280 
00281     try {
00282         if (!all_terms && terms.empty() && recnos.empty() && !valno_set) {
00283             show_db_stats(db);
00284             return 0;
00285         }
00286 
00287         if (all_terms) {
00288             show_termlist(db, 0);
00289         }
00290 
00291         if (!recnos.empty()) {
00292             if (showvalues) {
00293                 show_values(db, recnos.begin(), recnos.end());
00294             }
00295 
00296             if (showdocdata) {
00297                 show_docdata(db, recnos.begin(), recnos.end());
00298             }
00299         }
00300 
00301         if (valno_set) {
00302             doccount n = db.get_doccount();
00303             docid did = 0;
00304             docid hwm = db.get_lastdocid();
00305             cout << "Value " << valno << " for each document:";
00306             while (n && did != hwm) {
00307                 try {
00308                     Document doc = db.get_document(++did);
00309                     string val = doc.get_value(valno);
00310                     if (!val.empty())
00311                         cout << separator << did << ':' << doc.get_value(valno);
00312                     --n;
00313                 } catch (DocNotFoundError &) {
00314                 }
00315             }
00316             cout << endl;
00317         }
00318 
00319         if (terms.empty()) {
00320             show_termlists(db, recnos.begin(), recnos.end());
00321             return 0;
00322         }
00323 
00324         vector<string>::const_iterator i;
00325         for (i = terms.begin(); i != terms.end(); i++) {
00326             string term = stemmer(*i);
00327             PostingIterator p = db.postlist_begin(term);
00328             PostingIterator pend = db.postlist_end(term);
00329             if (p == pend) {
00330                 cout << "term `" << term << "' not in database\n";
00331                 continue;
00332             }
00333             if (recnos.empty()) {
00334                 // Display posting list
00335                 cout << "Posting List for term `" << term << "' (termfreq "
00336                      << db.get_termfreq(term) << ", collfreq "
00337                      << db.get_collection_freq(term) << "):";
00338                 while (p != pend) {
00339                     cout << separator << *p;
00340                     if (verbose) {
00341                         cout << ' ' << p.get_wdf()
00342                             << ' ' << p.get_doclength();
00343                     }
00344                     if (showvalues) show_values(db, *p, ' ');
00345                     if (showdocdata) show_docdata(db, *p, ' ');
00346                     p++;
00347                 }
00348                 cout << endl;
00349             } else {
00350                 // Display position lists
00351                 vector<docid>::const_iterator j;
00352                 for (j = recnos.begin(); j != recnos.end(); j++) {
00353                     p.skip_to(*j);
00354                     if (p == pend || *p != *j) {
00355                         cout << "term `" << term <<
00356                             "' doesn't index document #" << *j << endl;
00357                     } else {
00358                         cout << "Position List for term `" << term
00359                             << "', record #" << *j << ':';
00360                         try {
00361                             PositionIterator pos = p.positionlist_begin();
00362                             PositionIterator posend = p.positionlist_end();
00363                             while (pos != posend) {
00364                                 cout << separator << *pos;
00365                                 ++pos;
00366                             }
00367                             cout << endl;
00368                         } catch (const Error &e) {
00369                             cout << "Error: " << e.get_description() << endl;
00370                         }
00371                     }
00372                 }
00373             }
00374         }
00375     } catch (const Error &e) {
00376         cout << "\nError: " << e.get_description() << endl;
00377         return 1;
00378     }
00379 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.