xapian-core  1.4.19
xapian-delve.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2012,2013,2014,2016,2017,2018 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <xapian.h>
27 
28 #include <algorithm>
29 #include <ios>
30 #include <iostream>
31 #include <vector>
32 
33 #include "gnu_getopt.h"
34 
35 #include <cerrno>
36 #include <cstring>
37 #include <cstdlib>
39 
41 
42 using namespace Xapian;
43 using namespace std;
44 
// Character written before each list entry; the -1 option switches it to a
// newline so that each entry appears on its own line.
45 static char separator = ' ';
46 
// Verbosity level: incremented once for every -v on the command line.
47 static int verbose = 0;
// Set by -V given without a slot number: show all values of each document.
48 static bool showvalues = false;
// Set by -d: show the document data of each document referred to.
49 static bool showdocdata = false;
// Set by -z: the database statistics also count documents of length 0.
50 static bool count_zero_length_docs = false;
51 
52 // How to decode document values.
53 static enum {
59 
// Program name and one-line description used in the --help, --version and
// usage output.
60 #define PROG_NAME "delve"
61 #define PROG_DESC "Inspect the contents of a Xapian database"
62 
// Write the command-line synopsis and the list of supported options to
// stdout.  The text is a single concatenated string literal; the trailing
// endl also flushes the stream.
// NOTE(review): the column alignment inside these literals appears to have
// been collapsed by the listing extraction - compare against the original
// file before relying on the exact spacing.
63 static void show_usage() {
64  cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
65 "Options:\n"
66 " -a show all terms in the database\n"
67 " -A <prefix> show all terms in the database with given prefix\n"
68 " -r <recno> for term list(s)\n"
69 " -t <term> for posting list(s)\n"
70 " -t <term> -r <recno> for position list(s)\n"
71 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
72 " -1 output one list entry per line\n"
73 " -V[<type>]<valueno> output value valueno for each document referred to\n"
74 " (or each document in the database if no -r options).\n"
75 " <type> can be:\n"
76 " E: escape in a C-like way (default)\n"
77 " I: decode as a packed integer\n"
78 " R: show the raw value (which may contain binary data,\n"
79 " newlines, invalid UTF-8, etc)\n"
80 " S: decode using Xapian::sortable_unserialise()\n"
81 " -V[<type>] output all values for each document referred to.\n"
82 " <type> is as above.\n"
83 " -d output document data for each document referred to\n"
84 " -z for db, count documents with length 0\n"
85 " -v extra info (wdf and len for postlist;\n"
86 " wdf and termfreq for termlist; number of terms for db;\n"
87 " termfreq when showing all terms)\n"
88 " -vv even more info (also show collection freq and wdf\n"
89 " upper bound for terms)\n"
90 " --help display this help and exit\n"
91 " --version output version information and exit" << endl;
92 }
93 
94 static void
96 {
97  // Display a few database stats.
98  cout << "UUID = " << db.get_uuid() << endl;
99  cout << "number of documents = " << db.get_doccount() << endl;
100  cout << "average document length = " << db.get_avlength() << endl;
101  cout << "document length lower bound = " << db.get_doclength_lower_bound()
102  << endl;
103  cout << "document length upper bound = " << db.get_doclength_upper_bound()
104  << endl;
105  cout << "highest document id ever used = " << db.get_lastdocid() << endl;
106  cout << boolalpha;
107  cout << "has positional information = " << db.has_positions() << endl;
108  cout << "revision = ";
109  if (db.size() > 1) {
110  cout << "N/A (sharded DB)\n";
111  } else {
112  try {
113  cout << db.get_revision() << endl;
114  } catch (const Xapian::InvalidOperationError& e) {
115  cout << e.get_description() << endl;
116  } catch (const Xapian::UnimplementedError& e) {
117  cout << "N/A (" << e.get_msg() << ")\n";
118  }
119  }
120  cout << "currently open for writing = ";
121  try {
122  cout << db.locked() << endl;
123  } catch (const Xapian::Error& e) {
124  cout << e.get_description() << endl;
125  }
126 
128  Xapian::doccount empty_docs = 0;
129  if (db.get_total_length() == 0) {
130  // All documents are empty.
131  empty_docs = db.get_doccount();
132  } else {
133  Xapian::PostingIterator d = db.postlist_begin(string());
134  while (d != db.postlist_end(string())) {
135  if (d.get_doclength() == 0)
136  ++empty_docs;
137  ++d;
138  }
139  }
140  cout << "number of zero-length documents = " << empty_docs << endl;
141  }
142 
143  if (verbose) {
144  // To find the number of terms, we have to count them!
145  // This will take a few seconds or minutes, so only do it if -v
146  // was specified.
147  termcount terms = 0;
148  TermIterator t = db.allterms_begin();
149  while (t != db.allterms_end()) {
150  ++terms;
151  ++t;
152  }
153  cout << "number of distinct terms = " << terms << endl;
154  }
155 }
156 
// Write a single document value to stdout, decoded according to the
// file-scope value_decode setting:
//  - VALUE_ESCAPE: escaped via description_append()
//  - VALUE_PACKED_INT: the bytes are folded, most significant first, into an
//    unsigned long long which is printed in decimal
//  - anything else (VALUE_RAW): the bytes are written unmodified
157 static void
158 decode_and_show_value(const string& value)
159 {
160  switch (value_decode) {
161  case VALUE_ESCAPE: {
162  string esc;
163  description_append(esc, value);
164  cout << esc;
165  break;
166  }
// NOTE(review): listing line 167 is missing here - presumably the case label
// selecting the Xapian::sortable_unserialise() branch below.  Its enumerator
// name is not visible in this listing; restore it from the original file.
168  cout << Xapian::sortable_unserialise(value);
169  break;
170  case VALUE_PACKED_INT: {
171  unsigned long long i = 0;
// Iterating as unsigned char keeps bytes >= 0x80 from sign-extending.
172  for (unsigned char ch : value) {
173  i = (i << 8) | ch;
174  }
175  cout << i;
176  break;
177  }
178  default: // VALUE_RAW
179  cout << value;
180  break;
181  }
182 }
183 
184 static void
186 {
187  Document doc = db.get_document(docid);
188  ValueIterator v = doc.values_begin();
189  while (v != doc.values_end()) {
190  cout << sep << v.get_valueno() << ':';
192  ++v;
193  }
194 }
195 
196 static void
198  vector<docid>::const_iterator i,
199  vector<docid>::const_iterator end)
200 {
201  while (i != end) {
202  cout << "Values for record #" << *i << ':';
203  show_values(db, *i, separator);
204  cout << endl;
205  ++i;
206  }
207 }
208 
209 static void
211  vector<docid>::const_iterator i,
212  vector<docid>::const_iterator end,
213  Xapian::valueno slot)
214 {
215  while (i != end) {
216  Xapian::docid did = *i;
217  cout << "Value " << slot << " for record #" << did << ": ";
219  cout << endl;
220  ++i;
221  }
222 }
223 
224 static void
226 {
227  cout << sep << "[" << db.get_document(docid).get_data() << ']';
228 }
229 
230 static void
232  vector<docid>::const_iterator i,
233  vector<docid>::const_iterator end)
234 {
235  while (i != end) {
236  cout << "Data for record #" << *i << ':' << endl;
237  cout << db.get_document(*i).get_data() << endl;
238  ++i;
239  }
240 }
241 
242 static void
244  const char * all_pfx = NULL)
245 {
246  TermIterator t, tend;
247  if (all_pfx) {
248  t = db.allterms_begin(all_pfx);
249  tend = db.allterms_end(all_pfx);
250  cout << "All terms in database";
251  if (all_pfx[0])
252  cout << " with prefix \"" << all_pfx << "\"";
253  } else {
254  t = db.termlist_begin(did);
255  tend = db.termlist_end(did);
256  cout << "Term List for record #" << did;
257  }
258  if (verbose) {
259  cout << " (";
260  if (did != 0)
261  cout << "wdf, ";
262  cout << "termfreq";
263  if (verbose > 1)
264  cout << ", collection freq, wdf upper bound";
265  cout << ')';
266  }
267  cout << ':';
268 
269  while (t != tend) {
270  const string & term = *t;
271  cout << separator << term;
272  if (verbose) {
273  if (did != 0)
274  cout << ' ' << t.get_wdf();
275  cout << ' ' << t.get_termfreq();
276  if (verbose > 1) {
277  cout << ' ' << db.get_collection_freq(term)
278  << ' ' << db.get_wdf_upper_bound(term);
279  }
280  }
281  ++t;
282  }
283  cout << endl;
284 }
285 
286 static void
288  vector<docid>::const_iterator i,
289  vector<docid>::const_iterator end)
290 {
291  // Display termlists
292  while (i != end) {
293  show_termlist(db, *i);
294  ++i;
295  }
296 }
297 
// Program entry point: parse the command line, open the database(s) named
// on it and print whichever lists were requested.
//
// Returns 1 if a database can't be opened or a Xapian::Error escapes;
// invalid options terminate via exit(1).  The whole body is a function
// try-block so that any Xapian::Error is reported on stderr.
298 int
299 main(int argc, char **argv) try {
// --help and --version are only recognised as the first argument; they are
// matched by hand because the gnu_getopt() call below is given short
// options only.
300  if (argc > 1 && argv[1][0] == '-') {
301  if (strcmp(argv[1], "--help") == 0) {
302  cout << PROG_NAME " - " PROG_DESC "\n\n";
303  show_usage();
304  exit(0);
305  }
306  if (strcmp(argv[1], "--version") == 0) {
307  cout << PROG_NAME " - " PACKAGE_STRING << endl;
308  exit(0);
309  }
310  }
311 
// Non-NULL once -a or -A has been seen; holds the prefix to list ("" = all).
312  const char * all_terms = NULL;
313  vector<docid> recnos;
314  vector<string> terms;
315  vector<string> dbs;
316  Stem stemmer;
317 
318  valueno slot = 0; // Avoid "may be used uninitialised" warnings.
319  bool slot_set = false;
320 
321  int c;
322  while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
323  switch (c) {
324  case 'a':
325  all_terms = "";
326  break;
327  case 'A':
328  all_terms = optarg;
329  break;
330  case 'r': {
// Parse a document id: must be entirely numeric, non-zero, and must survive
// the conversion to Xapian::docid without changing value.
331  char * end;
332  errno = 0;
333  unsigned long n = strtoul(optarg, &end, 10);
334  if (optarg == end || *end) {
335  cout << "Non-numeric document id: " << optarg << endl;
336  exit(1);
337  }
338  Xapian::docid did(n);
339  if (errno == ERANGE || n == 0 || did != n) {
340  cout << "Document id out of range: " << optarg << endl;
341  exit(1);
342  }
343  recnos.push_back(did);
344  break;
345  }
346  case 't':
347  terms.push_back(optarg);
348  break;
349  case 's':
350  stemmer = Stem(optarg);
351  break;
352  case '1':
353  separator = '\n';
354  break;
355  case 'V':
// -V takes an optional argument: an optional type letter followed by a slot
// number.  With no argument at all, every value of each document is shown.
356  if (optarg) {
// NOTE(review): each case below is missing one statement in this listing
// (listing lines 359, 363, 367 and 371) - presumably the assignment to
// value_decode matching the type letter.  Restore from the original file.
357  switch (*optarg) {
358  case 'R':
360  ++optarg;
361  break;
362  case 'I':
364  ++optarg;
365  break;
366  case 'S':
368  ++optarg;
369  break;
370  case 'E':
372  ++optarg;
373  break;
374  }
// What remains of optarg must be the slot number.
375  char * end;
376  errno = 0;
377  unsigned long n = strtoul(optarg, &end, 10);
378  if (optarg == end || *end) {
379  cout << "Non-numeric value slot: " << optarg << endl;
380  exit(1);
381  }
382  slot = Xapian::valueno(n);
383  if (errno == ERANGE || slot != n) {
384  cout << "Value slot out of range: " << optarg << endl;
385  exit(1);
386  }
387  slot_set = true;
388  } else {
389  showvalues = true;
390  }
391  break;
392  case 'd':
393  showdocdata = true;
394  break;
395  case 'v':
396  ++verbose;
397  break;
398  case 'z':
399  count_zero_length_docs = true;
400  break;
401  default:
402  show_usage();
403  exit(1);
404  }
405  }
406 
// Every remaining argument names a database to open.
407  while (argv[optind]) dbs.push_back(argv[optind++]);
408 
409  if (dbs.empty()) {
410  show_usage();
411  exit(1);
412  }
413 
// NOTE(review): presumably sorted so that the p.skip_to() calls in the
// position-list loop below are made with ascending docids - confirm.
414  std::sort(recnos.begin(), recnos.end());
415 
// Combine all the named databases into a single Database object.
416  Database db;
417  {
418  vector<string>::const_iterator i;
419  for (i = dbs.begin(); i != dbs.end(); ++i) {
420  try {
421  db.add_database(Database(*i));
422  } catch (const Error &e) {
423  cerr << "Error opening database '" << *i << "': ";
424  cerr << e.get_description() << endl;
425  return 1;
426  }
427  }
428  }
429 
430  if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
431  // Show some statistics about the database.
432  show_db_stats(db);
433  return 0;
434  }
435 
436  if (all_terms) {
437  show_termlist(db, 0, all_terms);
438  }
439 
440  if (!recnos.empty()) {
// All values (-V) takes precedence over a single slot (-V<slot>).
441  if (showvalues) {
442  show_values(db, recnos.begin(), recnos.end());
443  } else if (slot_set) {
444  show_value(db, recnos.begin(), recnos.end(), slot);
445  }
446 
447  if (showdocdata) {
448  show_docdata(db, recnos.begin(), recnos.end());
449  }
450  } else {
// No -r given: show the requested slot for every document which has it.
451  if (slot_set) {
452  cout << "Value " << slot << " for each document:";
453  ValueIterator it = db.valuestream_begin(slot);
454  while (it != db.valuestream_end(slot)) {
455  cout << separator << it.get_docid() << ':';
// NOTE(review): listing line 456 is missing here - presumably it writes the
// decoded value at the iterator's current position.  Restore from the
// original file.
457  ++it;
458  }
459  cout << endl;
460  }
461  }
462 
// Without -t, show the termlist of each requested document and finish.
463  if (terms.empty()) {
464  show_termlists(db, recnos.begin(), recnos.end());
465  return 0;
466  }
467 
468  vector<string>::const_iterator i;
469  for (i = terms.begin(); i != terms.end(); ++i) {
// Each -t term is passed through the stemmer before being looked up.
470  string term = stemmer(*i);
471  PostingIterator p = db.postlist_begin(term);
472  PostingIterator pend = db.postlist_end(term);
473  if (p == pend) {
474  cout << "term '" << term << "' not in database\n";
475  continue;
476  }
477  if (recnos.empty()) {
478  // Display posting list
479  cout << "Posting List for term '" << term << "' (termfreq "
480  << db.get_termfreq(term) << ", collfreq "
481  << db.get_collection_freq(term) << ", wdf_max "
482  << db.get_wdf_upper_bound(term) << "):";
483  while (p != pend) {
484  cout << separator << *p;
485  if (verbose) {
486  cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
487  }
488  if (showvalues) show_values(db, *p, ' ');
489  if (showdocdata) show_docdata(db, *p, ' ');
490  ++p;
491  }
492  cout << endl;
493  } else {
494  // Display position lists
495  vector<docid>::const_iterator j;
496  for (j = recnos.begin(); j != recnos.end(); ++j) {
497  p.skip_to(*j);
498  if (p == pend || *p != *j) {
499  cout << "term '" << term <<
500  "' doesn't index document #" << *j << endl;
501  } else {
502  cout << "Position List for term '" << term
503  << "', record #" << *j << ':';
504  try {
// NOTE(review): listing line 505 is missing here - presumably the
// declaration of `pos` initialised from p.positionlist_begin().  Restore
// from the original file.
506  while (pos != p.positionlist_end()) {
507  cout << separator << *pos;
508  ++pos;
509  }
510  cout << endl;
511  } catch (const Error &e) {
// Report the failure and carry on with the next requested document.
512  cerr << "Error: " << e.get_description() << endl;
513  }
514  }
515  }
516  }
517  }
518 } catch (const Error &e) {
// The leading newline separates the message from any partially written list.
519  cerr << "\nError: " << e.get_description() << endl;
520  return 1;
521 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
Xapian::Document get_document(Xapian::docid did) const
Get a document from the database, given its document id.
Definition: omdatabase.cc:490
int gnu_getopt(int argc_, char *const *argv_, const char *shortopts_)
Definition: gnu_getopt.h:90
Wrappers to allow GNU getopt to be used cleanly from C++ code.
TermIterator termlist_begin(Xapian::docid did) const
An iterator pointing to the start of the termlist for a given document.
Definition: omdatabase.cc:198
static enum @5 value_decode
int optind
Definition: getopt.cc:94
static char separator
Definition: xapian-delve.cc:45
This class is used to access a database, or a group of databases.
Definition: database.h:68
static bool count_zero_length_docs
Definition: xapian-delve.cc:50
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
static void show_usage()
Definition: xapian-delve.cc:63
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:283
Class representing a stemming algorithm.
Definition: stem.h:62
static bool showdocdata
Definition: xapian-delve.cc:49
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: omdatabase.cc:401
bool has_positions() const
Does this database have any positional information?
Definition: omdatabase.cc:238
ValueIterator values_begin() const
Iterator for the values in this document.
Definition: omdocument.cc:210
TermIterator allterms_end(const std::string &=std::string()) const
Corresponding end iterator to allterms_begin(prefix).
Definition: database.h:265
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: omdatabase.cc:279
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
PositionIterator positionlist_end() const
Return an end PositionIterator for the current document.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Class for iterating over document values.
Definition: valueiterator.h:40
STL namespace.
Xapian::rev get_revision() const
Get the revision of the database.
Definition: omdatabase.cc:805
static Xapian::Stem stemmer
Definition: stemtest.cc:41
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
static void show_termlists(Database &db, vector< docid >::const_iterator i, vector< docid >::const_iterator end)
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
Definition: omdatabase.cc:312
static void show_values(Database &db, docid docid, char sep)
Xapian::doclength get_avlength() const
Get the average length of the documents in the database.
Definition: omdatabase.cc:293
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Class for iterating over a list of terms.
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: omdatabase.cc:421
static void show_docdata(Database &db, docid docid, char sep)
void description_append(std::string &desc, const std::string &s)
Definition: unittest.cc:100
static int verbose
Definition: xapian-delve.cc:47
#define PROG_DESC
Definition: xapian-delve.cc:61
#define PROG_NAME
Definition: xapian-delve.cc:60
static void decode_and_show_value(const string &value)
Public interfaces for the Xapian library.
bool locked() const
Test if this database is currently locked for writing.
Definition: omdatabase.cc:793
char * optarg
Definition: getopt.cc:79
static bool showvalues
Definition: xapian-delve.cc:48
Class for iterating over term positions.
int main(int argc, char **argv)
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
ValueIterator values_end() const
Equivalent end iterator for values_begin().
Definition: document.h:271
TermIterator allterms_begin(const std::string &prefix=std::string()) const
An iterator which runs across all terms with a given prefix.
Definition: omdatabase.cc:223
TermIterator termlist_end(Xapian::docid) const
Corresponding end iterator to termlist_begin().
Definition: database.h:238
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
Append a string to an object description, escaping invalid UTF-8.
size_t size() const
Return number of shards in this Database object.
Definition: database.h:93
double sortable_unserialise(const std::string &serialised)
Convert a string encoded using sortable_serialise back to a floating point number.
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
Append a string to an object description, escaping invalid UTF-8.
static void show_termlist(const Database &db, Xapian::docid did, const char *all_pfx=NULL)
static void show_db_stats(Database &db)
Definition: xapian-delve.cc:95
PositionIterator positionlist_begin() const
Return a PositionIterator for the current document.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount get_doclength() const
Return the length of the document at the current position.
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
void skip_to(Xapian::docid did)
Advance the iterator to document did.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
#define PACKAGE_STRING
Definition: config.h:315
Xapian::docid get_docid() const
Return the docid at the current position.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
static void show_value(Database &db, vector< docid >::const_iterator i, vector< docid >::const_iterator end, Xapian::valueno slot)
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Definition: database.h:225
std::string get_value(Xapian::valueno slot) const
Get value by number.
Definition: omdocument.cc:64
A handle representing a document in a Xapian database.
Definition: document.h:61
Xapian::valueno get_valueno() const
Return the value slot number for the current position.
std::string get_uuid() const
Get a UUID for the database.
Definition: omdatabase.cc:776
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Definition: omdatabase.cc:162
Xapian::termcount get_collection_freq(const std::string &tname) const
Return the total number of occurrences of the given term.
Definition: omdatabase.cc:339
Xapian::termcount get_wdf_upper_bound(const std::string &term) const
Get an upper bound on the wdf of term term.
Definition: omdatabase.cc:435