xapian-core  1.4.20
xapian-delve.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002-2022 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <xapian.h>
27 
28 #include <algorithm>
29 #include <ios>
30 #include <iostream>
31 #include <vector>
32 
33 #include "gnu_getopt.h"
34 
35 #include <cerrno>
36 #include <cstring>
37 #include <cstdlib>
39 
41 
42 using namespace Xapian;
43 using namespace std;
44 
// Character written before each list entry.  The -1 option changes it to a
// newline so that each entry is printed on its own line.
45 static char separator = ' ';
46 
// Verbosity level: incremented once for every -v on the command line.
47 static int verbose = 0;
// Set when -V is given without an argument (show all values of each document).
48 static bool showvalues = false;
// Set by -d (show the document data of each document).
49 static bool showdocdata = false;
// Set by -z; per the usage text this asks for documents of length 0 to be
// counted when database statistics are shown.
50 static bool count_zero_length_docs = false;
51 
52 // How to decode document values.
53 static enum {
59 
// Program name (used in the usage, --help and --version output) and the
// one-line description printed by --help.
60 #define PROG_NAME "delve"
61 #define PROG_DESC "Inspect the contents of a Xapian database"
62 
// Print the command-line synopsis and the list of options to stdout.
// main() calls this for --help, and (followed by exit(1)) when an unknown
// option is seen or no database is named.
// NOTE(review): the text advertises a long `--stemmer=LANG` form, but main()
// passes only a short-option string to gnu_getopt() - confirm that the long
// form is really accepted, or drop it from the help text.
63 static void show_usage() {
64  cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
65 "Options:\n"
66 " -a show all terms in the database\n"
67 " -A <prefix> show all terms in the database with given prefix\n"
68 " -r <recno> for term list(s)\n"
69 " -t <term> for posting list(s)\n"
70 " -t <term> -r <recno> for position list(s)\n"
71 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
72 " -1 output one list entry per line\n"
73 " -V[<type>]<valueno> output value valueno for each document referred to\n"
74 " (or each document in the database if no -r options).\n"
75 " <type> can be:\n"
76 " E: escape in a C-like way (default)\n"
77 " I: decode as a packed integer\n"
78 " R: show the raw value (which may contain binary data,\n"
79 " newlines, invalid UTF-8, etc)\n"
80 " S: decode using Xapian::sortable_unserialise()\n"
81 " -V[<type>] output all values for each document referred to.\n"
82 " <type> is as above.\n"
83 " -d output document data for each document referred to\n"
84 " -z for db, count documents with length 0\n"
85 " -v extra info (wdf and len for postlist;\n"
86 " wdf and termfreq for termlist; number of terms for db;\n"
87 " termfreq when showing all terms; value bounds and freq\n"
88 " when showing all values in a slot)\n"
89 " -vv even more info (also show collection freq and wdf\n"
90 " upper bound for terms)\n"
91 " --help display this help and exit\n"
92 " --version output version information and exit" << endl;
93 }
94 
// Print summary statistics for the database to stdout: UUID, document count,
// average document length, document length bounds, highest docid ever used,
// whether positional information is present, the revision, and whether the
// database is currently open for writing.  When -v was given, the number of
// distinct terms is counted and printed as well.
// NOTE(review): listing line 96 (the function name and parameter list) is
// absent from this extract; the cross-reference text at the end of the page
// gives it as `show_db_stats(Database &db)` - restore it from upstream.
95 static void
97 {
98  // Display a few database stats.
99  cout << "UUID = " << db.get_uuid() << endl;
100  cout << "number of documents = " << db.get_doccount() << endl;
101  cout << "average document length = " << db.get_avlength() << endl;
102  cout << "document length lower bound = " << db.get_doclength_lower_bound()
103  << endl;
104  cout << "document length upper bound = " << db.get_doclength_upper_bound()
105  << endl;
106  cout << "highest document id ever used = " << db.get_lastdocid() << endl;
 // Make the bool results below print as "true"/"false" rather than 1/0.
107  cout << boolalpha;
108  cout << "has positional information = " << db.has_positions() << endl;
109  cout << "revision = ";
 // A revision is only reported for a single shard; with more than one shard
 // "N/A" is printed instead of calling get_revision().
110  if (db.size() > 1) {
111  cout << "N/A (sharded DB)\n";
112  } else {
113  try {
114  cout << db.get_revision() << endl;
115  } catch (const Xapian::InvalidOperationError& e) {
116  cout << e.get_description() << endl;
117  } catch (const Xapian::UnimplementedError& e) {
118  cout << "N/A (" << e.get_msg() << ")\n";
119  }
120  }
121  cout << "currently open for writing = ";
 // locked() may throw; in that case the error description is shown in place
 // of the answer instead of aborting the rest of the report.
122  try {
123  cout << db.locked() << endl;
124  } catch (const Xapian::Error& e) {
125  cout << e.get_description() << endl;
126  }
127 
 // NOTE(review): listing line 128 is missing here.  It must open the block
 // closed at listing line 142 - presumably `if (count_zero_length_docs) {`,
 // matching the -z option; confirm against upstream.
129  Xapian::doccount empty_docs = 0;
130  if (db.get_total_length() == 0) {
131  // All documents are empty.
132  empty_docs = db.get_doccount();
133  } else {
 // Otherwise walk the posting list of the empty term and count the
 // entries whose document length is zero (presumably that list covers
 // every document in the database - TODO confirm).
134  Xapian::PostingIterator d = db.postlist_begin(string());
135  while (d != db.postlist_end(string())) {
136  if (d.get_doclength() == 0)
137  ++empty_docs;
138  ++d;
139  }
140  }
141  cout << "number of zero-length documents = " << empty_docs << endl;
142  }
143 
144  if (verbose) {
145  // To find the number of terms, we have to count them!
146  // This will take a few seconds or minutes, so only do it if -v
147  // was specified.
148  termcount terms = 0;
149  TermIterator t = db.allterms_begin();
150  while (t != db.allterms_end()) {
151  ++terms;
152  ++t;
153  }
154  cout << "number of distinct terms = " << terms << endl;
155  }
156 }
157 
// Write a document value to stdout, decoded according to the file-scope
// `value_decode` setting.
//
// value: the stored value string to display.
158 static void
159 decode_and_show_value(const string& value)
160 {
161  switch (value_decode) {
162  case VALUE_ESCAPE: {
 // Build an escaped form of the value with description_append() and
 // print that.
163  string esc;
164  description_append(esc, value);
165  cout << esc;
166  break;
167  }
 // NOTE(review): listing line 168 (the case label for the statement below)
 // is missing from this extract.  The call suggests it is the
 // "sortable double" decoding mode - restore the label from upstream.
169  cout << Xapian::sortable_unserialise(value);
170  break;
171  case VALUE_PACKED_INT: {
 // Treat the bytes as one unsigned integer with the first byte most
 // significant.  Each step shifts the accumulator left by 8 bits, so
 // for values longer than the accumulator the leading bytes are lost.
172  unsigned long long i = 0;
173  for (unsigned char ch : value) {
174  i = (i << 8) | ch;
175  }
176  cout << i;
177  break;
178  }
179  default: // VALUE_RAW
180  cout << value;
181  break;
182  }
183 }
184 
// For each value stored in one document, print the separator `sep`, the
// value's slot number and a ':' to stdout.
// NOTE(review): listing line 186 (name and parameters) is missing; the
// cross-reference text at the end of the page gives it as
// `show_values(Database &db, docid docid, char sep)`.
185 static void
187 {
188  Document doc = db.get_document(docid);
189  ValueIterator v = doc.values_begin();
190  while (v != doc.values_end()) {
191  cout << sep << v.get_valueno() << ':';
 // NOTE(review): listing line 192 is missing here - presumably it prints
 // the value itself after the ':'; confirm against upstream.
193  ++v;
194  }
195 }
196 
// For each document id in [i, end), print a "Values for record #<id>:" header
// followed by that document's values (via the single-document show_values(),
// using the global `separator`), then end the line.
// NOTE(review): listing line 198 (the name and first parameter) is missing;
// main() calls this as show_values(db, recnos.begin(), recnos.end()).
197 static void
199  vector<docid>::const_iterator i,
200  vector<docid>::const_iterator end)
201 {
202  while (i != end) {
203  cout << "Values for record #" << *i << ':';
204  show_values(db, *i, separator);
205  cout << endl;
206  ++i;
207  }
208 }
209 
// For each document id in [i, end), print a
// "Value <slot> for record #<id>: " line for the given value slot.
// NOTE(review): listing line 211 (the name and first parameter) is missing;
// the cross-reference text at the end of the page gives the signature as
// `show_value(Database &db, vector<docid>::const_iterator i,
// vector<docid>::const_iterator end, Xapian::valueno slot)`.
210 static void
212  vector<docid>::const_iterator i,
213  vector<docid>::const_iterator end,
214  Xapian::valueno slot)
215 {
216  while (i != end) {
217  Xapian::docid did = *i;
218  cout << "Value " << slot << " for record #" << did << ": ";
 // NOTE(review): listing line 219 is missing here - presumably it fetches
 // and prints the value in `slot` for document `did`; confirm upstream.
220  cout << endl;
221  ++i;
222  }
223 }
224 
// Print `sep` followed by one document's data enclosed in square brackets.
// NOTE(review): listing line 226 (name and parameters) is missing; the
// cross-reference text at the end of the page gives it as
// `show_docdata(Database &db, docid docid, char sep)`.
225 static void
227 {
228  cout << sep << "[" << db.get_document(docid).get_data() << ']';
229 }
230 
// For each document id in [i, end), print a "Data for record #<id>:" header
// line followed by the document's data on the next line.
// NOTE(review): listing line 232 (the name and first parameter) is missing;
// main() calls this as show_docdata(db, recnos.begin(), recnos.end()).
231 static void
233  vector<docid>::const_iterator i,
234  vector<docid>::const_iterator end)
235 {
236  while (i != end) {
237  cout << "Data for record #" << *i << ':' << endl;
238  cout << db.get_document(*i).get_data() << endl;
239  ++i;
240  }
241 }
242 
// Print a list of terms to stdout: either every term in the database starting
// with `all_pfx` (when all_pfx is non-NULL), or the term list of document
// `did` (when all_pfx is NULL).  Each term is preceded by the global
// `separator`; with -v extra statistics follow each term.
// NOTE(review): listing line 244 (the name and first parameters) is missing;
// the cross-reference text at the end of the page gives the signature as
// `show_termlist(const Database &db, Xapian::docid did,
// const char *all_pfx=NULL)`.
243 static void
245  const char * all_pfx = NULL)
246 {
247  TermIterator t, tend;
248  if (all_pfx) {
249  t = db.allterms_begin(all_pfx);
250  tend = db.allterms_end(all_pfx);
251  cout << "All terms in database";
 // An empty prefix means "all terms", so only mention a non-empty one.
252  if (all_pfx[0])
253  cout << " with prefix \"" << all_pfx << "\"";
254  } else {
255  t = db.termlist_begin(did);
256  tend = db.termlist_end(did);
257  cout << "Term List for record #" << did;
258  }
 // With -v, describe in the header which numbers follow each term.  The wdf
 // column is only included when did is non-zero (main() passes 0 when
 // listing all terms).
259  if (verbose) {
260  cout << " (";
261  if (did != 0)
262  cout << "wdf, ";
263  cout << "termfreq";
264  if (verbose > 1)
265  cout << ", collection freq, wdf upper bound";
266  cout << ')';
267  }
268  cout << ':';
269 
270  while (t != tend) {
271  const string & term = *t;
272  cout << separator << term;
273  if (verbose) {
274  if (did != 0)
275  cout << ' ' << t.get_wdf();
276  cout << ' ' << t.get_termfreq();
277  if (verbose > 1) {
278  cout << ' ' << db.get_collection_freq(term)
279  << ' ' << db.get_wdf_upper_bound(term);
280  }
281  }
282  ++t;
283  }
284  cout << endl;
285 }
286 
// Print the term list of every document id in [i, end) by calling
// show_termlist() on each in turn.
// NOTE(review): listing line 288 (the name and first parameter) is missing;
// the cross-reference text at the end of the page gives the signature as
// `show_termlists(Database &db, vector<docid>::const_iterator i,
// vector<docid>::const_iterator end)`.
287 static void
289  vector<docid>::const_iterator i,
290  vector<docid>::const_iterator end)
291 {
292  // Display termlists
293  while (i != end) {
294  show_termlist(db, *i);
295  ++i;
296  }
297 }
298 
// Program entry point.  Parses the command line, opens the named database(s)
// as one combined Database, and then, depending on the options, prints
// database statistics, term lists, values, document data, posting lists or
// position lists to stdout.
//
// The whole body is a function-try-block: any Xapian Error escaping it is
// reported on stderr and the program returns 1.  Option errors call exit(1);
// a database that cannot be opened makes main() return 1.
299 int
300 main(int argc, char **argv) try {
 // --help and --version are only recognised as the first argument; they are
 // handled here, before the short-option parsing below.
301  if (argc > 1 && argv[1][0] == '-') {
302  if (strcmp(argv[1], "--help") == 0) {
303  cout << PROG_NAME " - " PROG_DESC "\n\n";
304  show_usage();
305  exit(0);
306  }
307  if (strcmp(argv[1], "--version") == 0) {
308  cout << PROG_NAME " - " PACKAGE_STRING << endl;
309  exit(0);
310  }
311  }
312 
 // all_terms stays NULL unless -a/-A was given; "" (from -a) means all terms
 // with no prefix restriction.
313  const char * all_terms = NULL;
314  vector<docid> recnos;
315  vector<string> terms;
316  vector<string> dbs;
317  Stem stemmer;
318 
319  valueno slot = 0; // Avoid "may be used uninitialised" warnings.
320  bool slot_set = false;
321 
322  int c;
 // In the option string a single ':' marks a required argument and "::"
 // (after V) an optional one - presumably standard getopt semantics for
 // gnu_getopt().
323  while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
324  switch (c) {
325  case 'a':
326  all_terms = "";
327  break;
328  case 'A':
329  all_terms = optarg;
330  break;
331  case 'r': {
 // Parse a decimal document id; reject an empty number or trailing
 // junk first, then out-of-range values.
332  char * end;
333  errno = 0;
334  unsigned long n = strtoul(optarg, &end, 10);
335  if (optarg == end || *end) {
336  cout << "Non-numeric document id: " << optarg << endl;
337  exit(1);
338  }
 // `did != n` catches a value that does not survive conversion to
 // Xapian::docid; 0 is rejected as well.
339  Xapian::docid did(n);
340  if (errno == ERANGE || n == 0 || did != n) {
341  cout << "Document id out of range: " << optarg << endl;
342  exit(1);
343  }
344  recnos.push_back(did);
345  break;
346  }
347  case 't':
348  terms.push_back(optarg);
349  break;
350  case 's':
351  stemmer = Stem(optarg);
352  break;
353  case '1':
354  separator = '\n';
355  break;
356  case 'V':
357  if (optarg) {
 // An optional leading type letter is consumed here before the
 // slot number is parsed.
 // NOTE(review): listing lines 360, 364, 368 and 372 (the first
 // statement of each case) are missing from this extract -
 // presumably each assigns the matching mode to value_decode;
 // restore them from upstream.
358  switch (*optarg) {
359  case 'R':
361  ++optarg;
362  break;
363  case 'I':
365  ++optarg;
366  break;
367  case 'S':
369  ++optarg;
370  break;
371  case 'E':
373  ++optarg;
374  break;
375  }
 // NOTE(review): if only a type letter was given (e.g. -VR),
 // optarg is now empty and the check below exits with
 // "Non-numeric value slot", although the usage text documents
 // `-V[<type>]` without a slot number as showing all values -
 // confirm which behaviour is intended.
376  char * end;
377  errno = 0;
378  unsigned long n = strtoul(optarg, &end, 10);
379  if (optarg == end || *end) {
380  cout << "Non-numeric value slot: " << optarg << endl;
381  exit(1);
382  }
383  slot = Xapian::valueno(n);
384  if (errno == ERANGE || slot != n) {
385  cout << "Value slot out of range: " << optarg << endl;
386  exit(1);
387  }
388  slot_set = true;
389  } else {
390  showvalues = true;
391  }
392  break;
393  case 'd':
394  showdocdata = true;
395  break;
396  case 'v':
397  ++verbose;
398  break;
399  case 'z':
400  count_zero_length_docs = true;
401  break;
402  default:
403  show_usage();
404  exit(1);
405  }
406  }
407 
 // Every remaining argument is a database path.
408  while (argv[optind]) dbs.push_back(argv[optind++]);
409 
410  if (dbs.empty()) {
411  show_usage();
412  exit(1);
413  }
414 
 // Put the requested docids in ascending order (presumably so that the
 // skip_to() calls in the position-list loop below only ever move forward -
 // TODO confirm).
415  std::sort(recnos.begin(), recnos.end());
416 
 // Open each named database and add it to one combined Database object.
417  Database db;
418  {
419  vector<string>::const_iterator i;
420  for (i = dbs.begin(); i != dbs.end(); ++i) {
421  try {
422  db.add_database(Database(*i));
423  } catch (const Error &e) {
424  cerr << "Error opening database '" << *i << "': ";
425  cerr << e.get_description() << endl;
426  return 1;
427  }
428  }
429  }
430 
431  if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
432  // Show some statistics about the database.
433  show_db_stats(db);
434  return 0;
435  }
436 
437  if (all_terms) {
438  show_termlist(db, 0, all_terms);
439  }
440 
441  if (!recnos.empty()) {
 // Specific documents were requested with -r: show their values and/or
 // document data.
442  if (showvalues) {
443  show_values(db, recnos.begin(), recnos.end());
444  } else if (slot_set) {
445  show_value(db, recnos.begin(), recnos.end(), slot);
446  }
447 
448  if (showdocdata) {
449  show_docdata(db, recnos.begin(), recnos.end());
450  }
451  } else {
 // No -r given: a value slot request applies to the whole database, so
 // walk the value stream for that slot.
452  if (slot_set) {
453  cout << "Value " << slot;
454  if (verbose) {
455  cout << " (lower bound=";
456  decode_and_show_value(db.get_value_lower_bound(slot));
457  cout << " upper bound=";
458  decode_and_show_value(db.get_value_upper_bound(slot));
459  cout << " freq=" << db.get_value_freq(slot) << ")";
460  }
461  cout << " for each document:";
462  ValueIterator it = db.valuestream_begin(slot);
463  while (it != db.valuestream_end(slot)) {
464  cout << separator << it.get_docid() << ':';
 // NOTE(review): listing line 465 is missing here - presumably it
 // prints the value at the iterator's position; confirm upstream.
466  ++it;
467  }
468  cout << endl;
469  }
470  }
471 
 // Without -t there are no posting or position lists to show; print the term
 // list of each requested document (none when recnos is empty) and finish.
472  if (terms.empty()) {
473  show_termlists(db, recnos.begin(), recnos.end());
474  return 0;
475  }
476 
477  vector<string>::const_iterator i;
478  for (i = terms.begin(); i != terms.end(); ++i) {
 // The term given on the command line is passed through the stemmer
 // chosen with -s before being looked up.
479  string term = stemmer(*i);
480  PostingIterator p = db.postlist_begin(term);
481  PostingIterator pend = db.postlist_end(term);
482  if (p == pend) {
483  cout << "term '" << term << "' not in database\n";
484  continue;
485  }
486  if (recnos.empty()) {
487  // Display posting list
488  cout << "Posting List for term '" << term << "' (termfreq "
489  << db.get_termfreq(term) << ", collfreq "
490  << db.get_collection_freq(term) << ", wdf_max "
491  << db.get_wdf_upper_bound(term) << "):";
492  while (p != pend) {
493  cout << separator << *p;
494  if (verbose) {
495  cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
496  }
497  if (showvalues) show_values(db, *p, ' ');
498  if (showdocdata) show_docdata(db, *p, ' ');
499  ++p;
500  }
501  cout << endl;
502  } else {
503  // Display position lists
504  vector<docid>::const_iterator j;
505  for (j = recnos.begin(); j != recnos.end(); ++j) {
 // The same posting iterator is advanced to each requested docid
 // in turn.
506  p.skip_to(*j);
507  if (p == pend || *p != *j) {
508  cout << "term '" << term <<
509  "' doesn't index document #" << *j << endl;
510  } else {
511  cout << "Position List for term '" << term
512  << "', record #" << *j << ':';
 // An error while reading positions is reported on stderr and
 // the loop carries on with the next document.
513  try {
 // NOTE(review): listing line 514 is missing here; it must
 // declare `pos` - presumably
 // `PositionIterator pos = p.positionlist_begin();`.
515  while (pos != p.positionlist_end()) {
516  cout << separator << *pos;
517  ++pos;
518  }
519  cout << endl;
520  } catch (const Error &e) {
521  cerr << "Error: " << e.get_description() << endl;
522  }
523  }
524  }
525  }
526  }
 // Control reaches the end of main() here without an explicit return.
527 } catch (const Error &e) {
528  cerr << "\nError: " << e.get_description() << endl;
529  return 1;
530 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
Xapian::Document get_document(Xapian::docid did) const
Get a document from the database, given its document id.
Definition: omdatabase.cc:490
int gnu_getopt(int argc_, char *const *argv_, const char *shortopts_)
Definition: gnu_getopt.h:90
Wrappers to allow GNU getopt to be used cleanly from C++ code.
TermIterator termlist_begin(Xapian::docid did) const
An iterator pointing to the start of the termlist for a given document.
Definition: omdatabase.cc:198
static enum @5 value_decode
int optind
Definition: getopt.cc:94
static char separator
Definition: xapian-delve.cc:45
This class is used to access a database, or a group of databases.
Definition: database.h:68
static bool count_zero_length_docs
Definition: xapian-delve.cc:50
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
static void show_usage()
Definition: xapian-delve.cc:63
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:283
Class representing a stemming algorithm.
Definition: stem.h:62
static bool showdocdata
Definition: xapian-delve.cc:49
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: omdatabase.cc:401
bool has_positions() const
Does this database have any positional information?
Definition: omdatabase.cc:238
ValueIterator values_begin() const
Iterator for the values in this document.
Definition: omdocument.cc:210
TermIterator allterms_end(const std::string &=std::string()) const
Corresponding end iterator to allterms_begin(prefix).
Definition: database.h:265
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: omdatabase.cc:279
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
PositionIterator positionlist_end() const
Return an end PositionIterator for the current document.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Class for iterating over document values.
Definition: valueiterator.h:40
STL namespace.
Xapian::rev get_revision() const
Get the revision of the database.
Definition: omdatabase.cc:805
static Xapian::Stem stemmer
Definition: stemtest.cc:41
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
static void show_termlists(Database &db, vector< docid >::const_iterator i, vector< docid >::const_iterator end)
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
Definition: omdatabase.cc:312
static void show_values(Database &db, docid docid, char sep)
Xapian::doclength get_avlength() const
Get the average length of the documents in the database.
Definition: omdatabase.cc:293
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Class for iterating over a list of terms.
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: omdatabase.cc:421
static void show_docdata(Database &db, docid docid, char sep)
void description_append(std::string &desc, const std::string &s)
Definition: unittest.cc:100
static int verbose
Definition: xapian-delve.cc:47
#define PROG_DESC
Definition: xapian-delve.cc:61
#define PROG_NAME
Definition: xapian-delve.cc:60
static void decode_and_show_value(const string &value)
Public interfaces for the Xapian library.
bool locked() const
Test if this database is currently locked for writing.
Definition: omdatabase.cc:793
char * optarg
Definition: getopt.cc:79
static bool showvalues
Definition: xapian-delve.cc:48
Class for iterating over term positions.
int main(int argc, char **argv)
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
ValueIterator values_end() const
Equivalent end iterator for values_begin().
Definition: document.h:271
TermIterator allterms_begin(const std::string &prefix=std::string()) const
An iterator which runs across all terms with a given prefix.
Definition: omdatabase.cc:223
TermIterator termlist_end(Xapian::docid) const
Corresponding end iterator to termlist_begin().
Definition: database.h:238
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
Append a string to an object description, escaping invalid UTF-8.
size_t size() const
Return number of shards in this Database object.
Definition: database.h:93
double sortable_unserialise(const std::string &serialised)
Convert a string encoded using sortable_serialise back to a floating point number.
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
Append a string to an object description, escaping invalid UTF-8.
static void show_termlist(const Database &db, Xapian::docid did, const char *all_pfx=NULL)
static void show_db_stats(Database &db)
Definition: xapian-delve.cc:96
PositionIterator positionlist_begin() const
Return a PositionIterator for the current document.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount get_doclength() const
Return the length of the document at the current position.
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
void skip_to(Xapian::docid did)
Advance the iterator to document did.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
#define PACKAGE_STRING
Definition: config.h:337
Xapian::docid get_docid() const
Return the docid at the current position.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
static void show_value(Database &db, vector< docid >::const_iterator i, vector< docid >::const_iterator end, Xapian::valueno slot)
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Definition: database.h:225
std::string get_value(Xapian::valueno slot) const
Get value by number.
Definition: omdocument.cc:64
A handle representing a document in a Xapian database.
Definition: document.h:61
Xapian::valueno get_valueno() const
Return the value slot number for the current position.
std::string get_uuid() const
Get a UUID for the database.
Definition: omdatabase.cc:776
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Definition: omdatabase.cc:162
Xapian::termcount get_collection_freq(const std::string &tname) const
Return the total number of occurrences of the given term.
Definition: omdatabase.cc:339
Xapian::termcount get_wdf_upper_bound(const std::string &term) const
Get an upper bound on the wdf of term term.
Definition: omdatabase.cc:435