xapian-core  2.0.0
xapian-delve.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002-2022 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include <xapian.h>
26 
27 #include <algorithm>
28 #include <ios>
29 #include <iostream>
30 #include <vector>
31 
32 #include "gnu_getopt.h"
33 
34 #include <cerrno>
35 #include <cstring>
36 #include <cstdlib>
38 
40 
41 using namespace Xapian;
42 using namespace std;
43 
// Character written before each list entry; a space by default, changed to
// a newline by the -1 option.
static char separator = ' ';

// Verbosity level; incremented once for each -v option given.
static int verbose = 0;
// Show all values for each document (enabled via -V).
static bool showvalues = false;
// Show the document data for each document (enabled via -d).
static bool showdocdata = false;
// Count zero-length documents when showing database stats (enabled via -z).
static bool count_zero_length_docs = false;
50 
// How to decode document values.
//
// Selected by the optional <type> letter of the -V option; escaping is the
// default, as documented in the usage message.
static enum {
    /// Escape in a C-like way (-VE, the default).
    VALUE_ESCAPE,
    /// Decode using Xapian::sortable_unserialise() (-VS).
    VALUE_SORTABLE_SERIALISE,
    /// Decode as a packed big-endian integer (-VI).
    VALUE_PACKED_INT,
    /// Show the raw value, which may contain binary data (-VR).
    VALUE_RAW
} value_decode = VALUE_ESCAPE;
58 
// Program name shown in the usage message and in --help/--version output.
#define PROG_NAME "delve"
// One-line description of the program shown by --help.
#define PROG_DESC "Inspect the contents of a Xapian database"
61 
62 static void show_usage() {
63  cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
64 "Options:\n"
65 " -a show all terms in the database\n"
66 " -A <prefix> show all terms in the database with given prefix\n"
67 " -r <recno> for term list(s)\n"
68 " -t <term> for posting list(s)\n"
69 " -t <term> -r <recno> for position list(s)\n"
70 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
71 " -1 output one list entry per line\n"
72 " -V[<type>]<valueno> output value valueno for each document referred to\n"
73 " (or each document in the database if no -r options).\n"
74 " <type> can be:\n"
75 " E: escape in a C-like way (default)\n"
76 " I: decode as a packed integer\n"
77 " R: show the raw value (which may contain binary data,\n"
78 " newlines, invalid UTF-8, etc)\n"
79 " S: decode using Xapian::sortable_unserialise()\n"
80 " -V[<type>] output all values for each document referred to.\n"
81 " <type> is as above.\n"
82 " -d output document data for each document referred to\n"
83 " -z for db, count documents with length 0\n"
84 " -v extra info (wdf and len for postlist;\n"
85 " wdf and termfreq for termlist; number of terms for db;\n"
86 " termfreq when showing all terms; value bounds and freq\n"
87 " when showing all values in a slot)\n"
88 " -vv even more info (also show collection freq and wdf\n"
89 " upper bound for terms)\n"
90 " --help display this help and exit\n"
91 " --version output version information and exit\n";
92 }
93 
94 static void
96 {
97  // Display a few database stats.
98  cout << "UUID = " << db.get_uuid() << '\n';
99  cout << "number of documents = " << db.get_doccount() << '\n';
100  cout << "average document length = " << db.get_avlength() << '\n';
101  cout << "document length lower bound = " << db.get_doclength_lower_bound()
102  << '\n';
103  cout << "document length upper bound = " << db.get_doclength_upper_bound()
104  << '\n';
105  cout << "highest document id ever used = " << db.get_lastdocid() << '\n';
106  cout << boolalpha;
107  cout << "has positional information = " << db.has_positions() << '\n';
108  cout << "revision = ";
109  if (db.size() > 1) {
110  cout << "N/A (sharded DB)\n";
111  } else {
112  try {
113  cout << db.get_revision() << '\n';
114  } catch (const Xapian::InvalidOperationError& e) {
115  cout << e.get_description() << '\n';
116  } catch (const Xapian::UnimplementedError& e) {
117  cout << "N/A (" << e.get_msg() << ")\n";
118  }
119  }
120  cout << "currently open for writing = ";
121  try {
122  cout << db.locked() << '\n';
123  } catch (const Xapian::Error& e) {
124  cout << e.get_description() << '\n';
125  }
126 
128  Xapian::doccount empty_docs = 0;
129  if (db.get_total_length() == 0) {
130  // All documents are empty.
131  empty_docs = db.get_doccount();
132  } else {
133  Xapian::PostingIterator d = db.postlist_begin(string_view{});
134  while (d != db.postlist_end(string_view{})) {
135  if (d.get_doclength() == 0)
136  ++empty_docs;
137  ++d;
138  }
139  }
140  cout << "number of zero-length documents = " << empty_docs << '\n';
141  }
142 
143  if (verbose) {
144  // To find the number of terms, we have to count them!
145  // This will take a few seconds or minutes, so only do it if -v
146  // was specified.
147  termcount terms = 0;
148  TermIterator t = db.allterms_begin();
149  while (t != db.allterms_end()) {
150  ++terms;
151  ++t;
152  }
153  cout << "number of distinct terms = " << terms << '\n';
154  }
155 }
156 
157 static void
158 decode_and_show_value(const string& value)
159 {
160  switch (value_decode) {
161  case VALUE_ESCAPE: {
162  string esc;
163  description_append(esc, value);
164  cout << esc;
165  break;
166  }
168  cout << Xapian::sortable_unserialise(value);
169  break;
170  case VALUE_PACKED_INT: {
171  unsigned long long i = 0;
172  for (unsigned char ch : value) {
173  i = (i << 8) | ch;
174  }
175  cout << i;
176  break;
177  }
178  default: // VALUE_RAW
179  cout << value;
180  break;
181  }
182 }
183 
184 static void
186 {
187  Document doc = db.get_document(docid);
188  ValueIterator v = doc.values_begin();
189  while (v != doc.values_end()) {
190  cout << sep << v.get_valueno() << ':';
192  ++v;
193  }
194 }
195 
196 static void
198  vector<docid>::const_iterator i,
199  vector<docid>::const_iterator end)
200 {
201  while (i != end) {
202  cout << "Values for record #" << *i << ':';
203  show_values(db, *i, separator);
204  cout << '\n';
205  ++i;
206  }
207 }
208 
209 static void
211  vector<docid>::const_iterator i,
212  vector<docid>::const_iterator end,
213  Xapian::valueno slot)
214 {
215  while (i != end) {
216  Xapian::docid did = *i;
217  cout << "Value " << slot << " for record #" << did << ": ";
219  cout << '\n';
220  ++i;
221  }
222 }
223 
224 static void
226 {
227  cout << sep << "[" << db.get_document(docid).get_data() << ']';
228 }
229 
230 static void
232  vector<docid>::const_iterator i,
233  vector<docid>::const_iterator end)
234 {
235  while (i != end) {
236  cout << "Data for record #" << *i << ":\n";
237  cout << db.get_document(*i).get_data() << '\n';
238  ++i;
239  }
240 }
241 
242 static void
244  const char * all_pfx = NULL)
245 {
246  TermIterator t, tend;
247  if (all_pfx) {
248  t = db.allterms_begin(all_pfx);
249  tend = db.allterms_end(all_pfx);
250  cout << "All terms in database";
251  if (all_pfx[0])
252  cout << " with prefix \"" << all_pfx << "\"";
253  } else {
254  t = db.termlist_begin(did);
255  tend = db.termlist_end(did);
256  cout << "Term List for record #" << did;
257  }
258  if (verbose) {
259  cout << " (";
260  if (did != 0)
261  cout << "wdf, ";
262  cout << "termfreq";
263  if (verbose > 1)
264  cout << ", collection freq, wdf upper bound";
265  cout << ')';
266  }
267  cout << ':';
268 
269  while (t != tend) {
270  const string & term = *t;
271  cout << separator << term;
272  if (verbose) {
273  if (did != 0)
274  cout << ' ' << t.get_wdf();
275  cout << ' ' << t.get_termfreq();
276  if (verbose > 1) {
277  cout << ' ' << db.get_collection_freq(term)
278  << ' ' << db.get_wdf_upper_bound(term);
279  }
280  }
281  ++t;
282  }
283  cout << '\n';
284 }
285 
286 static void
288  vector<docid>::const_iterator i,
289  vector<docid>::const_iterator end)
290 {
291  // Display termlists
292  while (i != end) {
293  show_termlist(db, *i);
294  ++i;
295  }
296 }
297 
298 int
299 main(int argc, char **argv) try {
300  if (argc > 1 && argv[1][0] == '-') {
301  if (strcmp(argv[1], "--help") == 0) {
302  cout << PROG_NAME " - " PROG_DESC "\n\n";
303  show_usage();
304  exit(0);
305  }
306  if (strcmp(argv[1], "--version") == 0) {
307  cout << PROG_NAME " - " PACKAGE_STRING "\n";
308  exit(0);
309  }
310  }
311 
312  const char * all_terms = NULL;
313  vector<docid> recnos;
314  vector<string> terms;
315  vector<string> dbs;
316  Stem stemmer;
317 
318  valueno slot = 0; // Avoid "may be used uninitialised" warnings.
319  bool slot_set = false;
320 
321  int c;
322  while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
323  switch (c) {
324  case 'a':
325  all_terms = "";
326  break;
327  case 'A':
328  all_terms = optarg;
329  break;
330  case 'r': {
331  char * end;
332  errno = 0;
333  unsigned long n = strtoul(optarg, &end, 10);
334  if (optarg == end || *end) {
335  cout << "Non-numeric document id: " << optarg << '\n';
336  exit(1);
337  }
338  Xapian::docid did(n);
339  if (errno == ERANGE || n == 0 || did != n) {
340  cout << "Document id out of range: " << optarg << '\n';
341  exit(1);
342  }
343  recnos.push_back(did);
344  break;
345  }
346  case 't':
347  terms.push_back(optarg);
348  break;
349  case 's':
350  stemmer = Stem(optarg);
351  break;
352  case '1':
353  separator = '\n';
354  break;
355  case 'V':
356  if (optarg) {
357  switch (*optarg) {
358  case 'R':
360  ++optarg;
361  break;
362  case 'I':
364  ++optarg;
365  break;
366  case 'S':
368  ++optarg;
369  break;
370  case 'E':
372  ++optarg;
373  break;
374  }
375  char * end;
376  errno = 0;
377  unsigned long n = strtoul(optarg, &end, 10);
378  if (optarg == end || *end) {
379  cout << "Non-numeric value slot: " << optarg << '\n';
380  exit(1);
381  }
382  slot = Xapian::valueno(n);
383  if (errno == ERANGE || slot != n) {
384  cout << "Value slot out of range: " << optarg << '\n';
385  exit(1);
386  }
387  slot_set = true;
388  } else {
389  showvalues = true;
390  }
391  break;
392  case 'd':
393  showdocdata = true;
394  break;
395  case 'v':
396  ++verbose;
397  break;
398  case 'z':
399  count_zero_length_docs = true;
400  break;
401  default:
402  show_usage();
403  exit(1);
404  }
405  }
406 
407  while (argv[optind]) dbs.push_back(argv[optind++]);
408 
409  if (dbs.empty()) {
410  show_usage();
411  exit(1);
412  }
413 
414  std::sort(recnos.begin(), recnos.end());
415 
416  Database db;
417  {
418  vector<string>::const_iterator i;
419  for (i = dbs.begin(); i != dbs.end(); ++i) {
420  try {
421  db.add_database(Database(*i));
422  } catch (const Error &e) {
423  cerr << "Error opening database '" << *i << "': ";
424  cerr << e.get_description() << '\n';
425  return 1;
426  }
427  }
428  }
429 
430  if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
431  // Show some statistics about the database.
432  show_db_stats(db);
433  return 0;
434  }
435 
436  if (all_terms) {
437  show_termlist(db, 0, all_terms);
438  }
439 
440  if (!recnos.empty()) {
441  if (showvalues) {
442  show_values(db, recnos.begin(), recnos.end());
443  } else if (slot_set) {
444  show_value(db, recnos.begin(), recnos.end(), slot);
445  }
446 
447  if (showdocdata) {
448  show_docdata(db, recnos.begin(), recnos.end());
449  }
450  } else {
451  if (slot_set) {
452  cout << "Value " << slot;
453  if (verbose) {
454  cout << " (lower bound=";
456  cout << " upper bound=";
458  cout << " freq=" << db.get_value_freq(slot) << ")";
459  }
460  cout << " for each document:";
461  ValueIterator it = db.valuestream_begin(slot);
462  while (it != db.valuestream_end(slot)) {
463  cout << separator << it.get_docid() << ':';
465  ++it;
466  }
467  cout << '\n';
468  }
469  }
470 
471  if (terms.empty()) {
472  show_termlists(db, recnos.begin(), recnos.end());
473  return 0;
474  }
475 
476  vector<string>::const_iterator i;
477  for (i = terms.begin(); i != terms.end(); ++i) {
478  string term = stemmer(*i);
480  PostingIterator pend = db.postlist_end(term);
481  if (p == pend) {
482  cout << "term '" << term << "' not in database\n";
483  continue;
484  }
485  if (recnos.empty()) {
486  // Display posting list
487  cout << "Posting List for term '" << term << "' (termfreq "
488  << db.get_termfreq(term) << ", collfreq "
489  << db.get_collection_freq(term) << ", wdf_max "
490  << db.get_wdf_upper_bound(term) << "):";
491  while (p != pend) {
492  cout << separator << *p;
493  if (verbose) {
494  cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
495  }
496  if (showvalues) show_values(db, *p, ' ');
497  if (showdocdata) show_docdata(db, *p, ' ');
498  ++p;
499  }
500  cout << '\n';
501  } else {
502  // Display position lists
503  vector<docid>::const_iterator j;
504  for (j = recnos.begin(); j != recnos.end(); ++j) {
505  p.skip_to(*j);
506  if (p == pend || *p != *j) {
507  cout << "term '" << term <<
508  "' doesn't index document #" << *j << '\n';
509  } else {
510  cout << "Position List for term '" << term
511  << "', record #" << *j << ':';
512  try {
513  PositionIterator pos = p.positionlist_begin();
514  while (pos != p.positionlist_end()) {
515  cout << separator << *pos;
516  ++pos;
517  }
518  cout << '\n';
519  } catch (const Error &e) {
520  cerr << "Error: " << e.get_description() << '\n';
521  }
522  }
523  }
524  }
525  }
526 } catch (const Error &e) {
527  cerr << "\nError: " << e.get_description() << '\n';
528  return 1;
529 }
An indexed database of documents.
Definition: database.h:75
Xapian::rev get_revision() const
Get the revision of the database.
Definition: database.cc:527
ValueIterator valuestream_begin(Xapian::valueno slot) const
Return an iterator over the value in slot slot for each document.
Definition: database.cc:335
Xapian::doccount get_termfreq(std::string_view term) const
Get the number of documents indexed by a specified term.
Definition: database.cc:262
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
Definition: database.cc:256
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: database.cc:302
PostingIterator postlist_begin(std::string_view term) const
Start iterating the postings of a term.
Definition: database.cc:192
bool locked() const
Test if this database is currently locked for writing.
Definition: database.cc:511
TermIterator termlist_begin(Xapian::docid did) const
Start iterating the terms in a document.
Definition: database.cc:200
double get_avlength() const
Old name for get_average_length() for backward compatibility.
Definition: database.h:322
Xapian::termcount get_wdf_upper_bound(std::string_view term) const
Get an upper bound on the wdf of term term.
Definition: database.cc:314
size_t size() const
Return number of shards in this Database object.
Definition: database.cc:105
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
Definition: database.cc:296
void add_database(const Database &other)
Add shards from another Database.
Definition: database.h:109
std::string get_value_lower_bound(Xapian::valueno slot) const
Get a lower bound on the values stored in the given value slot.
Definition: database.cc:290
TermIterator allterms_end(std::string_view={}) const noexcept
End iterator corresponding to allterms_begin(prefix).
Definition: database.h:307
bool has_positions() const
Does this database have any positional information?
Definition: database.cc:215
Xapian::termcount get_collection_freq(std::string_view term) const
Get the total number of occurrences of a specified term.
Definition: database.cc:273
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: database.cc:233
PostingIterator postlist_end(std::string_view) const noexcept
End iterator corresponding to postlist_begin().
Definition: database.h:258
TermIterator termlist_end(Xapian::docid) const noexcept
End iterator corresponding to termlist_begin().
Definition: database.h:271
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: database.cc:239
Xapian::doccount get_value_freq(Xapian::valueno slot) const
Return the frequency of a given value slot.
Definition: database.cc:284
TermIterator allterms_begin(std::string_view prefix={}) const
Start iterating all terms in the database with a given prefix.
Definition: database.cc:209
ValueIterator valuestream_end(Xapian::valueno) const noexcept
Return end iterator corresponding to valuestream_begin().
Definition: database.h:421
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: database.cc:308
Xapian::Document get_document(Xapian::docid did, unsigned flags=0) const
Get a document from the database.
Definition: database.cc:368
std::string get_uuid() const
Get the UUID for the database.
Definition: database.cc:505
Class representing a document.
Definition: document.h:64
std::string get_data() const
Get the document data.
Definition: document.cc:75
ValueIterator values_begin() const
Start iterating the values in this document.
Definition: document.cc:208
std::string get_value(Xapian::valueno slot) const
Read a value slot in this document.
Definition: document.cc:185
ValueIterator values_end() const noexcept
End iterator corresponding to values_begin().
Definition: document.h:259
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:41
const std::string & get_msg() const noexcept
Message giving details of the error, intended for human consumption.
Definition: error.h:111
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
virtual bool skip_to(Xapian::termpos termpos)=0
Skip forward to the specified position.
Class for iterating over term positions.
Class for iterating over a list of terms.
Xapian::termcount get_doclength() const
Return the length of the document at the current position.
Class representing a stemming algorithm.
Definition: stem.h:74
Class for iterating over a list of terms.
Definition: termiterator.h:41
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Class for iterating over document values.
Definition: valueiterator.h:39
Xapian::docid get_docid() const
Return the docid at the current position.
Xapian::valueno get_valueno() const
Return the value slot number for the current position.
#define PACKAGE_STRING
Definition: config.h:361
string term
PositionList * p
Xapian::termpos pos
Append a string to an object description, escaping invalid UTF-8.
Append a string to an object description, escaping invalid UTF-8.
int optind
Definition: getopt.cc:93
char * optarg
Definition: getopt.cc:78
Wrappers to allow GNU getopt to be used cleanly from C++ code.
int gnu_getopt(int argc_, char *const *argv_, const char *shortopts_)
Definition: gnu_getopt.h:89
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
double sortable_unserialise(std::string_view serialised) noexcept
Convert a string encoded using sortable_serialise back to a floating point number.
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
static Xapian::Stem stemmer
Definition: stemtest.cc:42
void description_append(std::string &desc, std::string_view s)
Definition: unittest.cc:105
static int verbose
Definition: xapian-delve.cc:46
static bool showdocdata
Definition: xapian-delve.cc:48
static void show_usage()
Definition: xapian-delve.cc:62
static void show_db_stats(Database &db)
Definition: xapian-delve.cc:95
int main(int argc, char **argv)
#define PROG_NAME
Definition: xapian-delve.cc:59
static enum @9 value_decode
static void decode_and_show_value(const string &value)
static char separator
Definition: xapian-delve.cc:44
static bool showvalues
Definition: xapian-delve.cc:47
static void show_value(Database &db, vector< docid >::const_iterator i, vector< docid >::const_iterator end, Xapian::valueno slot)
#define PROG_DESC
Definition: xapian-delve.cc:60
@ VALUE_ESCAPE
Definition: xapian-delve.cc:53
@ VALUE_RAW
Definition: xapian-delve.cc:56
@ VALUE_PACKED_INT
Definition: xapian-delve.cc:55
@ VALUE_SORTABLE_SERIALISE
Definition: xapian-delve.cc:54
static void show_termlist(const Database &db, Xapian::docid did, const char *all_pfx=NULL)
static void show_values(Database &db, docid docid, char sep)
static void show_docdata(Database &db, docid docid, char sep)
static bool count_zero_length_docs
Definition: xapian-delve.cc:49
static void show_termlists(Database &db, vector< docid >::const_iterator i, vector< docid >::const_iterator end)
Public interfaces for the Xapian library.