xapian-core  1.4.26
xapian-pos.cc
Go to the documentation of this file.
1 
4 /* Copyright 2018-2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include <xapian.h>
25 #include <xapian/iterator.h>
26 
27 #include <iostream>
28 
29 #include <algorithm>
30 #include <cerrno>
31 #include <cstdlib>
32 #include <limits>
33 
34 #include "gnu_getopt.h"
35 #include "stringutils.h"
36 
37 using namespace std;
38 
// Program name, spliced into the usage text and the --help/--version output.
39 #define PROG_NAME "xapian-pos"
// One-line description, printed after the program name by --help.
40 #define PROG_DESC "Debug positional data in a Xapian database"
41 
// Values reported by the option parser for the long-only options --help and
// --version.  They differ from the short option characters 'd', 'e' and 's'
// so main()'s switch can tell them apart.
42 #define OPT_HELP 1
43 #define OPT_VERSION 2
44 
45 static void
47 {
48  cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE\n\n"
49 "Options:\n"
50 " -d, --doc=DOCID show positions for document DOCID\n"
51 " -s, --start=POS specifies the first position to show\n"
52 " -e, --end=POS specifies the last position to show\n"
53 " --help display this help and exit\n"
54 " --version output version information and exit\n";
55 }
56 
57 class Pos {
59 
61 
62  string term;
63 
64  public:
65  Pos(const string& term_, const Xapian::PositionIterator& p_)
66  : p(p_), term(term_) { pos = *p; }
67 
68  Xapian::termpos get_pos() const { return pos; }
69 
70  const string& get_term() const { return term; }
71 
72  bool next() {
73  if (!Xapian::iterator_valid(++p)) {
74  return false;
75  }
76  pos = *p;
77  return true;
78  }
79 };
80 
81 struct PosCmp {
82  bool operator()(const Pos* a, const Pos* b) {
83  if (a->get_pos() != b->get_pos()) {
84  return a->get_pos() > b->get_pos();
85  }
86  return a->get_term() > b->get_term();
87  }
88 };
89 
/** Parse a C string as an unsigned integer of type T.
 *
 *  The text is converted by strtoull() with base 0, so whatever prefixes
 *  strtoull() recognises in that mode are accepted.  The whole string must
 *  be consumed by the conversion.
 *
 *  @param s	    NUL-terminated text to parse.
 *  @param result   Set to the parsed value on success; not modified on
 *		    failure.
 *
 *  @return true on success (errno is then 0).  On failure returns false with
 *	    errno describing the problem: EINVAL for empty input or trailing
 *	    junk, ERANGE for a value which doesn't fit in T (including any
 *	    non-zero negative value), or whatever strtoull() itself set.
 */
template<typename T>
bool to_unsigned_int(const char* s, T& result)
{
    // strtoull() accepts a leading '-' and returns the negated value wrapped
    // to unsigned (so "-18446744073709551615" would come back as 1).  Note
    // whether a minus sign follows the leading whitespace strtoull() skips so
    // such input can be rejected below.
    const char* first = s;
    while (*first == ' ' || (*first >= '\t' && *first <= '\r')) ++first;
    const bool negative = (*first == '-');

    errno = 0;
    char* e;
    auto v = std::strtoull(s, &e, 0);
    if (errno == 0) {
	if (*e || e == s) {
	    // Junk after or empty input.
	    errno = EINVAL;
	} else if (negative && v != 0) {
	    // A negative number can't be represented in an unsigned type.
	    errno = ERANGE;
	} else if (v > std::numeric_limits<T>::max()) {
	    // Exceeds the type.
	    errno = ERANGE;
	} else {
	    result = T(v);
	    return true;
	}
    }
    return false;
}
110 
111 int
112 main(int argc, char **argv)
113 try {
114  static const struct option long_opts[] = {
115  {"doc", required_argument, 0, 'd'},
116  {"start", required_argument, 0, 's'},
117  {"end", required_argument, 0, 'e'},
118  {"help", no_argument, 0, OPT_HELP},
119  {"version", no_argument, 0, OPT_VERSION},
120  {NULL, 0, 0, 0}
121  };
122 
123  Xapian::docid did = 0;
124  Xapian::termpos startpos = 0;
125  Xapian::termpos endpos = numeric_limits<Xapian::termpos>::max();
126  int c;
127  while ((c = gnu_getopt_long(argc, argv, "d:e:s:", long_opts, 0)) != -1) {
128  switch (c) {
129  case 'd':
130  if (!to_unsigned_int(optarg, did) || did == 0) {
131  if (errno == 0) errno = ERANGE;
132  cerr << "Bad docid value '" << optarg << "': "
133  << strerror(errno) << '\n';
134  exit(1);
135  }
136  break;
137  case 's':
138  if (!to_unsigned_int(optarg, startpos)) {
139  cerr << "Bad start position '" << optarg << "': "
140  << strerror(errno) << '\n';
141  exit(1);
142  }
143  break;
144  case 'e':
145  if (!to_unsigned_int(optarg, endpos)) {
146  cerr << "Bad end position '" << optarg << "': "
147  << strerror(errno) << '\n';
148  exit(1);
149  }
150  break;
151  case OPT_HELP:
152  cout << PROG_NAME " - " PROG_DESC "\n\n";
153  show_usage();
154  exit(0);
155  case OPT_VERSION:
156  cout << PROG_NAME " - " PACKAGE_STRING "\n";
157  exit(0);
158  default:
159  show_usage();
160  exit(1);
161  }
162  }
163 
164  // We expect one argument - a database path.
165  if (argc - optind != 1) {
166  show_usage();
167  exit(1);
168  }
169 
170  if (did == 0) {
171  cerr << "--doc=DOCID option required.\n";
172  exit(1);
173  }
174 
175  vector<Pos*> heap;
176 
177  Xapian::Database db(argv[optind]);
178 
179  for (auto term_it = db.termlist_begin(did);
180  term_it != db.termlist_end(did); ++term_it) {
181  const string& term = *term_it;
182  auto pos_it = db.positionlist_begin(did, term);
183  if (startpos) pos_it.skip_to(startpos);
184  if (pos_it != db.positionlist_end(did, term)) {
185  heap.push_back(new Pos(term, pos_it));
186  }
187  }
188 
189  make_heap(heap.begin(), heap.end(), PosCmp());
190 
191  Xapian::termpos old_pos = startpos - 1;
192  while (!heap.empty()) {
193  auto tip = heap.front();
194  Xapian::termpos pos = tip->get_pos();
195  if (pos > endpos) break;
196 
197  switch (pos - old_pos) {
198  case 0:
199  // Another term at the same position.
200  cout << ' ';
201  break;
202  case 1:
203  cout << '\n' << pos << '\t';
204  break;
205  default:
206  cout << "\nGap of " << (pos - old_pos - 1)
207  << " unused positions\n" << pos << '\t';
208  break;
209  }
210  cout << tip->get_term();
211 
212  old_pos = pos;
213 
214  if (tip->next()) {
215  pop_heap(heap.begin(), heap.end(), PosCmp());
216  push_heap(heap.begin(), heap.end(), PosCmp());
217  } else {
218  pop_heap(heap.begin(), heap.end(), PosCmp());
219  heap.resize(heap.size() - 1);
220  }
221  }
222 
223  cout << '\n';
224 } catch (const Xapian::Error & e) {
225  cerr << '\n' << argv[0] << ": " << e.get_description() << '\n';
226  exit(1);
227 }
PositionIterator positionlist_end(Xapian::docid, const std::string &) const
Corresponding end iterator to positionlist_begin().
Definition: database.h:254
Wrappers to allow GNU getopt to be used cleanly from C++ code.
Xapian::termpos pos
Definition: xapian-pos.cc:58
TermIterator termlist_begin(Xapian::docid did) const
An iterator pointing to the start of the termlist for a given document.
Definition: omdatabase.cc:198
int optind
Definition: getopt.cc:94
int main(int argc, char **argv)
Definition: xapian-pos.cc:112
void skip_to(Xapian::termpos termpos)
Advance the iterator to term position termpos.
bool next()
Definition: xapian-pos.cc:72
This class is used to access a database, or a group of databases.
Definition: database.h:68
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:97
#define OPT_HELP
Definition: xapian-pos.cc:42
bool to_unsigned_int(const char *s, T &result)
Definition: xapian-pos.cc:91
PositionIterator positionlist_begin(Xapian::docid did, const std::string &tname) const
An iterator pointing to the start of the position list for a given term in a given document...
Definition: omdatabase.cc:250
STL namespace.
Xapian::termpos get_pos() const
Definition: xapian-pos.cc:68
#define no_argument
Definition: gnu_getopt.h:79
Xapian::PositionIterator p
Definition: xapian-pos.cc:60
#define OPT_VERSION
Definition: xapian-pos.cc:43
string term
Definition: xapian-pos.cc:62
#define PROG_NAME
Definition: xapian-pos.cc:39
Public interfaces for the Xapian library.
#define PROG_DESC
Definition: xapian-pos.cc:40
bool iterator_valid(const Xapian::ESetIterator &it)
Definition: iterator.h:47
bool operator()(const Pos *a, const Pos *b)
Definition: xapian-pos.cc:82
char * optarg
Definition: getopt.cc:79
Class for iterating over term positions.
#define required_argument
Definition: gnu_getopt.h:80
TermIterator termlist_end(Xapian::docid) const
Corresponding end iterator to termlist_begin().
Definition: database.h:240
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
Pos(const string &term_, const Xapian::PositionIterator &p_)
Definition: xapian-pos.cc:65
static void show_usage()
Definition: xapian-pos.cc:46
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Various handy helpers which std::string really should provide.
#define PACKAGE_STRING
Definition: config.h:337
Functions to assist creating language-idiomatic iterator wrappers.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
const string & get_term() const
Definition: xapian-pos.cc:70