xapian-core  2.0.0
xapian-pos.cc
Go to the documentation of this file.
1 
4 /* Copyright 2018-2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian.h>
24 #include <xapian/iterator.h>
25 
26 #include <iostream>
27 
28 #include <cerrno>
29 #include <cstdlib>
30 #include <limits>
31 
32 #include "gnu_getopt.h"
33 #include "heap.h"
34 #include "parseint.h"
35 #include "stringutils.h"
36 
37 using namespace std;
38 
39 #define PROG_NAME "xapian-pos"
40 #define PROG_DESC "Debug positional data in a Xapian database"
41 
42 #define OPT_HELP 1
43 #define OPT_VERSION 2
44 
45 static void
47 {
48  cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE\n\n"
49 "Options:\n"
50 " -d, --doc=DOCID show positions for document DOCID\n"
51 " -s, --start=POS specifies the first position to show\n"
52 " -e, --end=POS specifies the last position to show\n"
53 " -r, --reconstruct[=PREFIX] reconstruct text for prefix PREFIX\n"
54 " --help display this help and exit\n"
55 " --version output version information and exit\n";
56 }
57 
58 class Pos {
60 
62 
63  string term;
64 
65  public:
66  Pos(const string& term_, const Xapian::PositionIterator& p_)
67  : p(p_), term(term_) { pos = *p; }
68 
69  Xapian::termpos get_pos() const { return pos; }
70 
71  const string& get_term() const { return term; }
72 
73  bool next() {
74  if (!Xapian::iterator_valid(++p)) {
75  return false;
76  }
77  pos = *p;
78  return true;
79  }
80 };
81 
82 struct PosCmp {
83  bool operator()(const Pos* a, const Pos* b) {
84  if (a->get_pos() != b->get_pos()) {
85  return a->get_pos() > b->get_pos();
86  }
87  return a->get_term() > b->get_term();
88  }
89 };
90 
91 int
92 main(int argc, char **argv)
93 try {
94  static const struct option long_opts[] = {
95  {"doc", required_argument, 0, 'd'},
96  {"start", required_argument, 0, 's'},
97  {"end", required_argument, 0, 'e'},
98  {"reconstruct", optional_argument, 0, 'r'},
99  {"help", no_argument, 0, OPT_HELP},
100  {"version", no_argument, 0, OPT_VERSION},
101  {NULL, 0, 0, 0}
102  };
103 
104  Xapian::docid did = 0;
105  Xapian::termpos startpos = 0;
106  Xapian::termpos endpos = numeric_limits<Xapian::termpos>::max();
107  bool reconstruct = false;
108  string reconstruct_prefix;
109  int c;
110  while ((c = gnu_getopt_long(argc, argv, "d:e:s:r::", long_opts, 0)) != -1) {
111  switch (c) {
112  case 'd':
113  if (!parse_unsigned(optarg, did) || did == 0) {
114  cerr << "Bad docid value '" << optarg << "'\n";
115  exit(1);
116  }
117  break;
118  case 's':
119  if (!parse_unsigned(optarg, startpos)) {
120  cerr << "Bad start position '" << optarg << "'\n";
121  exit(1);
122  }
123  break;
124  case 'e':
125  if (!parse_unsigned(optarg, endpos)) {
126  cerr << "Bad end position '" << optarg << "'\n";
127  exit(1);
128  }
129  break;
130  case 'r':
131  reconstruct = true;
132  if (optarg) {
133  reconstruct_prefix = optarg;
134  }
135  break;
136  case OPT_HELP:
137  cout << PROG_NAME " - " PROG_DESC "\n\n";
138  show_usage();
139  exit(0);
140  case OPT_VERSION:
141  cout << PROG_NAME " - " PACKAGE_STRING "\n";
142  exit(0);
143  default:
144  show_usage();
145  exit(1);
146  }
147  }
148 
149  // We expect one argument - a database path.
150  if (argc - optind != 1) {
151  show_usage();
152  exit(1);
153  }
154 
155  if (did == 0) {
156  cerr << "--doc=DOCID option required.\n";
157  exit(1);
158  }
159 
160  Xapian::Database db(argv[optind]);
161 
162  if (reconstruct) {
163  cout << db.reconstruct_text(did, 0, reconstruct_prefix,
164  startpos, endpos)
165  << '\n';
166  exit(0);
167  }
168 
169  vector<Pos*> heap;
170 
171  for (auto term_it = db.termlist_begin(did);
172  term_it != db.termlist_end(did); ++term_it) {
173  const string& term = *term_it;
174  auto pos_it = db.positionlist_begin(did, term);
175  if (startpos) pos_it.skip_to(startpos);
176  if (pos_it != db.positionlist_end(did, term)) {
177  heap.push_back(new Pos(term, pos_it));
178  }
179  }
180 
181  Heap::make(heap.begin(), heap.end(), PosCmp());
182 
183  Xapian::termpos old_pos = startpos - 1;
184  while (!heap.empty()) {
185  auto tip = heap.front();
186  Xapian::termpos pos = tip->get_pos();
187  if (pos > endpos) break;
188 
189  switch (pos - old_pos) {
190  case 0:
191  // Another term at the same position.
192  cout << ' ';
193  break;
194  case 1:
195  cout << '\n' << pos << '\t';
196  break;
197  default:
198  cout << "\nGap of " << (pos - old_pos - 1)
199  << " unused positions\n" << pos << '\t';
200  break;
201  }
202  cout << tip->get_term();
203 
204  old_pos = pos;
205 
206  if (tip->next()) {
207  Heap::replace(heap.begin(), heap.end(), PosCmp());
208  } else {
209  Heap::pop(heap.begin(), heap.end(), PosCmp());
210  heap.resize(heap.size() - 1);
211  }
212  }
213 
214  cout << '\n';
215 } catch (const Xapian::Error & e) {
216  cerr << '\n' << argv[0] << ": " << e.get_description() << '\n';
217  exit(1);
218 }
Pos(const string &term_, const Xapian::PositionIterator &p_)
Definition: xapian-pos.cc:66
Xapian::PositionIterator p
Definition: xapian-pos.cc:61
const string & get_term() const
Definition: xapian-pos.cc:71
bool next()
Definition: xapian-pos.cc:73
string term
Definition: xapian-pos.cc:63
Xapian::termpos get_pos() const
Definition: xapian-pos.cc:69
Xapian::termpos pos
Definition: xapian-pos.cc:59
An indexed database of documents.
Definition: database.h:75
PositionIterator positionlist_end(Xapian::docid, std::string_view) const noexcept
End iterator corresponding to positionlist_begin().
Definition: database.h:292
std::string reconstruct_text(Xapian::docid did, size_t length=0, std::string_view prefix={}, Xapian::termpos start_pos=0, Xapian::termpos end_pos=0) const
Reconstruct document text.
Definition: database.cc:533
TermIterator termlist_begin(Xapian::docid did) const
Start iterating the terms in a document.
Definition: database.cc:200
PositionIterator positionlist_begin(Xapian::docid did, std::string_view term) const
Start iterating positions for a term in a document.
Definition: database.cc:221
TermIterator termlist_end(Xapian::docid) const noexcept
End iterator corresponding to termlist_begin().
Definition: database.h:271
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:41
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
Class for iterating over term positions.
void skip_to(Xapian::termpos termpos)
Advance the iterator to term position termpos.
#define PACKAGE_STRING
Definition: config.h:361
string term
PositionList * p
Xapian::termpos pos
int optind
Definition: getopt.cc:93
char * optarg
Definition: getopt.cc:78
Wrappers to allow GNU getopt to be used cleanly from C++ code.
#define no_argument
Definition: gnu_getopt.h:78
#define required_argument
Definition: gnu_getopt.h:79
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:96
#define optional_argument
Definition: gnu_getopt.h:80
C++ STL heap implementation with extensions.
Functions to assist creating language-idiomatic iterator wrappers.
void pop(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:213
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
bool iterator_valid(const Xapian::ESetIterator &it)
Definition: iterator.h:46
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Parse signed and unsigned type from string and check for trailing characters.
bool parse_unsigned(const char *p, T &res)
Definition: parseint.h:29
Various handy string-related helpers.
bool operator()(const Pos *a, const Pos *b)
Definition: xapian-pos.cc:83
static void show_usage()
Definition: xapian-pos.cc:46
#define OPT_VERSION
Definition: xapian-pos.cc:43
int main(int argc, char **argv)
Definition: xapian-pos.cc:92
#define PROG_NAME
Definition: xapian-pos.cc:39
#define PROG_DESC
Definition: xapian-pos.cc:40
#define OPT_HELP
Definition: xapian-pos.cc:42
static const struct option long_opts[]
Public interfaces for the Xapian library.