xapian-core  2.0.0
xapian-quest.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2026 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian.h>
24 
25 #include <cstdlib>
26 #include <cstring>
27 
28 #include <algorithm>
29 #include <iostream>
30 
31 #include "gnu_getopt.h"
32 #include "stringutils.h"
33 
34 using namespace std;
35 
36 #define PROG_NAME "xapian-quest"
37 #define PROG_DESC "Xapian command line search tool"
38 
// Stopwords: the built-in list handed to Xapian::SimpleStopper in main().
// Entries are kept in alphabetical order, one line per group of ten.
static const char * const sw[] = {
    "a", "about", "an", "and", "are", "as", "at", "be", "by", "en",
    "for", "from", "how", "i", "in", "is", "it", "of", "on", "or",
    "that", "the", "this", "to", "was", "what", "when", "where", "which", "who",
    "why", "will", "with"
};
51 
/// Common string to integer map entry for option decoding.
struct tab_entry {
    /// Option name as typed on the command line.
    const char* s;

    /// Integer value the name maps to.
    unsigned f;

    /// Order an entry against a bare C string by comparing it with the
    /// entry's name, so a table of entries can be binary searched by name.
    bool operator<(const char* s_) const {
        const int order = strcmp(s, s_);
        return order < 0;
    }
};
62 
/** Decode a string to an integer.
 *
 *  @param table  Array of entries exposing a name `s` and a value `f`, and
 *                comparable with a `const char*` via `operator<`.  The
 *                binary search requires the array to be sorted by name.
 *  @param s      The name to look up.
 *
 *  @return The `f` value of the entry whose name equals @a s, or -1 if no
 *          entry has that name.
 */
template<typename T, std::size_t N>
static int
decode(const T (&table)[N], const char* s)
{
    const T* const last = table + N;
    // First entry whose name is not less than s; it is only a hit if the
    // name then matches exactly.
    const T* const candidate = std::lower_bound(table, last, s);
    const bool found = candidate != last && std::strcmp(s, candidate->s) == 0;
    return found ? static_cast<int>(candidate->f) : -1;
}
77 
78 static const tab_entry flag_tab[] = {
79  { "accumulate", Xapian::QueryParser::FLAG_ACCUMULATE },
80  { "auto_multiword_synonyms", Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS },
81  { "auto_synonyms", Xapian::QueryParser::FLAG_AUTO_SYNONYMS },
82  { "boolean", Xapian::QueryParser::FLAG_BOOLEAN },
83  { "boolean_any_case", Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE },
84  { "cjk_ngram", Xapian::QueryParser::FLAG_CJK_NGRAM },
85  { "default", Xapian::QueryParser::FLAG_DEFAULT },
87  { "lovehate", Xapian::QueryParser::FLAG_LOVEHATE },
89  { "no_positions", Xapian::QueryParser::FLAG_NO_POSITIONS },
90  { "no_proper_noun_heuristic", Xapian::QueryParser::FLAG_NO_PROPER_NOUN_HEURISTIC },
91  { "partial", Xapian::QueryParser::FLAG_PARTIAL },
93  { "pure_not", Xapian::QueryParser::FLAG_PURE_NOT },
94  { "spelling_correction", Xapian::QueryParser::FLAG_SPELLING_CORRECTION },
95  { "synonym", Xapian::QueryParser::FLAG_SYNONYM },
96  { "wildcard", Xapian::QueryParser::FLAG_WILDCARD },
97  { "wildcard_glob", Xapian::QueryParser::FLAG_WILDCARD_GLOB },
98  { "wildcard_multi", Xapian::QueryParser::FLAG_WILDCARD_MULTI },
99  { "wildcard_single", Xapian::QueryParser::FLAG_WILDCARD_SINGLE },
100  { "word_breaks", Xapian::QueryParser::FLAG_WORD_BREAKS }
101 };
102 
103 static const tab_entry default_op_tab[] = {
104  { "and", Xapian::Query::OP_AND },
105  { "elite_set", Xapian::Query::OP_ELITE_SET },
106  { "max", Xapian::Query::OP_MAX },
107  { "near", Xapian::Query::OP_NEAR },
108  { "or", Xapian::Query::OP_OR },
109  { "phrase", Xapian::Query::OP_PHRASE },
110  { "synonym", Xapian::Query::OP_SYNONYM }
111 };
112 
113 static const tab_entry stem_strategy_tab[] = {
115  { "all_z", Xapian::QueryParser::STEM_ALL_Z },
116  { "none", Xapian::QueryParser::STEM_NONE },
117  { "some", Xapian::QueryParser::STEM_SOME },
118  { "some_full_pos", Xapian::QueryParser::STEM_SOME_FULL_POS }
119 };
120 
/** The indent written at the start of each wrapped line by print_table()
 *  and print_stemmers().
 */
#define INDENT \
" "

/** Print strings from a string to integer mapping table.
 *
 *  Writes the `s` member of every entry in @a table to stdout as a comma
 *  separated list.  The first entry always starts a fresh line prefixed by
 *  INDENT, and a new line is started whenever the running column count plus
 *  the next name's length would reach 78.
 *
 *  @return '\n', so that a call can be used directly as an operand in a
 *          chain of `<<` insertions and terminates the list it printed.
 */
template<typename T>
static char
print_table(const T& table)
{
    // 256 doubles as "nothing printed yet": it forces the first entry onto a
    // new line and suppresses the separating comma before it.
    const std::size_t unstarted = 256;
    const std::size_t right_margin = 78;

    std::size_t column = unstarted;
    for (const auto& entry : table) {
        const std::size_t width = std::strlen(entry.s);
        if (column < unstarted) std::cout << ',';
        const bool wrap = column + width >= right_margin;
        std::cout << (wrap ? "\n" INDENT : " ");
        if (wrap) column = sizeof(INDENT) - 2;
        std::cout << entry.s;
        // Name plus the ", " separator.
        column += width + 2;
    }
    return '\n';
}
151 
153 static char
155 {
156  size_t pos = 256;
157  const string langs = Xapian::Stem::get_available_languages();
158  size_t p = 0;
159  while (p != string::npos) {
160  size_t space = langs.find(' ', p);
161  size_t len = (space != string::npos) ? space - p : langs.size() - p;
162  if (pos < 256) cout << ',';
163  if (pos + len >= 78) {
164  cout << "\n" INDENT;
165  pos = sizeof(INDENT) - 2;
166  } else {
167  cout << ' ';
168  }
169  cout << string_view(langs.data() + p, len);
170  pos += len + 2;
171  p = space;
172  if (p != string::npos) ++p;
173  }
174  return '\n';
175 }
176 
/** List strings from a string to integer mapping table, one per line.
 *
 *  Writes the `s` member of each entry in @a table to stdout, each followed
 *  by a newline, in table order.
 */
template<typename T>
static void
list_table(const T& table)
{
    for (const auto& entry : table)
        std::cout << entry.s << '\n';
}
189 
190 static void show_usage() {
191  cout << "Usage: " PROG_NAME " [OPTIONS] 'QUERY'\n"
192 "NB: QUERY should be quoted to protect it from the shell.\n\n"
193 "Options:\n"
194 " -d, --db=DIRECTORY database to search (multiple databases may\n"
195 " be specified)\n"
196 " -m, --msize=MSIZE maximum number of matches to return\n"
197 " -c, --check-at-least=HOWMANY minimum number of matches to check\n"
198 " -s, --stemmer=LANG set the stemming language, the default is\n"
199 " 'english' (pass 'none' to disable stemming).\n"
200 " Valid stemmers:"
201 << print_stemmers() <<
202 " -S, --stem-strategy=STRATEGY set the stemming strategy (default: some).\n"
203 " Valid strategies:"
205 " -p, --prefix=PFX:TERMPFX add a prefix\n"
206 " -b, --boolean-prefix=PFX:TERMPFX add a boolean prefix\n"
207 " -f, --flags=FLAG1[,FLAG2]... specify QueryParser flags (default:\n"
208 " default). Valid flags:"
209 << print_table(flag_tab) <<
210 " -o, --default-op=OP specify QueryParser default operator\n"
211 " (default: or). Valid operators:"
213 " -w, --weight=SCHEME specify weighting scheme to use, which\n"
214 " can include parameters, e.g.\n"
215 " --weight='bm25 1 0 0 1 0' (default: bm25).\n"
216 // FIXME: It would be nice to report valid schemes like we used to when we had
217 // a hard-coded list of scheme names here.
218 " -F, --freqs show query term frequencies\n"
219 " -h, --help display this help and exit\n"
220 " -v, --version output version information and exit\n";
221 }
222 
223 int
224 main(int argc, char **argv)
225 try {
226  const char * opts = "d:m:c:s:S:p:b:f:o:w:Fhv";
227  static const struct option long_opts[] = {
228  { "db", required_argument, 0, 'd' },
229  { "msize", required_argument, 0, 'm' },
230  { "check-at-least", required_argument, 0, 'c' },
231  { "stemmer", required_argument, 0, 's' },
232  { "stem-strategy", required_argument, 0, 'S' },
233  { "prefix", required_argument, 0, 'p' },
234  { "boolean-prefix", required_argument, 0, 'b' },
235  { "flags", required_argument, 0, 'f' },
236  { "default-op", required_argument, 0, 'o' },
237  { "weight", required_argument, 0, 'w' },
238  { "freqs", no_argument, 0, 'F' },
239  { "help", no_argument, 0, 'h' },
240  { "version", no_argument, 0, 'v' },
241  { NULL, 0, 0, 0}
242  };
243 
244  Xapian::SimpleStopper mystopper(begin(sw), end(sw));
245  Xapian::Stem stemmer("english");
246  Xapian::doccount msize = 10;
247  Xapian::doccount check_at_least = 0;
248 
249  bool have_database = false;
250 
251  Xapian::Database db;
252  Xapian::QueryParser parser;
253  unsigned flags = 0;
254  bool flags_set = false;
255  bool show_termfreqs = false;
256  const char* weighting_scheme = "bm25";
257 
258  int c;
259  while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
260  switch (c) {
261  case 'm': {
262  char * p;
263  unsigned long v = strtoul(optarg, &p, 10);
264  msize = static_cast<Xapian::doccount>(v);
265  if (*p || v != msize) {
266  cerr << PROG_NAME": Bad value '" << optarg
267  << "' passed for msize\n";
268  exit(1);
269  }
270  break;
271  }
272  case 'c': {
273  char * p;
274  unsigned long v = strtoul(optarg, &p, 10);
275  check_at_least = static_cast<Xapian::doccount>(v);
276  if (*p || v != check_at_least) {
277  cerr << PROG_NAME": Bad value '" << optarg
278  << "' passed for check_at_least\n";
279  exit(1);
280  }
281  break;
282  }
283  case 'd':
285  have_database = true;
286  break;
287  case 's':
288  try {
290  } catch (const Xapian::InvalidArgumentError &) {
291  cerr << "Unknown stemming language '" << optarg << "'.\n"
292  "Available language names are: "
294  exit(1);
295  }
296  break;
297  case 'b': case 'p': {
298  const char * colon = strchr(optarg, ':');
299  if (colon == NULL) {
300  cerr << argv[0] << ": need ':' when setting prefix\n";
301  exit(1);
302  }
303  string prefix(optarg, colon - optarg);
304  string termprefix(colon + 1);
305  if (c == 'b') {
306  parser.add_boolean_prefix(prefix, termprefix);
307  } else {
308  parser.add_prefix(prefix, termprefix);
309  }
310  break;
311  }
312  case 'f':
313  flags_set = true;
314  do {
315  char * comma = strchr(optarg, ',');
316  if (comma)
317  *comma++ = '\0';
318  int flag = decode(flag_tab, optarg);
319  if (flag < 0) {
320  cerr << "Unknown flag '" << optarg << "'\n"
321  "Available flags are:\n";
323  exit(1);
324  }
325  flags |= unsigned(flag);
326  optarg = comma;
327  } while (optarg);
328  break;
329  case 'o': {
330  int op = decode(default_op_tab, optarg);
331  if (op < 0) {
332  cerr << "Unknown operator '" << optarg << "'\n"
333  "Available operators are:\n";
335  exit(1);
336  }
337  parser.set_default_op(static_cast<Xapian::Query::op>(op));
338  break;
339  }
340  case 'S': {
341  int s = decode(stem_strategy_tab, optarg);
342  if (s < 0) {
343  cerr << "Unknown stem strategy '" << optarg << "'\n"
344  "Available stem strategies are:\n";
346  exit(1);
347  }
348  auto strategy =
349  static_cast<Xapian::QueryParser::stem_strategy>(s);
350  parser.set_stemming_strategy(strategy);
351  break;
352  }
353  case 'w':
354  weighting_scheme = optarg;
355  break;
356  case 'F':
357  show_termfreqs = true;
358  break;
359  case 'v':
360  cout << PROG_NAME " - " PACKAGE_STRING "\n";
361  exit(0);
362  case 'h':
363  cout << PROG_NAME " - " PROG_DESC "\n\n";
364  show_usage();
365  exit(0);
366  case ':': // missing parameter
367  case '?': // unknown option
368  show_usage();
369  exit(1);
370  }
371  }
372 
373  if (argc - optind != 1) {
374  show_usage();
375  exit(1);
376  }
377 
378  parser.set_database(db);
379  parser.set_stemmer(stemmer);
380  parser.set_stopper(&mystopper);
381 
382  if (!flags_set) {
384  }
385  Xapian::Query query = parser.parse_query(argv[optind], flags);
386  const string & correction = parser.get_corrected_query_string();
387  if (!correction.empty())
388  cout << "Did you mean: " << correction << "\n\n";
389 
390  cout << "Parsed Query: " << query.get_description() << '\n';
391 
392  if (!have_database) {
393  cout << "No database specified so not running the query.\n";
394  exit(0);
395  }
396 
397  Xapian::Enquire enquire(db);
398  enquire.set_query(query);
399  {
400  const Xapian::Weight* weight = Xapian::Weight::create(weighting_scheme);
401  enquire.set_weighting_scheme(*weight);
402  delete weight;
403  }
404 
405  Xapian::MSet mset = enquire.get_mset(0, msize, check_at_least);
406 
407  if (show_termfreqs) {
408  cout << "Query term frequencies:\n";
409  for (auto t = query.get_terms_begin();
410  t != query.get_terms_end();
411  ++t) {
412  const string& term = *t;
413  cout << " " << mset.get_termfreq(term) << '\t' << term << '\n';
414  }
415  }
416  auto lower_bound = mset.get_matches_lower_bound();
417  auto estimate = mset.get_matches_estimated();
418  auto upper_bound = mset.get_matches_upper_bound();
419  if (lower_bound == upper_bound) {
420  cout << "Exactly " << estimate << " matches\n";
421  } else {
422  cout << "Between " << lower_bound << " and " << upper_bound
423  << " matches, best estimate is " << estimate << '\n';
424  }
425 
426  cout << "MSet:\n";
427  for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
428  Xapian::Document doc = i.get_document();
429  string data = doc.get_data();
430  cout << *i << ": [" << i.get_weight() << "]\n" << data << "\n";
431  }
432  cout << flush;
433 } catch (const Xapian::QueryParserError & e) {
434  cout << "Couldn't parse query: " << e.get_msg() << '\n';
435  exit(1);
436 } catch (const Xapian::Error & err) {
437  cout << err.get_description() << '\n';
438  exit(1);
439 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
An indexed database of documents.
Definition: database.h:75
void add_database(const Database &other)
Add shards from another Database.
Definition: database.h:109
Class representing a document.
Definition: document.h:64
std::string get_data() const
Get the document data.
Definition: document.cc:75
Querying session.
Definition: enquire.h:57
void set_weighting_scheme(const Weight &weight)
Set the weighting scheme to use.
Definition: enquire.cc:85
MSet get_mset(doccount first, doccount maxitems, doccount checkatleast=0, const RSet *rset=NULL, const MatchDecider *mdecider=NULL) const
Run the query.
Definition: enquire.cc:200
void set_query(const Query &query, termcount query_length=0)
Set the query.
Definition: enquire.cc:72
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:41
const std::string & get_msg() const noexcept
Message giving details of the error, intended for human consumption.
Definition: error.h:111
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Iterator over a Xapian::MSet.
Definition: mset.h:535
Class representing a list of search results.
Definition: mset.h:46
Xapian::doccount get_termfreq(std::string_view term) const
Get the termfreq of a term.
Definition: mset.cc:281
Xapian::doccount get_matches_upper_bound() const
Upper bound on the total number of matching documents.
Definition: mset.cc:334
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:786
Xapian::doccount get_matches_lower_bound() const
Lower bound on the total number of matching documents.
Definition: mset.cc:318
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:791
Xapian::doccount get_matches_estimated() const
Estimate of the total number of matching documents.
Definition: mset.cc:324
Indicates a query string can't be parsed.
Definition: error.h:875
Build a Xapian::Query object from a user query string.
Definition: queryparser.h:516
void set_database(const Database &db)
Specify the database being searched.
Definition: queryparser.cc:138
void set_stemmer(const Xapian::Stem &stemmer)
Set the stemmer.
Definition: queryparser.cc:75
void set_stemming_strategy(stem_strategy strategy)
Set the stemming strategy.
Definition: queryparser.cc:81
void add_boolean_prefix(std::string_view field, std::string_view prefix, const std::string *grouping=NULL)
Add a boolean term prefix allowing the user to restrict a search with a boolean filter specified in the free text query.
Definition: queryparser.cc:212
void add_prefix(std::string_view field, std::string_view prefix)
Add a free-text field term prefix.
Definition: queryparser.cc:200
Query parse_query(std::string_view query_string, unsigned flags=FLAG_DEFAULT, std::string_view default_prefix={})
Parse a query.
Definition: queryparser.cc:174
void set_default_op(Query::op default_op)
Set the default operator.
Definition: queryparser.cc:99
void set_stopper(const Stopper *stop=NULL)
Set the stopper.
Definition: queryparser.cc:87
std::string get_corrected_query_string() const
Get the spelling-corrected query string.
Definition: queryparser.cc:254
stem_strategy
Stemming strategies, for use with set_stemming_strategy().
Definition: queryparser.h:788
@ FLAG_LOVEHATE
Support + and -.
Definition: queryparser.h:530
@ FLAG_AUTO_SYNONYMS
Enable automatic use of synonyms for single terms.
Definition: queryparser.h:604
@ FLAG_AUTO_MULTIWORD_SYNONYMS
Enable automatic use of synonyms for single terms and groups of terms.
Definition: queryparser.h:611
@ FLAG_NGRAMS
Generate n-grams for scripts without explicit word breaks.
Definition: queryparser.h:635
@ FLAG_ACCUMULATE
Accumulate unstem and stoplist results.
Definition: queryparser.h:726
@ FLAG_CJK_NGRAM
Generate n-grams for scripts without explicit word breaks.
Definition: queryparser.h:644
@ FLAG_DEFAULT
The default flags.
Definition: queryparser.h:784
@ FLAG_WILDCARD_GLOB
Enable glob-style wildcarding.
Definition: queryparser.h:699
@ FLAG_NO_PROPER_NOUN_HEURISTIC
Turn off special handling of capitalised words.
Definition: queryparser.h:775
@ FLAG_FUZZY
Support fuzzy matching.
Definition: queryparser.h:711
@ FLAG_WORD_BREAKS
Find word breaks for text in scripts without explicit word breaks.
Definition: queryparser.h:658
@ FLAG_BOOLEAN_ANY_CASE
Support AND, OR, etc even if they aren't in ALLCAPS.
Definition: queryparser.h:532
@ FLAG_WILDCARD
Support wildcards.
Definition: queryparser.h:549
@ FLAG_SYNONYM
Enable synonym operator '~'.
Definition: queryparser.h:598
@ FLAG_SPELLING_CORRECTION
Enable spelling correction.
Definition: queryparser.h:592
@ FLAG_WILDCARD_MULTI
Support extended wildcard '*'.
Definition: queryparser.h:673
@ FLAG_WILDCARD_SINGLE
Support extended wildcard '?'.
Definition: queryparser.h:688
@ FLAG_PURE_NOT
Allow queries such as 'NOT apples'.
Definition: queryparser.h:556
@ FLAG_NO_POSITIONS
Produce a query which doesn't use positional information.
Definition: queryparser.h:737
@ FLAG_PHRASE
Support quoted phrases.
Definition: queryparser.h:528
@ FLAG_PARTIAL
Enable partial matching.
Definition: queryparser.h:577
@ FLAG_BOOLEAN
Support AND, OR, etc and bracketed subexpressions.
Definition: queryparser.h:526
Class representing a query.
Definition: query.h:45
const TermIterator get_terms_begin() const
Begin iterator for terms in the query object.
Definition: query.cc:198
std::string get_description() const
Return a string describing this object.
Definition: query.cc:307
const TermIterator get_terms_end() const noexcept
End iterator for terms in the query object.
Definition: query.h:639
op
Query operators.
Definition: query.h:78
@ OP_MAX
Pick the maximum weight of any subquery.
Definition: query.h:249
@ OP_NEAR
Match only documents where all subqueries match near each other.
Definition: query.h:140
@ OP_ELITE_SET
Pick the best N subqueries and combine with OP_OR.
Definition: query.h:215
@ OP_AND
Match only documents which all subqueries match.
Definition: query.h:84
@ OP_OR
Match documents which at least one subquery matches.
Definition: query.h:92
@ OP_PHRASE
Match only documents where all subqueries match near and in order.
Definition: query.h:152
@ OP_SYNONYM
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
Simple implementation of Stopper class - this will suit most users.
Definition: queryparser.h:99
Class representing a stemming algorithm.
Definition: stem.h:74
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:208
Abstract base class for weighting schemes.
Definition: weight.h:38
static const Weight * create(const std::string &scheme, const Registry &reg=Registry())
Return the appropriate weighting scheme object.
Definition: weight.cc:225
#define PACKAGE_STRING
Definition: config.h:361
string term
PositionList * p
Xapian::termpos pos
int optind
Definition: getopt.cc:93
char * optarg
Definition: getopt.cc:78
Wrappers to allow GNU getopt to be used cleanly from C++ code.
#define no_argument
Definition: gnu_getopt.h:78
#define required_argument
Definition: gnu_getopt.h:79
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:96
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
static Xapian::Stem stemmer
Definition: stemtest.cc:42
Various handy string-related helpers.
int * flag
Definition: gnu_getopt.h:74
Common string to integer map entry for option decoding.
Definition: xapian-quest.cc:53
const char * s
Definition: xapian-quest.cc:54
unsigned f
Definition: xapian-quest.cc:56
bool operator<(const char *s_) const
Definition: xapian-quest.cc:58
static const char * opts
static const struct option long_opts[]
static void list_table(const T &table)
List strings from a string to integer mapping table, one per line.
static char print_stemmers()
Print available stemmers, line wrapped.
static void show_usage()
static char print_table(const T &table)
Print strings from a string to integer mapping table.
int main(int argc, char **argv)
#define PROG_NAME
Definition: xapian-quest.cc:36
static const char *const sw[]
Definition: xapian-quest.cc:40
#define INDENT
The number of spaces to indent by in print_table.
static const tab_entry flag_tab[]
Definition: xapian-quest.cc:78
static int decode(const T(&table)[N], const char *s)
Decode a string to an integer.
Definition: xapian-quest.cc:70
#define PROG_DESC
Definition: xapian-quest.cc:37
static const tab_entry stem_strategy_tab[]
static const tab_entry default_op_tab[]
Public interfaces for the Xapian library.