xapian-core  1.4.19
quest.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2021 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include <xapian.h>
25 
26 #include <cstdlib>
27 #include <cstring>
28 
29 #include <algorithm>
30 #include <iostream>
31 
32 #include "gnu_getopt.h"
33 
34 using namespace std;
35 
36 #define PROG_NAME "quest"
37 #define PROG_DESC "Xapian command line search tool"
38 
39 // Stopwords:
// main() passes this array to the Xapian::SimpleStopper constructor and
// installs the resulting stopper on the QueryParser with set_stopper().
40 static const char * const sw[] = {
41  "a", "about", "an", "and", "are", "as", "at",
42  "be", "by",
43  "en",
44  "for", "from",
45  "how",
46  "i", "in", "is", "it",
47  "of", "on", "or",
48  "that", "the", "this", "to",
49  "was", "what", "when", "where", "which", "who", "why", "will", "with"
50 };
51 
// Pairs a flag name accepted by --flags (s) with the corresponding
// Xapian::QueryParser flag value (f).
52 struct qp_flag { const char * s; unsigned f; };
// Table of known QueryParser flags.  decode_qp_flag() searches it with
// lower_bound() using the operator< for qp_flag, so the entries need to stay
// in strcmp() order of their names.  show_usage() lists the names.
// NOTE(review): the embedded numbering skips 64 and 68, so entries may be
// missing from this listing - confirm against the original file.
53 static const qp_flag flag_tab[] = {
54  { "accumulate", Xapian::QueryParser::FLAG_ACCUMULATE },
55  { "auto_multiword_synonyms", Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS },
56  { "auto_synonyms", Xapian::QueryParser::FLAG_AUTO_SYNONYMS },
57  { "boolean", Xapian::QueryParser::FLAG_BOOLEAN },
58  { "boolean_any_case", Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE },
59  { "cjk_ngram", Xapian::QueryParser::FLAG_CJK_NGRAM },
60  { "default", Xapian::QueryParser::FLAG_DEFAULT },
61  { "lovehate", Xapian::QueryParser::FLAG_LOVEHATE },
62  { "no_positions", Xapian::QueryParser::FLAG_NO_POSITIONS },
63  { "partial", Xapian::QueryParser::FLAG_PARTIAL },
65  { "pure_not", Xapian::QueryParser::FLAG_PURE_NOT },
66  { "spelling_correction", Xapian::QueryParser::FLAG_SPELLING_CORRECTION },
67  { "synonym", Xapian::QueryParser::FLAG_SYNONYM },
69 };
// Number of entries in flag_tab.
70 const int n_flag_tab = sizeof(flag_tab) / sizeof(flag_tab[0]);
71 
72 static inline bool operator<(const qp_flag& f1, const qp_flag& f2) {
73  return strcmp(f1.s, f2.s) < 0;
74 }
75 
// Pairs an operator name accepted by --default-op (s) with the corresponding
// Xapian::Query operator value (f); main() casts the decoded value to
// Xapian::Query::op.
76 struct qp_op { const char * s; unsigned f; };
// Table of known default operators.  decode_qp_op() searches it with
// lower_bound() using the operator< for qp_op, so the entries need to stay in
// strcmp() order of their names.  show_usage() lists the names.
77 static const qp_op op_tab[] = {
78  { "and", Xapian::Query::OP_AND },
79  { "elite_set", Xapian::Query::OP_ELITE_SET },
80  { "max", Xapian::Query::OP_MAX },
81  { "near", Xapian::Query::OP_NEAR },
82  { "or", Xapian::Query::OP_OR },
83  { "phrase", Xapian::Query::OP_PHRASE },
84  { "synonym", Xapian::Query::OP_SYNONYM }
85 };
// Number of entries in op_tab.
86 const int n_op_tab = sizeof(op_tab) / sizeof(op_tab[0]);
87 
88 static inline bool operator<(const qp_op& f1, const qp_op& f2) {
89  return strcmp(f1.s, f2.s) < 0;
90 }
91 
// NOTE(review): the enumerators (embedded lines 93-107) are absent from this
// listing.  Presumably the WEIGHT_* constants used by wt_tab and by the
// weighting scheme switch in main() are declared here - confirm against the
// original file.
92 enum {
108 };
109 
// Pairs a weighting scheme name accepted by --weight (s) with one of the
// WEIGHT_* constants (f).
110 struct wt { const char * s; int f; };
// Table of known weighting schemes.  decode_wt() searches it with
// lower_bound() using the operator< for wt, so the entries need to stay in
// strcmp() order of their names.  show_usage() lists the names.
111 static const wt wt_tab[] = {
112  { "bb2", WEIGHT_BB2 },
113  { "bm25", WEIGHT_BM25 },
114  { "bm25+", WEIGHT_BM25PLUS },
115  { "bool", WEIGHT_BOOL },
116  { "coord", WEIGHT_COORD },
117  { "dlh", WEIGHT_DLH },
118  { "dph", WEIGHT_DPH },
119  { "ifb2", WEIGHT_IFB2 },
120  { "ineb2", WEIGHT_INEB2 },
121  { "inl2", WEIGHT_INL2 },
122  { "lm", WEIGHT_LM },
123  { "pl2", WEIGHT_PL2 },
124  { "pl2+", WEIGHT_PL2PLUS },
125  { "tfidf", WEIGHT_TFIDF },
126  { "trad", WEIGHT_TRAD }
127 };
// Number of entries in wt_tab.
128 const int n_wt_tab = sizeof(wt_tab) / sizeof(wt_tab[0]);
129 
130 static inline bool operator<(const wt& f1, const wt& f2) {
131  return strcmp(f1.s, f2.s) < 0;
132 }
133 
134 static void show_usage() {
135  cout << "Usage: " PROG_NAME " [OPTIONS] 'QUERY'\n"
136 "NB: QUERY should be quoted to protect it from the shell.\n\n"
137 "Options:\n"
138 " -d, --db=DIRECTORY database to search (multiple databases may\n"
139 " be specified)\n"
140 " -m, --msize=MSIZE maximum number of matches to return\n"
141 " -c, --check-at-least=HOWMANY minimum number of matches to check\n"
142 " -s, --stemmer=LANG set the stemming language, the default is\n"
143 " 'english' (pass 'none' to disable stemming)\n"
144 " -p, --prefix=PFX:TERMPFX add a prefix\n"
145 " -b, --boolean-prefix=PFX:TERMPFX add a boolean prefix\n"
146 " -f, --flags=FLAG1[,FLAG2]... specify QueryParser flags (default:\n"
147 " default). Valid flags:";
148 #define INDENT \
149 " "
150  int pos = 256;
151  for (const qp_flag * i = flag_tab; i - flag_tab < n_flag_tab; ++i) {
152  size_t len = strlen(i->s);
153  if (pos < 256) cout << ',';
154  if (pos + len >= 78) {
155  cout << "\n" INDENT;
156  pos = sizeof(INDENT) - 2;
157  } else {
158  cout << ' ';
159  }
160  cout << i->s;
161  pos += len + 2;
162  }
163  cout << "\n"
164 " -o, --default-op=OP specify QueryParser default operator\n"
165 " (default: or). Valid operators:";
166  pos = 256;
167  for (const qp_op * i = op_tab; i - op_tab < n_op_tab; ++i) {
168  size_t len = strlen(i->s);
169  if (pos < 256) cout << ',';
170  if (pos + len >= 78) {
171  cout << "\n" INDENT;
172  pos = sizeof(INDENT) - 2;
173  } else {
174  cout << ' ';
175  }
176  cout << i->s;
177  pos += len + 2;
178  }
179  cout << "\n"
180 " -w, --weight=SCHEME specify weighting scheme to use\n"
181 " (default: bm25). Valid schemes:";
182  pos = 256;
183  for (const wt * i = wt_tab; i - wt_tab < n_wt_tab; ++i) {
184  size_t len = strlen(i->s);
185  if (pos < 256) cout << ',';
186  if (pos + len >= 78) {
187  cout << "\n" INDENT;
188  pos = sizeof(INDENT) - 2;
189  } else {
190  cout << ' ';
191  }
192  cout << i->s;
193  pos += len + 2;
194  }
195  cout << "\n"
196 " -h, --help display this help and exit\n"
197 " -v, --version output version information and exit\n";
198 }
199 
200 static unsigned
201 decode_qp_flag(const char * s)
202 {
203  qp_flag f;
204  f.s = s;
205  const qp_flag * p = lower_bound(flag_tab, flag_tab + n_flag_tab, f);
206  if (p == flag_tab + n_flag_tab || f < *p)
207  return 0;
208  return p->f;
209 }
210 
211 static int
212 decode_qp_op(const char * s)
213 {
214  qp_op f;
215  f.s = s;
216  const qp_op * p = lower_bound(op_tab, op_tab + n_op_tab, f);
217  if (p == op_tab + n_op_tab || f < *p)
218  return -1;
219  return p->f;
220 }
221 
222 static int
223 decode_wt(const char * s)
224 {
225  wt f;
226  f.s = s;
227  const wt * p = lower_bound(wt_tab, wt_tab + n_wt_tab, f);
228  if (p == wt_tab + n_wt_tab || f < *p)
229  return -1;
230  return p->f;
231 }
232 
// Entry point for quest.
//
// Processes the command line options, requires exactly one remaining argument
// (the query string), parses it with Xapian::QueryParser and prints the parsed
// query.  If a database was specified, the query is then run through
// Xapian::Enquire and the match count bounds and each match are printed.
//
// The body is a function-try-block: Xapian::QueryParserError and any other
// Xapian::Error are reported on stdout and end the program with exit(1).
//
// NOTE(review): the embedded numbering in this listing skips lines inside this
// function (291, 300, 372, 376 and one line in each case of the weighting
// scheme switch), so some statements are not visible here - confirm against
// the original file.
233 int
234 main(int argc, char **argv)
235 try {
236  const char * opts = "d:m:c:s:p:b:f:o:w:hv";
237  static const struct option long_opts[] = {
238  { "db", required_argument, 0, 'd' },
239  { "msize", required_argument, 0, 'm' },
240  { "check-at-least", required_argument, 0, 'c' },
241  { "stemmer", required_argument, 0, 's' },
242  { "prefix", required_argument, 0, 'p' },
243  { "boolean-prefix", required_argument, 0, 'b' },
244  { "flags", required_argument, 0, 'f' },
245  { "default-op", required_argument, 0, 'o' },
246  { "weight", required_argument, 0, 'w' },
247  { "help", no_argument, 0, 'h' },
248  { "version", no_argument, 0, 'v' },
249  { NULL, 0, 0, 0}
250  };
251 
 // Defaults which the options below may override.
252  Xapian::SimpleStopper mystopper(sw, sw + sizeof(sw) / sizeof(sw[0]));
253  Xapian::Stem stemmer("english");
254  Xapian::doccount msize = 10;
255  Xapian::doccount check_at_least = 0;
256 
257  bool have_database = false;
258 
259  Xapian::Database db;
260  Xapian::QueryParser parser;
261  unsigned flags = 0;
262  bool flags_set = false;
 // -1 means no weighting scheme was requested with --weight.
263  int weight = -1;
264 
265  int c;
 // Handle options one at a time until gnu_getopt_long() returns -1.
266  while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
267  switch (c) {
268  case 'm': {
269  char * p;
270  unsigned long v = strtoul(optarg, &p, 10);
271  msize = static_cast<Xapian::doccount>(v);
 // Reject trailing non-numeric characters, and values which changed
 // when converted to Xapian::doccount.
272  if (*p || v != msize) {
273  cerr << PROG_NAME": Bad value '" << optarg
274  << "' passed for msize" << endl;
275  exit(1);
276  }
277  break;
278  }
279  case 'c': {
280  char * p;
281  unsigned long v = strtoul(optarg, &p, 10);
282  check_at_least = static_cast<Xapian::doccount>(v);
 // Same validation as for msize above.
283  if (*p || v != check_at_least) {
284  cerr << PROG_NAME": Bad value '" << optarg
285  << "' passed for check_at_least" << endl;
286  exit(1);
287  }
288  break;
289  }
290  case 'd':
 // NOTE(review): embedded line 291 is missing from this listing;
 // presumably it adds the database named by optarg to db - confirm.
292  have_database = true;
293  break;
294  case 's':
295  try {
296  stemmer = Xapian::Stem(optarg);
297  } catch (const Xapian::InvalidArgumentError &) {
 // NOTE(review): embedded line 300 (the end of this output statement)
 // is missing from this listing.
298  cerr << "Unknown stemming language '" << optarg << "'.\n"
299  "Available language names are: "
301  exit(1);
302  }
303  break;
304  case 'b': case 'p': {
 // The argument has the form PFX:TERMPFX; split it at the first ':'.
305  const char * colon = strchr(optarg, ':');
306  if (colon == NULL) {
307  cerr << argv[0] << ": need ':' when setting prefix" << endl;
308  exit(1);
309  }
310  string prefix(optarg, colon - optarg);
311  string termprefix(colon + 1);
312  if (c == 'b') {
313  parser.add_boolean_prefix(prefix, termprefix);
314  } else {
315  parser.add_prefix(prefix, termprefix);
316  }
317  break;
318  }
319  case 'f':
320  flags_set = true;
 // Split optarg in place at each ',' and OR together the decoded
 // flags; decode_qp_flag() returns 0 for a name it doesn't know.
321  do {
322  char * comma = strchr(optarg, ',');
323  if (comma)
324  *comma++ = '\0';
325  unsigned flag = decode_qp_flag(optarg);
326  if (flag == 0) {
327  cerr << "Unknown flag '" << optarg << "'" << endl;
328  exit(1);
329  }
330  flags |= flag;
 // Continue after the comma, or stop (NULL) if there wasn't one.
331  optarg = comma;
332  } while (optarg);
333  break;
334  case 'o': {
335  int op = decode_qp_op(optarg);
336  if (op < 0) {
337  cerr << "Unknown op '" << optarg << "'" << endl;
338  exit(1);
339  }
340  parser.set_default_op(static_cast<Xapian::Query::op>(op));
341  break;
342  }
343  case 'w': {
344  weight = decode_wt(optarg);
345  if (weight < 0) {
346  cerr << "Unknown weighting scheme '" << optarg << "'" << endl;
347  exit(1);
348  }
349  break;
350  }
351  case 'v':
352  cout << PROG_NAME " - " PACKAGE_STRING << endl;
353  exit(0);
354  case 'h':
355  cout << PROG_NAME " - " PROG_DESC "\n\n";
356  show_usage();
357  exit(0);
358  case ':': // missing parameter
359  case '?': // unknown option
360  show_usage();
361  exit(1);
362  }
363  }
364 
 // Exactly one non-option argument (the query) must remain.
365  if (argc - optind != 1) {
366  show_usage();
367  exit(1);
368  }
369 
370  parser.set_database(db);
371  parser.set_stemmer(stemmer);
373  parser.set_stopper(&mystopper);
374 
 // NOTE(review): the body of this if (embedded line 376) is missing from
 // this listing; presumably it selects the default flags when --flags
 // wasn't given - confirm.
375  if (!flags_set) {
377  }
378  Xapian::Query query = parser.parse_query(argv[optind], flags);
379  const string & correction = parser.get_corrected_query_string();
380  if (!correction.empty())
381  cout << "Did you mean: " << correction << "\n\n";
382 
383  cout << "Parsed Query: " << query.get_description() << endl;
384 
 // Without a database only the parsed query is shown.
385  if (!have_database) {
386  cout << "No database specified so not running the query." << endl;
387  exit(0);
388  }
389 
390  Xapian::Enquire enquire(db);
391  enquire.set_query(query);
392 
 // NOTE(review): the statement in each case below is missing from this
 // listing; presumably each sets the matching weighting scheme on enquire
 // - confirm.  No case matches when weight is still -1.
393  switch (weight) {
394  case WEIGHT_BB2:
396  break;
397  case WEIGHT_BOOL:
399  break;
400  case WEIGHT_COORD:
402  break;
403  case WEIGHT_BM25:
405  break;
406  case WEIGHT_BM25PLUS:
408  break;
409  case WEIGHT_DLH:
411  break;
412  case WEIGHT_DPH:
414  break;
415  case WEIGHT_IFB2:
417  break;
418  case WEIGHT_INEB2:
420  break;
421  case WEIGHT_INL2:
423  break;
424  case WEIGHT_LM:
426  break;
427  case WEIGHT_PL2:
429  break;
430  case WEIGHT_PL2PLUS:
432  break;
433  case WEIGHT_TFIDF:
435  break;
436  case WEIGHT_TRAD:
438  break;
439  }
440 
 // Fetch up to msize matches starting from the first.
441  Xapian::MSet mset = enquire.get_mset(0, msize, check_at_least);
442 
443  auto lower_bound = mset.get_matches_lower_bound();
444  auto estimate = mset.get_matches_estimated();
445  auto upper_bound = mset.get_matches_upper_bound();
 // Equal bounds mean the number of matches is known exactly.
446  if (lower_bound == upper_bound) {
447  cout << "Exactly " << estimate << " matches" << endl;
448  } else {
449  cout << "Between " << lower_bound << " and " << upper_bound
450  << " matches, best estimate is " << estimate << endl;
451  }
452 
 // Print each match as "<*i>: [<weight>]" followed by the document data.
453  cout << "MSet:" << endl;
454  for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
455  Xapian::Document doc = i.get_document();
456  string data = doc.get_data();
457  cout << *i << ": [" << i.get_weight() << "]\n" << data << "\n";
458  }
459  cout << flush;
460 } catch (const Xapian::QueryParserError & e) {
461  cout << "Couldn't parse query: " << e.get_msg() << endl;
462  exit(1);
463 } catch (const Xapian::Error & err) {
464  cout << err.get_description() << endl;
465  exit(1);
466 }
Support AND, OR, etc even if they aren't in ALLCAPS.
Definition: queryparser.h:794
const int n_wt_tab
Definition: quest.cc:128
static int decode_wt(const char *s)
Definition: quest.cc:223
static const char *const sw[]
Definition: quest.cc:40
Wrappers to allow GNU getopt to be used cleanly from C++ code.
static void show_usage()
Definition: quest.cc:134
unsigned f
Definition: quest.cc:76
Simple implementation of Stopper class - this will suit most users.
Definition: queryparser.h:100
int optind
Definition: getopt.cc:94
void set_default_op(Query::op default_op)
Set the default operator.
Definition: queryparser.cc:102
This class is used to access a database, or a group of databases.
Definition: database.h:68
static const wt wt_tab[]
Definition: quest.cc:111
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:97
static int decode_qp_op(const char *s)
Definition: quest.cc:212
int main(int argc, char **argv)
Definition: quest.cc:234
Class representing a stemming algorithm.
Definition: stem.h:62
double weight
The weight of a document or term.
Definition: types.h:122
void set_stopper(const Stopper *stop=NULL)
Set the stopper.
Definition: queryparser.cc:96
unsigned f
Definition: quest.cc:52
static unsigned decode_qp_flag(const char *s)
Definition: quest.cc:201
std::string get_corrected_query_string() const
Get the spelling-corrected query string.
Definition: queryparser.cc:242
Xapian::doccount get_matches_lower_bound() const
Lower bound on the total number of matching documents.
Definition: omenquire.cc:246
static const char * opts
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
Build a Xapian::Query object from a user query string.
Definition: queryparser.h:778
const int n_op_tab
Definition: quest.cc:86
static const qp_op op_tab[]
Definition: quest.cc:77
Class representing a list of search results.
Definition: mset.h:44
This class implements the InL2 weighting scheme.
Definition: weight.h:838
STL namespace.
Pick the maximum weight of any subquery.
Definition: query.h:249
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:932
Indicates a query string can't be parsed.
Definition: error.h:887
int f
Definition: quest.cc:110
Produce a query which doesn't use positional information.
Definition: queryparser.h:912
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1257
static Xapian::Stem stemmer
Definition: stemtest.cc:41
Enable automatic use of synonyms for single terms and groups of terms.
Definition: queryparser.h:871
static bool operator<(const qp_flag &f1, const qp_flag &f2)
Definition: quest.cc:72
Enable partial matching.
Definition: queryparser.h:837
void set_stemmer(const Xapian::Stem &stemmer)
Set the stemmer.
Definition: queryparser.cc:84
#define no_argument
Definition: gnu_getopt.h:79
Xapian::doccount get_matches_upper_bound() const
Upper bound on the total number of matching documents.
Definition: omenquire.cc:262
This class implements the BB2 weighting scheme.
Definition: weight.h:1054
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1509
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Definition: quest.cc:76
Class implementing a "boolean" weighting scheme.
Definition: weight.h:422
Pick the best N subqueries and combine with OP_OR.
Definition: query.h:215
void set_stemming_strategy(stem_strategy strategy)
Set the stemming strategy.
Definition: queryparser.cc:90
Iterator over a Xapian::MSet.
Definition: mset.h:351
Match only documents where all subqueries match near and in order.
Definition: query.h:152
const char * s
Definition: quest.cc:76
Public interfaces for the Xapian library.
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:181
void add_boolean_prefix(const std::string &field, const std::string &prefix, const std::string *grouping=NULL)
Add a boolean term prefix allowing the user to restrict a search with a boolean filter specified in t...
Definition: queryparser.cc:197
char * optarg
Definition: getopt.cc:79
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:607
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:612
#define INDENT
Support quoted phrases.
Definition: queryparser.h:790
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:768
#define PROG_DESC
Definition: quest.cc:37
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1130
This class implements the PL2 weighting scheme.
Definition: weight.h:1190
This class implements the IneB2 weighting scheme.
Definition: weight.h:982
#define required_argument
Definition: gnu_getopt.h:80
Query parse_query(const std::string &query_string, unsigned flags=FLAG_DEFAULT, const std::string &default_prefix=std::string())
Parse a query.
Definition: queryparser.cc:161
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
This class implements the IfB2 weighting scheme.
Definition: weight.h:909
int * flag
Definition: gnu_getopt.h:75
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
Definition: quest.cc:52
Enable automatic use of synonyms for single terms.
Definition: queryparser.h:864
Xapian::doccount get_matches_estimated() const
Estimate of the total number of matching documents.
Definition: omenquire.cc:253
void set_database(const Database &db)
Specify the database being searched.
Definition: queryparser.cc:141
Accumulate unstem and stoplist results.
Definition: queryparser.h:901
std::string get_description() const
Return a string describing this object.
Definition: query.cc:232
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Support AND, OR, etc and bracketed subexpressions.
Definition: queryparser.h:788
Match only documents where all subqueries match near each other.
Definition: query.h:140
const char * s
Definition: quest.cc:110
This class implements the DPH weighting scheme.
Definition: weight.h:1353
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
Match documents which at least one subquery matches.
Definition: query.h:92
Allow queries such as 'NOT apples'.
Definition: queryparser.h:816
static const qp_flag flag_tab[]
Definition: quest.cc:53
#define PACKAGE_STRING
Definition: config.h:315
Definition: quest.cc:110
Enable generation of n-grams from CJK text.
Definition: queryparser.h:886
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
Class representing a query.
Definition: query.h:46
void add_prefix(const std::string &field, const std::string &prefix)
Add a free-text field term prefix.
Definition: queryparser.cc:183
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
#define PROG_NAME
Definition: quest.cc:36
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1406
const int n_flag_tab
Definition: quest.cc:70
Enable synonym operator '~'.
Definition: queryparser.h:858
A handle representing a document in a Xapian database.
Definition: document.h:61
const char * s
Definition: quest.cc:52
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:639
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:535
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:447