xapian-core  1.4.18
quest.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2012,2013,2014,2016,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include <xapian.h>
25 
26 #include <cstdlib>
27 #include <cstring>
28 
29 #include <algorithm>
30 #include <iostream>
31 
32 #include "gnu_getopt.h"
33 
34 using namespace std;
35 
36 #define PROG_NAME "quest"
37 #define PROG_DESC "Xapian command line search tool"
38 
// Stopwords: very common words, listed in alphabetical order.
// main() passes this array to the Xapian::SimpleStopper it gives the parser.
static const char * const sw[] = {
    "a", "about", "an", "and", "are", "as",
    "at", "be", "by", "en", "for", "from",
    "how", "i", "in", "is", "it", "of",
    "on", "or", "that", "the", "this", "to",
    "was", "what", "when", "where", "which", "who",
    "why", "will", "with"
};
51 
52 struct qp_flag { const char * s; unsigned f; };
53 static const qp_flag flag_tab[] = {
54  { "auto_multiword_synonyms", Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS },
55  { "auto_synonyms", Xapian::QueryParser::FLAG_AUTO_SYNONYMS },
56  { "boolean", Xapian::QueryParser::FLAG_BOOLEAN },
57  { "boolean_any_case", Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE },
58  { "cjk_ngram", Xapian::QueryParser::FLAG_CJK_NGRAM },
59  { "default", Xapian::QueryParser::FLAG_DEFAULT },
60  { "lovehate", Xapian::QueryParser::FLAG_LOVEHATE },
61  { "partial", Xapian::QueryParser::FLAG_PARTIAL },
63  { "pure_not", Xapian::QueryParser::FLAG_PURE_NOT },
64  { "spelling_correction", Xapian::QueryParser::FLAG_SPELLING_CORRECTION },
65  { "synonym", Xapian::QueryParser::FLAG_SYNONYM },
67 };
68 const int n_flag_tab = sizeof(flag_tab) / sizeof(flag_tab[0]);
69 
70 static inline bool operator<(const qp_flag& f1, const qp_flag& f2) {
71  return strcmp(f1.s, f2.s) < 0;
72 }
73 
74 struct qp_op { const char * s; unsigned f; };
75 static const qp_op op_tab[] = {
76  { "and", Xapian::Query::OP_AND },
77  { "elite_set", Xapian::Query::OP_ELITE_SET },
78  { "max", Xapian::Query::OP_MAX },
79  { "near", Xapian::Query::OP_NEAR },
80  { "or", Xapian::Query::OP_OR },
81  { "phrase", Xapian::Query::OP_PHRASE },
82  { "synonym", Xapian::Query::OP_SYNONYM }
83 };
84 const int n_op_tab = sizeof(op_tab) / sizeof(op_tab[0]);
85 
86 static inline bool operator<(const qp_op& f1, const qp_op& f2) {
87  return strcmp(f1.s, f2.s) < 0;
88 }
89 
/** Identifiers for the weighting schemes which can be chosen with --weight.
 *
 *  One enumerator per row of wt_tab and per case of the switch in main().
 *  Values start at 0 and are never negative, so decode_wt() can return -1
 *  to signal an unknown scheme name.
 */
enum {
    WEIGHT_BB2,
    WEIGHT_BM25,
    WEIGHT_BM25PLUS,
    WEIGHT_BOOL,
    WEIGHT_COORD,
    WEIGHT_DLH,
    WEIGHT_DPH,
    WEIGHT_IFB2,
    WEIGHT_INEB2,
    WEIGHT_INL2,
    WEIGHT_LM,
    WEIGHT_PL2,
    WEIGHT_PL2PLUS,
    WEIGHT_TFIDF,
    WEIGHT_TRAD
};
107 
108 struct wt { const char * s; int f; };
109 static const wt wt_tab[] = {
110  { "bb2", WEIGHT_BB2 },
111  { "bm25", WEIGHT_BM25 },
112  { "bm25+", WEIGHT_BM25PLUS },
113  { "bool", WEIGHT_BOOL },
114  { "coord", WEIGHT_COORD },
115  { "dlh", WEIGHT_DLH },
116  { "dph", WEIGHT_DPH },
117  { "ifb2", WEIGHT_IFB2 },
118  { "ineb2", WEIGHT_INEB2 },
119  { "inl2", WEIGHT_INL2 },
120  { "lm", WEIGHT_LM },
121  { "pl2", WEIGHT_PL2 },
122  { "pl2+", WEIGHT_PL2PLUS },
123  { "tfidf", WEIGHT_TFIDF },
124  { "trad", WEIGHT_TRAD }
125 };
126 const int n_wt_tab = sizeof(wt_tab) / sizeof(wt_tab[0]);
127 
128 static inline bool operator<(const wt& f1, const wt& f2) {
129  return strcmp(f1.s, f2.s) < 0;
130 }
131 
132 static void show_usage() {
133  cout << "Usage: " PROG_NAME " [OPTIONS] 'QUERY'\n"
134 "NB: QUERY should be quoted to protect it from the shell.\n\n"
135 "Options:\n"
136 " -d, --db=DIRECTORY database to search (multiple databases may\n"
137 " be specified)\n"
138 " -m, --msize=MSIZE maximum number of matches to return\n"
139 " -c, --check-at-least=HOWMANY minimum number of matches to check\n"
140 " -s, --stemmer=LANG set the stemming language, the default is\n"
141 " 'english' (pass 'none' to disable stemming)\n"
142 " -p, --prefix=PFX:TERMPFX add a prefix\n"
143 " -b, --boolean-prefix=PFX:TERMPFX add a boolean prefix\n"
144 " -f, --flags=FLAG1[,FLAG2]... specify QueryParser flags. Valid flags:";
145 #define INDENT \
146 " "
147  int pos = 256;
148  for (const qp_flag * i = flag_tab; i - flag_tab < n_flag_tab; ++i) {
149  size_t len = strlen(i->s);
150  if (pos < 256) cout << ',';
151  if (pos + len >= 78) {
152  cout << "\n" INDENT;
153  pos = sizeof(INDENT) - 2;
154  } else {
155  cout << ' ';
156  }
157  cout << i->s;
158  pos += len + 2;
159  }
160  cout << "\n"
161 " -o, --default-op=OP specify QueryParser default operator\n"
162 " (default: or). Valid operators:";
163  pos = 256;
164  for (const qp_op * i = op_tab; i - op_tab < n_op_tab; ++i) {
165  size_t len = strlen(i->s);
166  if (pos < 256) cout << ',';
167  if (pos + len >= 78) {
168  cout << "\n" INDENT;
169  pos = sizeof(INDENT) - 2;
170  } else {
171  cout << ' ';
172  }
173  cout << i->s;
174  pos += len + 2;
175  }
176  cout << "\n"
177 " -w, --weight=SCHEME specify weighting scheme to use\n"
178 " (default: bm25). Valid schemes:";
179  pos = 256;
180  for (const wt * i = wt_tab; i - wt_tab < n_wt_tab; ++i) {
181  size_t len = strlen(i->s);
182  if (pos < 256) cout << ',';
183  if (pos + len >= 78) {
184  cout << "\n" INDENT;
185  pos = sizeof(INDENT) - 2;
186  } else {
187  cout << ' ';
188  }
189  cout << i->s;
190  pos += len + 2;
191  }
192  cout << "\n"
193 " -h, --help display this help and exit\n"
194 " -v, --version output version information and exit\n";
195 }
196 
197 static unsigned
198 decode_qp_flag(const char * s)
199 {
200  qp_flag f;
201  f.s = s;
202  const qp_flag * p = lower_bound(flag_tab, flag_tab + n_flag_tab, f);
203  if (p == flag_tab + n_flag_tab || f < *p)
204  return 0;
205  return p->f;
206 }
207 
208 static int
209 decode_qp_op(const char * s)
210 {
211  qp_op f;
212  f.s = s;
213  const qp_op * p = lower_bound(op_tab, op_tab + n_op_tab, f);
214  if (p == op_tab + n_op_tab || f < *p)
215  return -1;
216  return p->f;
217 }
218 
219 static int
220 decode_wt(const char * s)
221 {
222  wt f;
223  f.s = s;
224  const wt * p = lower_bound(wt_tab, wt_tab + n_wt_tab, f);
225  if (p == wt_tab + n_wt_tab || f < *p)
226  return -1;
227  return p->f;
228 }
229 
230 int
231 main(int argc, char **argv)
232 try {
233  const char * opts = "d:m:c:s:p:b:f:o:w:hv";
234  static const struct option long_opts[] = {
235  { "db", required_argument, 0, 'd' },
236  { "msize", required_argument, 0, 'm' },
237  { "check-at-least", required_argument, 0, 'c' },
238  { "stemmer", required_argument, 0, 's' },
239  { "prefix", required_argument, 0, 'p' },
240  { "boolean-prefix", required_argument, 0, 'b' },
241  { "flags", required_argument, 0, 'f' },
242  { "default-op", required_argument, 0, 'o' },
243  { "weight", required_argument, 0, 'w' },
244  { "help", no_argument, 0, 'h' },
245  { "version", no_argument, 0, 'v' },
246  { NULL, 0, 0, 0}
247  };
248 
249  Xapian::SimpleStopper mystopper(sw, sw + sizeof(sw) / sizeof(sw[0]));
250  Xapian::Stem stemmer("english");
251  Xapian::doccount msize = 10;
252  Xapian::doccount check_at_least = 0;
253 
254  bool have_database = false;
255 
256  Xapian::Database db;
257  Xapian::QueryParser parser;
258  unsigned flags = parser.FLAG_DEFAULT|parser.FLAG_SPELLING_CORRECTION;
259  int weight = -1;
260 
261  int c;
262  while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
263  switch (c) {
264  case 'm': {
265  char * p;
266  unsigned long v = strtoul(optarg, &p, 10);
267  msize = static_cast<Xapian::doccount>(v);
268  if (*p || v != msize) {
269  cerr << PROG_NAME": Bad value '" << optarg
270  << "' passed for msize" << endl;
271  exit(1);
272  }
273  break;
274  }
275  case 'c': {
276  char * p;
277  unsigned long v = strtoul(optarg, &p, 10);
278  check_at_least = static_cast<Xapian::doccount>(v);
279  if (*p || v != check_at_least) {
280  cerr << PROG_NAME": Bad value '" << optarg
281  << "' passed for check_at_least" << endl;
282  exit(1);
283  }
284  break;
285  }
286  case 'd':
288  have_database = true;
289  break;
290  case 's':
291  try {
292  stemmer = Xapian::Stem(optarg);
293  } catch (const Xapian::InvalidArgumentError &) {
294  cerr << "Unknown stemming language '" << optarg << "'.\n"
295  "Available language names are: "
297  exit(1);
298  }
299  break;
300  case 'b': case 'p': {
301  const char * colon = strchr(optarg, ':');
302  if (colon == NULL) {
303  cerr << argv[0] << ": need ':' when setting prefix" << endl;
304  exit(1);
305  }
306  string prefix(optarg, colon - optarg);
307  string termprefix(colon + 1);
308  if (c == 'b') {
309  parser.add_boolean_prefix(prefix, termprefix);
310  } else {
311  parser.add_prefix(prefix, termprefix);
312  }
313  break;
314  }
315  case 'f':
316  flags = 0;
317  do {
318  char * comma = strchr(optarg, ',');
319  if (comma)
320  *comma++ = '\0';
321  unsigned flag = decode_qp_flag(optarg);
322  if (flag == 0) {
323  cerr << "Unknown flag '" << optarg << "'" << endl;
324  exit(1);
325  }
326  flags |= flag;
327  optarg = comma;
328  } while (optarg);
329  break;
330  case 'o': {
331  int op = decode_qp_op(optarg);
332  if (op < 0) {
333  cerr << "Unknown op '" << optarg << "'" << endl;
334  exit(1);
335  }
336  parser.set_default_op(static_cast<Xapian::Query::op>(op));
337  break;
338  }
339  case 'w': {
340  weight = decode_wt(optarg);
341  if (weight < 0) {
342  cerr << "Unknown weighting scheme '" << optarg << "'" << endl;
343  exit(1);
344  }
345  break;
346  }
347  case 'v':
348  cout << PROG_NAME " - " PACKAGE_STRING << endl;
349  exit(0);
350  case 'h':
351  cout << PROG_NAME " - " PROG_DESC "\n\n";
352  show_usage();
353  exit(0);
354  case ':': // missing parameter
355  case '?': // unknown option
356  show_usage();
357  exit(1);
358  }
359  }
360 
361  if (argc - optind != 1) {
362  show_usage();
363  exit(1);
364  }
365 
366  parser.set_database(db);
367  parser.set_stemmer(stemmer);
369  parser.set_stopper(&mystopper);
370 
371  Xapian::Query query = parser.parse_query(argv[optind], flags);
372  const string & correction = parser.get_corrected_query_string();
373  if (!correction.empty())
374  cout << "Did you mean: " << correction << "\n\n";
375 
376  cout << "Parsed Query: " << query.get_description() << endl;
377 
378  if (!have_database) {
379  cout << "No database specified so not running the query." << endl;
380  exit(0);
381  }
382 
383  Xapian::Enquire enquire(db);
384  enquire.set_query(query);
385 
386  switch (weight) {
387  case WEIGHT_BB2:
389  break;
390  case WEIGHT_BOOL:
392  break;
393  case WEIGHT_COORD:
395  break;
396  case WEIGHT_BM25:
398  break;
399  case WEIGHT_BM25PLUS:
401  break;
402  case WEIGHT_DLH:
404  break;
405  case WEIGHT_DPH:
407  break;
408  case WEIGHT_IFB2:
410  break;
411  case WEIGHT_INEB2:
413  break;
414  case WEIGHT_INL2:
416  break;
417  case WEIGHT_LM:
419  break;
420  case WEIGHT_PL2:
422  break;
423  case WEIGHT_PL2PLUS:
425  break;
426  case WEIGHT_TFIDF:
428  break;
429  case WEIGHT_TRAD:
431  break;
432  }
433 
434  Xapian::MSet mset = enquire.get_mset(0, msize, check_at_least);
435 
436  auto lower_bound = mset.get_matches_lower_bound();
437  auto estimate = mset.get_matches_estimated();
438  auto upper_bound = mset.get_matches_upper_bound();
439  if (lower_bound == upper_bound) {
440  cout << "Exactly " << estimate << " matches" << endl;
441  } else {
442  cout << "Between " << lower_bound << " and " << upper_bound
443  << " matches, best estimate is " << estimate << endl;
444  }
445 
446  cout << "MSet:" << endl;
447  for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
448  Xapian::Document doc = i.get_document();
449  string data = doc.get_data();
450  cout << *i << ": [" << i.get_weight() << "]\n" << data << "\n";
451  }
452  cout << flush;
453 } catch (const Xapian::QueryParserError & e) {
454  cout << "Couldn't parse query: " << e.get_msg() << endl;
455  exit(1);
456 } catch (const Xapian::Error & err) {
457  cout << err.get_description() << endl;
458  exit(1);
459 }
Support AND, OR, etc even if they aren't in ALLCAPS.
Definition: queryparser.h:786
const int n_wt_tab
Definition: quest.cc:126
static int decode_wt(const char *s)
Definition: quest.cc:220
static const char *const sw[]
Definition: quest.cc:40
Wrappers to allow GNU getopt to be used cleanly from C++ code.
static void show_usage()
Definition: quest.cc:132
unsigned f
Definition: quest.cc:74
Simple implementation of Stopper class - this will suit most users.
Definition: queryparser.h:96
int optind
Definition: getopt.cc:94
void set_default_op(Query::op default_op)
Set the default operator.
Definition: queryparser.cc:102
This class is used to access a database, or a group of databases.
Definition: database.h:68
static const wt wt_tab[]
Definition: quest.cc:109
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:97
static int decode_qp_op(const char *s)
Definition: quest.cc:209
int main(int argc, char **argv)
Definition: quest.cc:231
Class representing a stemming algorithm.
Definition: stem.h:62
double weight
The weight of a document or term.
Definition: types.h:122
void set_stopper(const Stopper *stop=NULL)
Set the stopper.
Definition: queryparser.cc:96
unsigned f
Definition: quest.cc:52
static unsigned decode_qp_flag(const char *s)
Definition: quest.cc:198
std::string get_corrected_query_string() const
Get the spelling-corrected query string.
Definition: queryparser.cc:242
Xapian::doccount get_matches_lower_bound() const
Lower bound on the total number of matching documents.
Definition: omenquire.cc:246
static const char * opts
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
Build a Xapian::Query object from a user query string.
Definition: queryparser.h:770
const int n_op_tab
Definition: quest.cc:84
static const qp_op op_tab[]
Definition: quest.cc:75
Class representing a list of search results.
Definition: mset.h:44
This class implements the InL2 weighting scheme.
Definition: weight.h:816
STL namespace.
Pick the maximum weight of any subquery.
Definition: query.h:240
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:932
Indicates a query string can't be parsed.
Definition: error.h:887
int f
Definition: quest.cc:108
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1235
static Xapian::Stem stemmer
Definition: stemtest.cc:41
Enable automatic use of synonyms for single terms and groups of terms.
Definition: queryparser.h:863
static bool operator<(const qp_flag &f1, const qp_flag &f2)
Definition: quest.cc:70
Enable partial matching.
Definition: queryparser.h:829
void set_stemmer(const Xapian::Stem &stemmer)
Set the stemmer.
Definition: queryparser.cc:84
#define no_argument
Definition: gnu_getopt.h:79
Xapian::doccount get_matches_upper_bound() const
Upper bound on the total number of matching documents.
Definition: omenquire.cc:262
This class implements the BB2 weighting scheme.
Definition: weight.h:1032
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1487
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Definition: quest.cc:74
Class implementing a "boolean" weighting scheme.
Definition: weight.h:400
Pick the best N subqueries and combine with OP_OR.
Definition: query.h:206
void set_stemming_strategy(stem_strategy strategy)
Set the stemming strategy.
Definition: queryparser.cc:90
Iterator over a Xapian::MSet.
Definition: mset.h:351
Match only documents where all subqueries match near and in order.
Definition: query.h:152
const char * s
Definition: quest.cc:74
Public interfaces for the Xapian library.
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:181
void add_boolean_prefix(const std::string &field, const std::string &prefix, const std::string *grouping=NULL)
Add a boolean term prefix allowing the user to restrict a search with a boolean filter specified in t...
Definition: queryparser.cc:197
char * optarg
Definition: getopt.cc:79
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:607
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:612
#define INDENT
Support quoted phrases.
Definition: queryparser.h:782
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:746
#define PROG_DESC
Definition: quest.cc:37
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1108
This class implements the PL2 weighting scheme.
Definition: weight.h:1168
This class implements the IneB2 weighting scheme.
Definition: weight.h:960
#define required_argument
Definition: gnu_getopt.h:80
Query parse_query(const std::string &query_string, unsigned flags=FLAG_DEFAULT, const std::string &default_prefix=std::string())
Parse a query.
Definition: queryparser.cc:161
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
Match like OP_OR but weighting as if a single term.
Definition: query.h:230
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
This class implements the IfB2 weighting scheme.
Definition: weight.h:887
int * flag
Definition: gnu_getopt.h:75
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
Definition: quest.cc:52
Enable automatic use of synonyms for single terms.
Definition: queryparser.h:856
Xapian::doccount get_matches_estimated() const
Estimate of the total number of matching documents.
Definition: omenquire.cc:253
void set_database(const Database &db)
Specify the database being searched.
Definition: queryparser.cc:141
std::string get_description() const
Return a string describing this object.
Definition: query.cc:232
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Support AND, OR, etc and bracketed subexpressions.
Definition: queryparser.h:780
Match only documents where all subqueries match near each other.
Definition: query.h:140
const char * s
Definition: quest.cc:108
This class implements the DPH weighting scheme.
Definition: weight.h:1331
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
Match documents which at least one subquery matches.
Definition: query.h:92
Allow queries such as 'NOT apples'.
Definition: queryparser.h:808
static const qp_flag flag_tab[]
Definition: quest.cc:53
#define PACKAGE_STRING
Definition: config.h:307
Definition: quest.cc:108
Enable generation of n-grams from CJK text.
Definition: queryparser.h:878
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
Class representing a query.
Definition: query.h:46
void add_prefix(const std::string &field, const std::string &prefix)
Add a free-text field term prefix.
Definition: queryparser.cc:183
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
#define PROG_NAME
Definition: quest.cc:36
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1384
const int n_flag_tab
Definition: quest.cc:68
Enable synonym operator '~'.
Definition: queryparser.h:850
A handle representing a document in a Xapian database.
Definition: document.h:61
const char * s
Definition: quest.cc:52
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:617
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:513
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:425