xapian-core  1.4.26
quest.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include <xapian.h>
25 
26 #include <cstdlib>
27 #include <cstring>
28 
29 #include <algorithm>
30 #include <iostream>
31 
32 #include "gnu_getopt.h"
33 
34 using namespace std;
35 
36 #define PROG_NAME "quest"
37 #define PROG_DESC "Xapian command line search tool"
38 
/// Stopwords handed to the Xapian::SimpleStopper used when parsing queries.
static const char * const sw[] = {
    "a", "about", "an", "and", "are", "as",
    "at", "be", "by", "en", "for", "from",
    "how", "i", "in", "is", "it", "of",
    "on", "or", "that", "the", "this", "to",
    "was", "what", "when", "where", "which", "who",
    "why", "will", "with"
};
51 
/// Common string to integer map entry for option decoding.
struct tab_entry {
    /// Name as typed on the command line; decode() matches against this.
    const char* s;

    /// Integer value the name stands for.
    unsigned f;

    /** Order an entry relative to a plain C string.
     *
     *  This is the comparison std::lower_bound() needs to binary search an
     *  array of entries for a string key.
     *
     *  @param s_  String to compare this entry's name with.
     *  @return true if this entry's name sorts strictly before @a s_.
     */
    bool operator<(const char* s_) const {
        return std::strcmp(s_, s) > 0;
    }
};
62 
/** Decode a string to an integer.
 *
 *  @param table  Array of entries to search.  It is binary searched, so its
 *                entries must be sorted by name in strcmp() order.
 *  @param s      The string to look up.
 *
 *  @return The @c f member of the entry whose name equals @a s, or -1 if
 *          there is no such entry.
 */
template<typename T, std::size_t N>
static int
decode(const T (&table)[N], const char* s)
{
    const T* const last = table + N;
    const T* const hit = std::lower_bound(table, last, s);
    // lower_bound() only finds the first entry not sorting before s, so we
    // still have to check that it's an exact match.
    if (hit != last && std::strcmp(s, hit->s) == 0)
        return hit->f;
    return -1;
}
77 
78 static const tab_entry flag_tab[] = {
79  { "accumulate", Xapian::QueryParser::FLAG_ACCUMULATE },
80  { "auto_multiword_synonyms", Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS },
81  { "auto_synonyms", Xapian::QueryParser::FLAG_AUTO_SYNONYMS },
82  { "boolean", Xapian::QueryParser::FLAG_BOOLEAN },
83  { "boolean_any_case", Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE },
84  { "cjk_ngram", Xapian::QueryParser::FLAG_CJK_NGRAM },
85  { "default", Xapian::QueryParser::FLAG_DEFAULT },
86  { "lovehate", Xapian::QueryParser::FLAG_LOVEHATE },
88  { "no_positions", Xapian::QueryParser::FLAG_NO_POSITIONS },
89  { "partial", Xapian::QueryParser::FLAG_PARTIAL },
91  { "pure_not", Xapian::QueryParser::FLAG_PURE_NOT },
92  { "spelling_correction", Xapian::QueryParser::FLAG_SPELLING_CORRECTION },
93  { "synonym", Xapian::QueryParser::FLAG_SYNONYM },
95 };
96 
97 static const tab_entry default_op_tab[] = {
98  { "and", Xapian::Query::OP_AND },
99  { "elite_set", Xapian::Query::OP_ELITE_SET },
100  { "max", Xapian::Query::OP_MAX },
101  { "near", Xapian::Query::OP_NEAR },
102  { "or", Xapian::Query::OP_OR },
103  { "phrase", Xapian::Query::OP_PHRASE },
104  { "synonym", Xapian::Query::OP_SYNONYM }
105 };
106 
/** Identifiers for the weighting schemes selectable with --weight.
 *
 *  wt_tab maps scheme names to these values and main() switches on them.
 *  They must all be non-negative because decode() returns -1 for an unknown
 *  name and main() uses -1 to mean "no scheme requested".
 */
enum {
    WEIGHT_BB2,
    WEIGHT_BM25,
    WEIGHT_BM25PLUS,
    WEIGHT_BOOL,
    WEIGHT_COORD,
    WEIGHT_DLH,
    WEIGHT_DPH,
    WEIGHT_IFB2,
    WEIGHT_INEB2,
    WEIGHT_INL2,
    WEIGHT_LM,
    WEIGHT_PL2,
    WEIGHT_PL2PLUS,
    WEIGHT_TFIDF,
    WEIGHT_TRAD
};
124 
125 static const tab_entry wt_tab[] = {
126  { "bb2", WEIGHT_BB2 },
127  { "bm25", WEIGHT_BM25 },
128  { "bm25+", WEIGHT_BM25PLUS },
129  { "bool", WEIGHT_BOOL },
130  { "coord", WEIGHT_COORD },
131  { "dlh", WEIGHT_DLH },
132  { "dph", WEIGHT_DPH },
133  { "ifb2", WEIGHT_IFB2 },
134  { "ineb2", WEIGHT_INEB2 },
135  { "inl2", WEIGHT_INL2 },
136  { "lm", WEIGHT_LM },
137  { "pl2", WEIGHT_PL2 },
138  { "pl2+", WEIGHT_PL2PLUS },
139  { "tfidf", WEIGHT_TFIDF },
140  { "trad", WEIGHT_TRAD }
141 };
142 
/** The spaces print_table() starts each line of its output with. */
#define INDENT \
" "

/** Print the names from a string to integer mapping table.
 *
 *  The names are written to cout as a comma-separated list, starting on a
 *  new line prefixed with INDENT and wrapping onto further such lines so
 *  that output stays short of 78 columns.
 *
 *  @param table  Container of entries with a C string member @c s.
 *
 *  @return A newline character.  Nothing has been written for it yet - it is
 *          returned so a caller can stream the result of this call to
 *          terminate the final line of the list.
 */
template<typename T>
static char
print_table(const T& table)
{
    // 256 is a sentinel meaning "nothing printed yet": it suppresses the
    // comma before the first name and, being past the wrap limit, forces the
    // list to start on a fresh line.
    size_t column = 256;
    for (const auto& entry : table) {
        const size_t width = std::strlen(entry.s);
        if (column < 256) {
            std::cout << ',';
        }
        if (column + width < 78) {
            std::cout << ' ';
        } else {
            std::cout << "\n" INDENT;
            column = sizeof(INDENT) - 2;
        }
        std::cout << entry.s;
        // Allow for the separator which precedes the next name.
        column += width + 2;
    }
    return '\n';
}
173 
174 static void show_usage() {
175  cout << "Usage: " PROG_NAME " [OPTIONS] 'QUERY'\n"
176 "NB: QUERY should be quoted to protect it from the shell.\n\n"
177 "Options:\n"
178 " -d, --db=DIRECTORY database to search (multiple databases may\n"
179 " be specified)\n"
180 " -m, --msize=MSIZE maximum number of matches to return\n"
181 " -c, --check-at-least=HOWMANY minimum number of matches to check\n"
182 " -s, --stemmer=LANG set the stemming language, the default is\n"
183 " 'english' (pass 'none' to disable stemming)\n"
184 " -p, --prefix=PFX:TERMPFX add a prefix\n"
185 " -b, --boolean-prefix=PFX:TERMPFX add a boolean prefix\n"
186 " -f, --flags=FLAG1[,FLAG2]... specify QueryParser flags (default:\n"
187 " default). Valid flags:"
188 << print_table(flag_tab) <<
189 " -o, --default-op=OP specify QueryParser default operator\n"
190 " (default: or). Valid operators:"
191 << print_table(default_op_tab) <<
192 " -w, --weight=SCHEME specify weighting scheme to use\n"
193 " (default: bm25). Valid schemes:"
194 << print_table(wt_tab) <<
195 " -F, --freqs show query term frequencies\n"
196 " -h, --help display this help and exit\n"
197 " -v, --version output version information and exit\n";
198 }
199 
200 int
201 main(int argc, char **argv)
202 try {
203  const char * opts = "d:m:c:s:p:b:f:o:w:Fhv";
204  static const struct option long_opts[] = {
205  { "db", required_argument, 0, 'd' },
206  { "msize", required_argument, 0, 'm' },
207  { "check-at-least", required_argument, 0, 'c' },
208  { "stemmer", required_argument, 0, 's' },
209  { "prefix", required_argument, 0, 'p' },
210  { "boolean-prefix", required_argument, 0, 'b' },
211  { "flags", required_argument, 0, 'f' },
212  { "default-op", required_argument, 0, 'o' },
213  { "weight", required_argument, 0, 'w' },
214  { "freqs", no_argument, 0, 'F' },
215  { "help", no_argument, 0, 'h' },
216  { "version", no_argument, 0, 'v' },
217  { NULL, 0, 0, 0}
218  };
219 
220  Xapian::SimpleStopper mystopper(sw, sw + sizeof(sw) / sizeof(sw[0]));
221  Xapian::Stem stemmer("english");
222  Xapian::doccount msize = 10;
223  Xapian::doccount check_at_least = 0;
224 
225  bool have_database = false;
226 
227  Xapian::Database db;
228  Xapian::QueryParser parser;
229  unsigned flags = 0;
230  bool flags_set = false;
231  bool show_termfreqs = false;
232  int weight = -1;
233 
234  int c;
235  while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
236  switch (c) {
237  case 'm': {
238  char * p;
239  unsigned long v = strtoul(optarg, &p, 10);
240  msize = static_cast<Xapian::doccount>(v);
241  if (*p || v != msize) {
242  cerr << PROG_NAME": Bad value '" << optarg
243  << "' passed for msize\n";
244  exit(1);
245  }
246  break;
247  }
248  case 'c': {
249  char * p;
250  unsigned long v = strtoul(optarg, &p, 10);
251  check_at_least = static_cast<Xapian::doccount>(v);
252  if (*p || v != check_at_least) {
253  cerr << PROG_NAME": Bad value '" << optarg
254  << "' passed for check_at_least\n";
255  exit(1);
256  }
257  break;
258  }
259  case 'd':
261  have_database = true;
262  break;
263  case 's':
264  try {
265  stemmer = Xapian::Stem(optarg);
266  } catch (const Xapian::InvalidArgumentError &) {
267  cerr << "Unknown stemming language '" << optarg << "'.\n"
268  "Available language names are: "
270  exit(1);
271  }
272  break;
273  case 'b': case 'p': {
274  const char * colon = strchr(optarg, ':');
275  if (colon == NULL) {
276  cerr << argv[0] << ": need ':' when setting prefix\n";
277  exit(1);
278  }
279  string prefix(optarg, colon - optarg);
280  string termprefix(colon + 1);
281  if (c == 'b') {
282  parser.add_boolean_prefix(prefix, termprefix);
283  } else {
284  parser.add_prefix(prefix, termprefix);
285  }
286  break;
287  }
288  case 'f':
289  flags_set = true;
290  do {
291  char * comma = strchr(optarg, ',');
292  if (comma)
293  *comma++ = '\0';
294  int flag = decode(flag_tab, optarg);
295  if (flag < 0) {
296  cerr << "Unknown flag '" << optarg << "'\n";
297  exit(1);
298  }
299  flags |= unsigned(flag);
300  optarg = comma;
301  } while (optarg);
302  break;
303  case 'o': {
304  int op = decode(default_op_tab, optarg);
305  if (op < 0) {
306  cerr << "Unknown op '" << optarg << "'\n";
307  exit(1);
308  }
309  parser.set_default_op(static_cast<Xapian::Query::op>(op));
310  break;
311  }
312  case 'w': {
313  weight = decode(wt_tab, optarg);
314  if (weight < 0) {
315  cerr << "Unknown weighting scheme '" << optarg << "'\n";
316  exit(1);
317  }
318  break;
319  }
320  case 'F':
321  show_termfreqs = true;
322  break;
323  case 'v':
324  cout << PROG_NAME " - " PACKAGE_STRING "\n";
325  exit(0);
326  case 'h':
327  cout << PROG_NAME " - " PROG_DESC "\n\n";
328  show_usage();
329  exit(0);
330  case ':': // missing parameter
331  case '?': // unknown option
332  show_usage();
333  exit(1);
334  }
335  }
336 
337  if (argc - optind != 1) {
338  show_usage();
339  exit(1);
340  }
341 
342  parser.set_database(db);
343  parser.set_stemmer(stemmer);
345  parser.set_stopper(&mystopper);
346 
347  if (!flags_set) {
349  }
350  Xapian::Query query = parser.parse_query(argv[optind], flags);
351  const string & correction = parser.get_corrected_query_string();
352  if (!correction.empty())
353  cout << "Did you mean: " << correction << "\n\n";
354 
355  cout << "Parsed Query: " << query.get_description() << '\n';
356 
357  if (!have_database) {
358  cout << "No database specified so not running the query.\n";
359  exit(0);
360  }
361 
362  Xapian::Enquire enquire(db);
363  enquire.set_query(query);
364 
365  switch (weight) {
366  case WEIGHT_BB2:
368  break;
369  case WEIGHT_BOOL:
371  break;
372  case WEIGHT_COORD:
374  break;
375  case WEIGHT_BM25:
377  break;
378  case WEIGHT_BM25PLUS:
380  break;
381  case WEIGHT_DLH:
383  break;
384  case WEIGHT_DPH:
386  break;
387  case WEIGHT_IFB2:
389  break;
390  case WEIGHT_INEB2:
392  break;
393  case WEIGHT_INL2:
395  break;
396  case WEIGHT_LM:
398  break;
399  case WEIGHT_PL2:
401  break;
402  case WEIGHT_PL2PLUS:
404  break;
405  case WEIGHT_TFIDF:
407  break;
408  case WEIGHT_TRAD:
410  break;
411  }
412 
413  Xapian::MSet mset = enquire.get_mset(0, msize, check_at_least);
414 
415  if (show_termfreqs) {
416  cout << "Query term frequencies:\n";
417  for (auto t = query.get_terms_begin();
418  t != query.get_terms_end();
419  ++t) {
420  const string& term = *t;
421  cout << " " << mset.get_termfreq(term) << '\t' << term << '\n';
422  }
423  }
424  auto lower_bound = mset.get_matches_lower_bound();
425  auto estimate = mset.get_matches_estimated();
426  auto upper_bound = mset.get_matches_upper_bound();
427  if (lower_bound == upper_bound) {
428  cout << "Exactly " << estimate << " matches\n";
429  } else {
430  cout << "Between " << lower_bound << " and " << upper_bound
431  << " matches, best estimate is " << estimate << '\n';
432  }
433 
434  cout << "MSet:\n";
435  for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
436  Xapian::Document doc = i.get_document();
437  string data = doc.get_data();
438  cout << *i << ": [" << i.get_weight() << "]\n" << data << "\n";
439  }
440  cout << flush;
441 } catch (const Xapian::QueryParserError & e) {
442  cout << "Couldn't parse query: " << e.get_msg() << '\n';
443  exit(1);
444 } catch (const Xapian::Error & err) {
445  cout << err.get_description() << '\n';
446  exit(1);
447 }
Support AND, OR, etc even if they aren't in ALLCAPS.
Definition: queryparser.h:794
static const tab_entry wt_tab[]
Definition: quest.cc:125
static const char *const sw[]
Definition: quest.cc:40
Wrappers to allow GNU getopt to be used cleanly from C++ code.
static void show_usage()
Definition: quest.cc:174
Simple implementation of Stopper class - this will suit most users.
Definition: queryparser.h:100
int optind
Definition: getopt.cc:94
void set_default_op(Query::op default_op)
Set the default operator.
Definition: queryparser.cc:103
This class is used to access a database, or a group of databases.
Definition: database.h:68
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:97
int main(int argc, char **argv)
Definition: quest.cc:201
const TermIterator get_terms_begin() const
Begin iterator for terms in the query object.
Definition: query.cc:135
Class representing a stemming algorithm.
Definition: stem.h:62
double weight
The weight of a document or term.
Definition: types.h:122
void set_stopper(const Stopper *stop=NULL)
Set the stopper.
Definition: queryparser.cc:97
std::string get_corrected_query_string() const
Get the spelling-corrected query string.
Definition: queryparser.cc:261
bool operator<(const char *s_) const
Definition: quest.cc:58
Xapian::doccount get_matches_lower_bound() const
Lower bound on the total number of matching documents.
Definition: omenquire.cc:246
static const char * opts
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
Build a Xapian::Query object from a user query string.
Definition: queryparser.h:778
static int decode(const T(&table)[N], const char *s)
Decode a string to an integer.
Definition: quest.cc:70
Class representing a list of search results.
Definition: mset.h:44
This class implements the InL2 weighting scheme.
Definition: weight.h:844
unsigned f
Definition: quest.cc:56
STL namespace.
const char * s
Definition: quest.cc:54
Pick the maximum weight of any subquery.
Definition: query.h:249
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:938
Indicates a query string can't be parsed.
Definition: error.h:887
Produce a query which doesn't use positional information.
Definition: queryparser.h:930
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1263
static Xapian::Stem stemmer
Definition: stemtest.cc:41
static const tab_entry default_op_tab[]
Definition: quest.cc:97
Enable automatic use of synonyms for single terms and groups of terms.
Definition: queryparser.h:871
Enable partial matching.
Definition: queryparser.h:837
void set_stemmer(const Xapian::Stem &stemmer)
Set the stemmer.
Definition: queryparser.cc:85
static const tab_entry flag_tab[]
Definition: quest.cc:78
#define no_argument
Definition: gnu_getopt.h:79
Xapian::doccount get_matches_upper_bound() const
Upper bound on the total number of matching documents.
Definition: omenquire.cc:262
This class implements the BB2 weighting scheme.
Definition: weight.h:1060
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1516
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Class implementing a "boolean" weighting scheme.
Definition: weight.h:433
Pick the best N subqueries and combine with OP_OR.
Definition: query.h:215
void set_stemming_strategy(stem_strategy strategy)
Set the stemming strategy.
Definition: queryparser.cc:91
Iterator over a Xapian::MSet.
Definition: mset.h:368
Match only documents where all subqueries match near and in order.
Definition: query.h:152
Common string to integer map entry for option decoding.
Definition: quest.cc:53
Public interfaces for the Xapian library.
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:181
void add_boolean_prefix(const std::string &field, const std::string &prefix, const std::string *grouping=NULL)
Add a boolean term prefix allowing the user to restrict a search with a boolean filter specified in t...
Definition: queryparser.cc:206
char * optarg
Definition: getopt.cc:79
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:624
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:629
#define INDENT
The number of spaces to indent by in print_table.
Definition: quest.cc:147
Support quoted phrases.
Definition: queryparser.h:790
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:774
#define PROG_DESC
Definition: quest.cc:37
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1136
This class implements the PL2 weighting scheme.
Definition: weight.h:1196
This class implements the IneB2 weighting scheme.
Definition: weight.h:988
#define required_argument
Definition: gnu_getopt.h:80
Query parse_query(const std::string &query_string, unsigned flags=FLAG_DEFAULT, const std::string &default_prefix=std::string())
Parse a query.
Definition: queryparser.cc:162
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
This class implements the IfB2 weighting scheme.
Definition: weight.h:915
int * flag
Definition: gnu_getopt.h:75
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
Enable automatic use of synonyms for single terms.
Definition: queryparser.h:864
Xapian::doccount get_matches_estimated() const
Estimate of the total number of matching documents.
Definition: omenquire.cc:253
void set_database(const Database &db)
Specify the database being searched.
Definition: queryparser.cc:142
Accumulate unstem and stoplist results.
Definition: queryparser.h:919
std::string get_description() const
Return a string describing this object.
Definition: query.cc:232
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Support AND, OR, etc and bracketed subexpressions.
Definition: queryparser.h:788
Match only documents where all subqueries match near each other.
Definition: query.h:140
This class implements the DPH weighting scheme.
Definition: weight.h:1359
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
Match documents which at least one subquery matches.
Definition: query.h:92
Allow queries such as 'NOT apples'.
Definition: queryparser.h:816
#define PACKAGE_STRING
Definition: config.h:337
Xapian::doccount get_termfreq(const std::string &term) const
Get the termfreq of a term.
Definition: omenquire.cc:206
Generate n-grams for scripts without explicit word breaks.
Definition: queryparser.h:904
static char print_table(const T &table)
Print string from a string to integer mapping table.
Definition: quest.cc:156
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
Class representing a query.
Definition: query.h:46
void add_prefix(const std::string &field, const std::string &prefix)
Add a free-text field term prefix.
Definition: queryparser.cc:184
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
#define PROG_NAME
Definition: quest.cc:36
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1413
const TermIterator get_terms_end() const
End iterator for terms in the query object.
Definition: query.h:502
Generate n-grams for scripts without explicit word breaks.
Definition: queryparser.h:895
Enable synonym operator '~'.
Definition: queryparser.h:858
A handle representing a document in a Xapian database.
Definition: document.h:61
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:650
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:546
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:458