xapian-core  1.4.21
quest.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include <xapian.h>
25 
26 #include <cstdlib>
27 #include <cstring>
28 
29 #include <algorithm>
30 #include <iostream>
31 
32 #include "gnu_getopt.h"
33 
34 using namespace std;
35 
36 #define PROG_NAME "quest"
37 #define PROG_DESC "Xapian command line search tool"
38 
39 // Stopwords:
// Common English words to treat as stopwords.  main() wraps this array in a
// Xapian::SimpleStopper which is then installed on the query parser.
static const char * const sw[] = {
    "a", "about", "an", "and", "are", "as", "at", "be", "by", "en", "for",
    "from", "how", "i", "in", "is", "it", "of", "on", "or", "that", "the",
    "this", "to", "was", "what", "when", "where", "which", "who", "why",
    "will", "with"
};
51 
/// Common string to integer map entry for option decoding.
struct tab_entry {
    /// Name to match against.
    const char* s;

    /// Integer value the name maps to.
    unsigned f;

    /// True if this entry's name sorts strictly before @a key (strcmp order).
    bool operator<(const char* key) const {
        return std::strcmp(key, s) > 0;
    }
};
62 
/** Decode a string to an integer.
 *
 *  @param table  Array of entries, each with a name @c s and a value @c f.
 *                It is binary searched, so it must be sorted by name.
 *  @param s      Name to look up.
 *
 *  @return  The value of the entry whose name equals @a s, or -1 if there is
 *           no such entry.
 */
template<typename T, std::size_t N>
static int
decode(const T (&table)[N], const char* s)
{
    const T* const stop = std::end(table);
    const T* const hit = std::lower_bound(std::begin(table), stop, s);
    // lower_bound() only finds the first entry not ordered before s, so an
    // exact comparison is still needed to confirm a match.
    const bool found = (hit != stop) && std::strcmp(hit->s, s) == 0;
    return found ? static_cast<int>(hit->f) : -1;
}
77 
// Names accepted by the -f/--flags option, mapped to Xapian::QueryParser
// flag values.  decode() binary searches this table, so the entries must
// stay sorted by name in strcmp() order.
// NOTE(review): the embedded listing numbers jump over 89 and 93 here, so
// two entries appear to be missing from this extract - confirm against the
// upstream file before relying on this list.
78 static const tab_entry flag_tab[] = {
79  { "accumulate", Xapian::QueryParser::FLAG_ACCUMULATE },
80  { "auto_multiword_synonyms", Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS },
81  { "auto_synonyms", Xapian::QueryParser::FLAG_AUTO_SYNONYMS },
82  { "boolean", Xapian::QueryParser::FLAG_BOOLEAN },
83  { "boolean_any_case", Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE },
84  { "cjk_ngram", Xapian::QueryParser::FLAG_CJK_NGRAM },
85  { "default", Xapian::QueryParser::FLAG_DEFAULT },
86  { "lovehate", Xapian::QueryParser::FLAG_LOVEHATE },
87  { "no_positions", Xapian::QueryParser::FLAG_NO_POSITIONS },
88  { "partial", Xapian::QueryParser::FLAG_PARTIAL },
90  { "pure_not", Xapian::QueryParser::FLAG_PURE_NOT },
91  { "spelling_correction", Xapian::QueryParser::FLAG_SPELLING_CORRECTION },
92  { "synonym", Xapian::QueryParser::FLAG_SYNONYM },
94 };
95 
// Names accepted by the -o/--default-op option, mapped to Xapian::Query
// operator codes.  decode() binary searches this table, so the entries must
// stay sorted by name in strcmp() order.
96 static const tab_entry default_op_tab[] = {
97  { "and", Xapian::Query::OP_AND },
98  { "elite_set", Xapian::Query::OP_ELITE_SET },
99  { "max", Xapian::Query::OP_MAX },
100  { "near", Xapian::Query::OP_NEAR },
101  { "or", Xapian::Query::OP_OR },
102  { "phrase", Xapian::Query::OP_PHRASE },
103  { "synonym", Xapian::Query::OP_SYNONYM }
104 };
105 
// NOTE(review): the enumerator list (embedded listing lines 107-121) is
// missing from this extract.  wt_tab and the switch on `weight` in main()
// use WEIGHT_* names, which are presumably declared here - confirm against
// the upstream file.
106 enum {
122 };
123 
// Names accepted by the -w/--weight option, mapped to the WEIGHT_* codes
// which main() switches on.  decode() binary searches this table, so the
// entries must stay sorted by name in strcmp() order (note that '+' sorts
// before letters, hence "bm25+" before "bool" and "pl2+" before "tfidf").
124 static const tab_entry wt_tab[] = {
125  { "bb2", WEIGHT_BB2 },
126  { "bm25", WEIGHT_BM25 },
127  { "bm25+", WEIGHT_BM25PLUS },
128  { "bool", WEIGHT_BOOL },
129  { "coord", WEIGHT_COORD },
130  { "dlh", WEIGHT_DLH },
131  { "dph", WEIGHT_DPH },
132  { "ifb2", WEIGHT_IFB2 },
133  { "ineb2", WEIGHT_INEB2 },
134  { "inl2", WEIGHT_INL2 },
135  { "lm", WEIGHT_LM },
136  { "pl2", WEIGHT_PL2 },
137  { "pl2+", WEIGHT_PL2PLUS },
138  { "tfidf", WEIGHT_TFIDF },
139  { "trad", WEIGHT_TRAD }
140 };
141 
/// The indentation written at the start of each line by print_table().
#define INDENT \
" "

/** Print the names from a string to integer mapping table.
 *
 *  The names are written to stdout as a comma-separated list, with a line
 *  break (followed by INDENT) before the first name and whenever the next
 *  name would reach column 78.
 *
 *  @param table  Range of entries, each with a name member @c s.
 *
 *  @return  Always '\n'.  The final line is left unterminated, so callers
 *           are expected to output the returned character after the list.
 */
template<typename T>
static char
print_table(const T& table)
{
    const std::size_t wrap_at = 78;
    // Start value for the column: big enough to force a line break before
    // the first name, and also used to tell that no name has been written
    // yet (so no separating comma is wanted).
    const std::size_t not_started = 256;

    std::size_t column = not_started;
    for (const auto& entry : table) {
	const std::size_t width = std::strlen(entry.s);
	if (column < not_started) std::cout << ',';
	if (column + width < wrap_at) {
	    std::cout << ' ';
	} else {
	    std::cout << "\n" INDENT;
	    column = sizeof(INDENT) - 2;
	}
	std::cout << entry.s;
	column += width + 2;
    }
    return '\n';
}
172 
173 static void show_usage() {
174  cout << "Usage: " PROG_NAME " [OPTIONS] 'QUERY'\n"
175 "NB: QUERY should be quoted to protect it from the shell.\n\n"
176 "Options:\n"
177 " -d, --db=DIRECTORY database to search (multiple databases may\n"
178 " be specified)\n"
179 " -m, --msize=MSIZE maximum number of matches to return\n"
180 " -c, --check-at-least=HOWMANY minimum number of matches to check\n"
181 " -s, --stemmer=LANG set the stemming language, the default is\n"
182 " 'english' (pass 'none' to disable stemming)\n"
183 " -p, --prefix=PFX:TERMPFX add a prefix\n"
184 " -b, --boolean-prefix=PFX:TERMPFX add a boolean prefix\n"
185 " -f, --flags=FLAG1[,FLAG2]... specify QueryParser flags (default:\n"
186 " default). Valid flags:"
187 << print_table(flag_tab) <<
188 " -o, --default-op=OP specify QueryParser default operator\n"
189 " (default: or). Valid operators:"
190 << print_table(default_op_tab) <<
191 " -w, --weight=SCHEME specify weighting scheme to use\n"
192 " (default: bm25). Valid schemes:"
193 << print_table(wt_tab) <<
194 " -F, --freqs show query term frequencies\n"
195 " -h, --help display this help and exit\n"
196 " -v, --version output version information and exit\n";
197 }
198 
// Entry point for the quest command line search tool.
//
// Parses the command line options, builds a Xapian::Query from the single
// QUERY argument using Xapian::QueryParser and prints its description.  If
// at least one -d/--db option was given, the query is then run and the
// match count bounds and the matches themselves are printed.
//
// Exits with status 1 on a bad option value, wrong argument count or a
// Xapian exception; exits with status 0 after --help/--version or when no
// database was specified.
//
// NOTE(review): this listing is an extract in which some source lines are
// absent (the embedded line numbers have gaps); the affected places are
// marked below.
199 int
200 main(int argc, char **argv)
201 try {
202  const char * opts = "d:m:c:s:p:b:f:o:w:Fhv";
203  static const struct option long_opts[] = {
204  { "db", required_argument, 0, 'd' },
205  { "msize", required_argument, 0, 'm' },
206  { "check-at-least", required_argument, 0, 'c' },
207  { "stemmer", required_argument, 0, 's' },
208  { "prefix", required_argument, 0, 'p' },
209  { "boolean-prefix", required_argument, 0, 'b' },
210  { "flags", required_argument, 0, 'f' },
211  { "default-op", required_argument, 0, 'o' },
212  { "weight", required_argument, 0, 'w' },
213  { "freqs", no_argument, 0, 'F' },
214  { "help", no_argument, 0, 'h' },
215  { "version", no_argument, 0, 'v' },
216  { NULL, 0, 0, 0}
217  };
218 
219  Xapian::SimpleStopper mystopper(sw, sw + sizeof(sw) / sizeof(sw[0]));
220  Xapian::Stem stemmer("english");
221  Xapian::doccount msize = 10;
222  Xapian::doccount check_at_least = 0;
223 
224  bool have_database = false;
225 
226  Xapian::Database db;
227  Xapian::QueryParser parser;
228  unsigned flags = 0;
229  bool flags_set = false;
230  bool show_termfreqs = false;
    // -1 means no weighting scheme was selected with -w/--weight.
231  int weight = -1;
232 
233  int c;
234  while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
235  switch (c) {
236  case 'm': {
237  char * p;
238  unsigned long v = strtoul(optarg, &p, 10);
239  msize = static_cast<Xapian::doccount>(v);
    // Reject trailing junk after the number (*p) and values which did not
    // survive the conversion to Xapian::doccount unchanged.
240  if (*p || v != msize) {
241  cerr << PROG_NAME": Bad value '" << optarg
242  << "' passed for msize\n";
243  exit(1);
244  }
245  break;
246  }
247  case 'c': {
248  char * p;
249  unsigned long v = strtoul(optarg, &p, 10);
250  check_at_least = static_cast<Xapian::doccount>(v);
    // Same validation as for msize above.
251  if (*p || v != check_at_least) {
252  cerr << PROG_NAME": Bad value '" << optarg
253  << "' passed for check_at_least\n";
254  exit(1);
255  }
256  break;
257  }
258  case 'd':
    // NOTE(review): embedded line 259 is missing from this extract;
    // presumably it adds the database named by optarg to `db` - confirm
    // against the upstream file.
260  have_database = true;
261  break;
262  case 's':
263  try {
264  stemmer = Xapian::Stem(optarg);
265  } catch (const Xapian::InvalidArgumentError &) {
266  cerr << "Unknown stemming language '" << optarg << "'.\n"
267  "Available language names are: "
    // NOTE(review): embedded line 268 (the rest of this output statement)
    // is missing from this extract.
269  exit(1);
270  }
271  break;
272  case 'b': case 'p': {
    // The argument has the form PFX:TERMPFX; split it at the first ':'.
273  const char * colon = strchr(optarg, ':');
274  if (colon == NULL) {
275  cerr << argv[0] << ": need ':' when setting prefix\n";
276  exit(1);
277  }
278  string prefix(optarg, colon - optarg);
279  string termprefix(colon + 1);
280  if (c == 'b') {
281  parser.add_boolean_prefix(prefix, termprefix);
282  } else {
283  parser.add_prefix(prefix, termprefix);
284  }
285  break;
286  }
287  case 'f':
288  flags_set = true;
    // Walk the comma-separated flag names, overwriting each ',' in optarg
    // with a NUL so every name can be decoded as a C string, and OR the
    // decoded values together.
289  do {
290  char * comma = strchr(optarg, ',');
291  if (comma)
292  *comma++ = '\0';
293  int flag = decode(flag_tab, optarg);
294  if (flag < 0) {
295  cerr << "Unknown flag '" << optarg << "'\n";
296  exit(1);
297  }
298  flags |= unsigned(flag);
299  optarg = comma;
300  } while (optarg);
301  break;
302  case 'o': {
303  int op = decode(default_op_tab, optarg);
304  if (op < 0) {
305  cerr << "Unknown op '" << optarg << "'\n";
306  exit(1);
307  }
308  parser.set_default_op(static_cast<Xapian::Query::op>(op));
309  break;
310  }
311  case 'w': {
312  weight = decode(wt_tab, optarg);
313  if (weight < 0) {
314  cerr << "Unknown weighting scheme '" << optarg << "'\n";
315  exit(1);
316  }
317  break;
318  }
319  case 'F':
320  show_termfreqs = true;
321  break;
322  case 'v':
323  cout << PROG_NAME " - " PACKAGE_STRING "\n";
324  exit(0);
325  case 'h':
326  cout << PROG_NAME " - " PROG_DESC "\n\n";
327  show_usage();
328  exit(0);
329  case ':': // missing parameter
330  case '?': // unknown option
331  show_usage();
332  exit(1);
333  }
334  }
335 
    // Exactly one non-option argument (the query string) is required.
336  if (argc - optind != 1) {
337  show_usage();
338  exit(1);
339  }
340 
341  parser.set_database(db);
342  parser.set_stemmer(stemmer);
    // NOTE(review): embedded line 343 is missing from this extract.
344  parser.set_stopper(&mystopper);
345 
346  if (!flags_set) {
    // NOTE(review): embedded line 347 is missing from this extract;
    // presumably it gives `flags` its default value when -f/--flags was
    // not used - confirm against the upstream file.
348  }
349  Xapian::Query query = parser.parse_query(argv[optind], flags);
350  const string & correction = parser.get_corrected_query_string();
351  if (!correction.empty())
352  cout << "Did you mean: " << correction << "\n\n";
353 
354  cout << "Parsed Query: " << query.get_description() << '\n';
355 
356  if (!have_database) {
357  cout << "No database specified so not running the query.\n";
358  exit(0);
359  }
360 
361  Xapian::Enquire enquire(db);
362  enquire.set_query(query);
363 
    // NOTE(review): the statement belonging to each case below (one
    // embedded line per case) is missing from this extract; presumably
    // each one selects the corresponding weighting scheme on `enquire` -
    // confirm against the upstream file.  The initial value -1 matches no
    // case.
364  switch (weight) {
365  case WEIGHT_BB2:
367  break;
368  case WEIGHT_BOOL:
370  break;
371  case WEIGHT_COORD:
373  break;
374  case WEIGHT_BM25:
376  break;
377  case WEIGHT_BM25PLUS:
379  break;
380  case WEIGHT_DLH:
382  break;
383  case WEIGHT_DPH:
385  break;
386  case WEIGHT_IFB2:
388  break;
389  case WEIGHT_INEB2:
391  break;
392  case WEIGHT_INL2:
394  break;
395  case WEIGHT_LM:
397  break;
398  case WEIGHT_PL2:
400  break;
401  case WEIGHT_PL2PLUS:
403  break;
404  case WEIGHT_TFIDF:
406  break;
407  case WEIGHT_TRAD:
409  break;
410  }
411 
412  Xapian::MSet mset = enquire.get_mset(0, msize, check_at_least);
413 
414  if (show_termfreqs) {
415  cout << "Query term frequencies:\n";
416  for (auto t = query.get_terms_begin();
417  t != query.get_terms_end();
418  ++t) {
419  const string& term = *t;
420  cout << " " << mset.get_termfreq(term) << '\t' << term << '\n';
421  }
422  }
423  auto lower_bound = mset.get_matches_lower_bound();
424  auto estimate = mset.get_matches_estimated();
425  auto upper_bound = mset.get_matches_upper_bound();
426  if (lower_bound == upper_bound) {
427  cout << "Exactly " << estimate << " matches\n";
428  } else {
429  cout << "Between " << lower_bound << " and " << upper_bound
430  << " matches, best estimate is " << estimate << '\n';
431  }
432 
433  cout << "MSet:\n";
    // One entry per match: "<*i>: [<weight>]" followed by the document
    // data on the next line.
434  for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
435  Xapian::Document doc = i.get_document();
436  string data = doc.get_data();
437  cout << *i << ": [" << i.get_weight() << "]\n" << data << "\n";
438  }
439  cout << flush;
    // NOTE(review): both handlers below report the error on cout, whereas
    // the option errors above go to cerr.
440 } catch (const Xapian::QueryParserError & e) {
441  cout << "Couldn't parse query: " << e.get_msg() << '\n';
442  exit(1);
443 } catch (const Xapian::Error & err) {
444  cout << err.get_description() << '\n';
445  exit(1);
446 }
Support AND, OR, etc even if they aren't in ALLCAPS.
Definition: queryparser.h:794
static const tab_entry wt_tab[]
Definition: quest.cc:124
static const char *const sw[]
Definition: quest.cc:40
Wrappers to allow GNU getopt to be used cleanly from C++ code.
static void show_usage()
Definition: quest.cc:173
Simple implementation of Stopper class - this will suit most users.
Definition: queryparser.h:100
int optind
Definition: getopt.cc:94
void set_default_op(Query::op default_op)
Set the default operator.
Definition: queryparser.cc:102
This class is used to access a database, or a group of databases.
Definition: database.h:68
int gnu_getopt_long(int argc_, char *const *argv_, const char *shortopts_, const struct option *longopts_, int *optind_)
Definition: gnu_getopt.h:97
int main(int argc, char **argv)
Definition: quest.cc:200
const TermIterator get_terms_begin() const
Begin iterator for terms in the query object.
Definition: query.cc:135
Class representing a stemming algorithm.
Definition: stem.h:62
double weight
The weight of a document or term.
Definition: types.h:122
void set_stopper(const Stopper *stop=NULL)
Set the stopper.
Definition: queryparser.cc:96
std::string get_corrected_query_string() const
Get the spelling-corrected query string.
Definition: queryparser.cc:242
bool operator<(const char *s_) const
Definition: quest.cc:58
Xapian::doccount get_matches_lower_bound() const
Lower bound on the total number of matching documents.
Definition: omenquire.cc:246
static const char * opts
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
Build a Xapian::Query object from a user query string.
Definition: queryparser.h:778
static int decode(const T(&table)[N], const char *s)
Decode a string to an integer.
Definition: quest.cc:70
Class representing a list of search results.
Definition: mset.h:44
This class implements the InL2 weighting scheme.
Definition: weight.h:838
unsigned f
Definition: quest.cc:56
STL namespace.
const char * s
Definition: quest.cc:54
Pick the maximum weight of any subquery.
Definition: query.h:249
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:932
Indicates a query string can't be parsed.
Definition: error.h:887
Produce a query which doesn't use positional information.
Definition: queryparser.h:912
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1257
static Xapian::Stem stemmer
Definition: stemtest.cc:41
static const tab_entry default_op_tab[]
Definition: quest.cc:96
Enable automatic use of synonyms for single terms and groups of terms.
Definition: queryparser.h:871
Enable partial matching.
Definition: queryparser.h:837
void set_stemmer(const Xapian::Stem &stemmer)
Set the stemmer.
Definition: queryparser.cc:84
static const tab_entry flag_tab[]
Definition: quest.cc:78
#define no_argument
Definition: gnu_getopt.h:79
Xapian::doccount get_matches_upper_bound() const
Upper bound on the total number of matching documents.
Definition: omenquire.cc:262
This class implements the BB2 weighting scheme.
Definition: weight.h:1054
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1509
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Class implementing a "boolean" weighting scheme.
Definition: weight.h:422
Pick the best N subqueries and combine with OP_OR.
Definition: query.h:215
void set_stemming_strategy(stem_strategy strategy)
Set the stemming strategy.
Definition: queryparser.cc:90
Iterator over a Xapian::MSet.
Definition: mset.h:351
Match only documents where all subqueries match near and in order.
Definition: query.h:152
Common string to integer map entry for option decoding.
Definition: quest.cc:53
Public interfaces for the Xapian library.
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:181
void add_boolean_prefix(const std::string &field, const std::string &prefix, const std::string *grouping=NULL)
Add a boolean term prefix allowing the user to restrict a search with a boolean filter specified in t...
Definition: queryparser.cc:197
char * optarg
Definition: getopt.cc:79
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:607
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:612
#define INDENT
The number of spaces to indent by in print_table.
Definition: quest.cc:146
Support quoted phrases.
Definition: queryparser.h:790
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:768
#define PROG_DESC
Definition: quest.cc:37
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1130
This class implements the PL2 weighting scheme.
Definition: weight.h:1190
This class implements the IneB2 weighting scheme.
Definition: weight.h:982
#define required_argument
Definition: gnu_getopt.h:80
Query parse_query(const std::string &query_string, unsigned flags=FLAG_DEFAULT, const std::string &default_prefix=std::string())
Parse a query.
Definition: queryparser.cc:161
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
This class implements the IfB2 weighting scheme.
Definition: weight.h:909
int * flag
Definition: gnu_getopt.h:75
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
Enable automatic use of synonyms for single terms.
Definition: queryparser.h:864
Xapian::doccount get_matches_estimated() const
Estimate of the total number of matching documents.
Definition: omenquire.cc:253
void set_database(const Database &db)
Specify the database being searched.
Definition: queryparser.cc:141
Accumulate unstem and stoplist results.
Definition: queryparser.h:901
std::string get_description() const
Return a string describing this object.
Definition: query.cc:232
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Support AND, OR, etc and bracketed subexpressions.
Definition: queryparser.h:788
Match only documents where all subqueries match near each other.
Definition: query.h:140
This class implements the DPH weighting scheme.
Definition: weight.h:1353
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
Match documents which at least one subquery matches.
Definition: query.h:92
Allow queries such as 'NOT apples'.
Definition: queryparser.h:816
#define PACKAGE_STRING
Definition: config.h:337
Xapian::doccount get_termfreq(const std::string &term) const
Get the termfreq of a term.
Definition: omenquire.cc:206
Enable generation of n-grams from CJK text.
Definition: queryparser.h:886
static char print_table(const T &table)
Print string from a string to integer mapping table.
Definition: quest.cc:155
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
Class representing a query.
Definition: query.h:46
void add_prefix(const std::string &field, const std::string &prefix)
Add a free-text field term prefix.
Definition: queryparser.cc:183
std::string get_data() const
Get data stored in the document.
Definition: omdocument.cc:71
#define PROG_NAME
Definition: quest.cc:36
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1406
const TermIterator get_terms_end() const
End iterator for terms in the query object.
Definition: query.h:502
Enable synonym operator '~'.
Definition: queryparser.h:858
A handle representing a document in a Xapian database.
Definition: document.h:61
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:639
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:535
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:447