xapian-core  2.0.0
enquire.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2017,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "xapian/enquire.h"
24 #include "enquireinternal.h"
25 
26 #include "expand/esetinternal.h"
27 #include "expand/expandweight.h"
28 #include "matcher/matcher.h"
29 #include "msetinternal.h"
30 #include "omassert.h"
31 #include "vectortermlist.h"
32 #include "weight/weightinternal.h"
33 #include "xapian/database.h"
34 #include "xapian/error.h"
35 #include "xapian/expanddecider.h"
36 #include "xapian/intrusive_ptr.h"
37 #include "xapian/keymaker.h"
38 #include "xapian/matchspy.h"
39 #include "xapian/query.h"
40 #include "xapian/rset.h"
41 #include "xapian/weight.h"
42 
43 #include <memory>
44 #include <string>
45 #include <vector>
46 
47 using namespace std;
48 
49 [[noreturn]]
50 static void
51 throw_invalid_arg(const char* msg) {
53 }
54 
55 namespace Xapian {
56 
57 Enquire::Enquire(const Enquire&) = default;
58 
59 Enquire&
60 Enquire::operator=(const Enquire&) = default;
61 
62 Enquire::Enquire(Enquire&&) = default;
63 
64 Enquire&
65 Enquire::operator=(Enquire&&) = default;
66 
67 Enquire::Enquire(const Database& db) : internal(new Enquire::Internal(db)) {}
68 
70 
71 void
72 Enquire::set_query(const Query& query, termcount query_length)
73 {
74  internal->query = query;
75  internal->query_length = query_length;
76 }
77 
78 const Query&
80 {
81  return internal->query;
82 }
83 
84 void
86 {
87  internal->weight.reset(weight.clone());
88 }
89 
90 void
92 {
93  internal->order = order;
94 }
95 
96 void
98 {
99  internal->sort_by = Internal::REL;
100 }
101 
102 void
103 Enquire::set_sort_by_value(valueno sort_key, bool reverse)
104 {
105  internal->sort_by = Internal::VAL;
106  internal->sort_functor = NULL;
107  internal->sort_key = sort_key;
108  internal->sort_val_reverse = reverse;
109 }
110 
111 void
112 Enquire::set_sort_by_key(KeyMaker* sorter, bool reverse)
113 {
114  if (sorter == NULL) {
115  throw_invalid_arg("Enquire::set_sort_by_key(): sorter cannot be NULL");
116  }
117  internal->sort_by = Internal::VAL;
118  internal->sort_functor = sorter;
119  internal->sort_val_reverse = reverse;
120 }
121 
122 void
124 {
125  internal->sort_by = Internal::VAL_REL;
126  internal->sort_functor = NULL;
127  internal->sort_key = sort_key;
128  internal->sort_val_reverse = reverse;
129 }
130 
131 void
133 {
134  if (sorter == NULL) {
135  throw_invalid_arg("Enquire::set_sort_by_key_then_relevance(): "
136  "sorter cannot be NULL");
137  }
138  internal->sort_by = Internal::VAL_REL;
139  internal->sort_functor = sorter;
140  internal->sort_val_reverse = reverse;
141 }
142 
143 void
145 {
146  internal->sort_by = Internal::REL_VAL;
147  internal->sort_functor = NULL;
148  internal->sort_key = sort_key;
149  internal->sort_val_reverse = reverse;
150 }
151 
152 void
154 {
155  if (sorter == NULL) {
156  throw_invalid_arg("Enquire::set_sort_by_relevance_then_key(): "
157  "sorter cannot be NULL");
158  }
159  internal->sort_by = Internal::REL_VAL;
160  internal->sort_functor = sorter;
161  internal->sort_val_reverse = reverse;
162 }
163 
164 void
165 Enquire::set_collapse_key(valueno collapse_key, doccount collapse_max)
166 {
167  internal->collapse_key = collapse_key;
168  internal->collapse_max = collapse_max;
169 }
170 
171 void
172 Enquire::set_cutoff(int percent_threshold, double weight_threshold)
173 {
174  internal->percent_threshold = percent_threshold;
175  internal->weight_threshold = weight_threshold;
176 }
177 
178 void
180 {
182  if (spy == NULL)
183  throw_invalid_arg("Enquire::add_matchspy(): spy cannot be NULL");
184  internal->matchspies.push_back(opt_intrusive_ptr<MatchSpy>(spy));
185 }
186 
187 void
189 {
190  internal->matchspies.clear();
191 }
192 
193 void
194 Enquire::set_time_limit(double time_limit)
195 {
196  internal->time_limit = time_limit;
197 }
198 
199 MSet
201  doccount maxitems,
202  doccount checkatleast,
203  const RSet* rset,
204  const MatchDecider* mdecider) const
205 {
206  return internal->get_mset(first, maxitems, checkatleast, rset, mdecider);
207 }
208 
211 {
212  return internal->get_matching_terms_begin(did);
213 }
214 
215 void
216 Enquire::set_expansion_scheme(std::string_view eweightname,
217  double expand_k) const
218 {
219  if (eweightname == "bo1") {
220  internal->eweight = Enquire::Internal::EXPAND_BO1;
221  } else if (eweightname == "prob" || eweightname == "trad") {
222  internal->eweight = Enquire::Internal::EXPAND_PROB;
223  } else {
224  throw_invalid_arg("Enquire::set_expansion_scheme(): eweightname must "
225  "be 'bo1', 'prob' or 'trad'");
226  }
227  internal->expand_k = expand_k;
228 }
229 
230 ESet
232  const RSet& rset,
233  int flags,
234  const ExpandDecider* edecider,
235  double min_weight) const
236 {
237  return internal->get_eset(maxitems, rset, flags, edecider, min_weight);
238 }
239 
240 std::string
242 {
243  string desc = "Enquire(db=";
244  desc += internal->db.get_description();
245  if (!internal->query.empty()) {
246  desc += ", query=";
247  desc += internal->query.get_description();
248  }
249  desc += ')';
250  return desc;
251 }
252 
254  : db(db_) {}
255 
256 MSet
258  doccount maxitems,
259  doccount checkatleast,
260  const RSet* rset,
261  const MatchDecider* mdecider) const
262 {
263  if (query.empty()) {
264  MSet mset;
265  mset.internal->set_first(first);
266  return mset;
267  }
268 
269  if (percent_threshold && (sort_by == VAL || sort_by == VAL_REL)) {
270  throw Xapian::UnimplementedError("Use of a percentage cutoff while "
271  "sorting primary by value isn't "
272  "currently supported");
273  }
274 
275  // Lazily initialise weight to its default if necessary.
276  if (!weight)
277  weight.reset(new BM25Weight);
278 
279  // Lazily initialise query_length if it wasn't explicitly specified.
280  if (query_length == 0) {
281  query_length = query.get_length();
282  }
283 
284  Xapian::doccount first_orig = first;
285  {
286  Xapian::doccount docs = db.get_doccount();
287  first = min(first, docs);
288  maxitems = min(maxitems, docs - first);
289  checkatleast = min(checkatleast, docs);
290  checkatleast = max(checkatleast, first + maxitems);
291  }
292 
293  unique_ptr<Xapian::Weight::Internal> stats(new Xapian::Weight::Internal);
294  ::Matcher match(db,
295  query,
296  query_length,
297  rset,
298  *stats,
299  *weight,
300  (mdecider != NULL),
301  collapse_key,
302  collapse_max,
303  percent_threshold,
304  weight_threshold,
305  order,
306  sort_key,
307  sort_by,
308  sort_val_reverse,
309  time_limit,
310  matchspies);
311 
312  MSet mset = match.get_mset(first,
313  maxitems,
314  checkatleast,
315  *stats,
316  *weight,
317  mdecider,
318  sort_functor.get(),
319  collapse_key,
320  collapse_max,
321  percent_threshold,
322  weight_threshold,
323  order,
324  sort_key,
325  sort_by,
326  sort_val_reverse,
327  time_limit,
328  matchspies);
329 
330  if (first_orig != first) {
331  mset.internal->set_first(first_orig);
332  }
333 
334  mset.internal->set_enquire(this);
335 
336  if (!mset.internal->get_stats()) {
337  mset.internal->set_stats(stats.release());
338  }
339 
340  return mset;
341 }
342 
345 {
346  if (query.empty())
347  return TermIterator();
348 
349  struct term_and_pos {
350  string term;
352 
353  term_and_pos(const string& term_, Xapian::termpos pos_)
354  : term(term_), pos(pos_) {}
355  };
356 
357  vector<term_and_pos> query_terms;
358  Xapian::termpos pos = 1;
359  for (auto t = query.get_terms_begin(); t != query.get_terms_end(); ++t) {
360  query_terms.emplace_back(*t, pos++);
361  }
362 
363  if (query_terms.empty())
364  return TermIterator();
365 
366  // Reorder by term, secondary sort by position.
367  sort(query_terms.begin(), query_terms.end(),
368  [](const term_and_pos& a, const term_and_pos& b) {
369  int cmp = a.term.compare(b.term);
370  return cmp ? cmp < 0 : a.pos < b.pos;
371  });
372 
373  // Loop through the query terms, skipping the document terms for each to
374  // see which match, and shuffling down the matching ones. Also discard
375  // repeats, keeping the smallest position.
376  size_t i = 0, j = 0;
377  auto t = db.termlist_begin(did);
378  do {
379  const string& term = query_terms[i].term;
380  if (j == 0 || term != query_terms[j - 1].term) {
381  t.skip_to(term);
382  if (t == db.termlist_end(did)) {
383  break;
384  }
385 
386  if (*t == term) {
387  // Matched, so move down if necessary.
388  if (i != j)
389  query_terms[j] = std::move(query_terms[i]);
390  ++j;
391  }
392  }
393  } while (++i != query_terms.size());
394 
395  // Truncate to leave just the matching terms.
396  query_terms.erase(query_terms.begin() + j, query_terms.end());
397 
398  // Reorder by ascending query position.
399  sort(query_terms.begin(), query_terms.end(),
400  [](const term_and_pos& a, const term_and_pos& b) {
401  return a.pos < b.pos;
402  });
403 
404  // Iterator adaptor to present query_terms as a container of just strings.
405  struct Itor {
406  vector<term_and_pos>::const_iterator it;
407 
408  explicit
409  Itor(vector<term_and_pos>::const_iterator it_) : it(it_) {}
410 
411  const std::string& operator*() const {
412  return it->term;
413  }
414 
415  Itor& operator++() {
416  ++it;
417  return *this;
418  }
419 
420  Itor operator++(int) {
421  Itor retval = *this;
422  ++it;
423  return retval;
424  }
425 
426  bool operator!=(const Itor& o) { return it != o.it; }
427  };
428 
429  return TermIterator(new VectorTermList(Itor(query_terms.cbegin()),
430  Itor(query_terms.cend())));
431 }
432 
433 ESet
435  const RSet& rset,
436  int flags,
437  const ExpandDecider* edecider_,
438  double min_weight) const
439 {
441  opt_intrusive_ptr<const ExpandDecider> edecider(edecider_);
442 
443  Xapian::ESet eset;
444 
445  if (maxitems == 0 || rset.empty()) {
446  // Either we were asked for no results, or wouldn't produce any
447  // because no documents were marked as relevant.
448  return eset;
449  }
450 
451  // Excluding query terms is a no-op without a query.
452  if ((flags & Enquire::INCLUDE_QUERY_TERMS) == 0 && !query.empty()) {
454  query.get_terms_end());
455  if (!edecider) {
456  edecider = edft->release();
457  } else {
458  // Make sure ExpandDeciderFilterTerms doesn't leak if new throws.
459  opt_intrusive_ptr<const ExpandDecider> ptr(edft->release());
460  edecider = (new ExpandDeciderAnd(ptr.get(),
461  edecider.get()))->release();
462  }
463  }
464 
465  bool use_exact_termfreq = flags & Enquire::USE_EXACT_TERMFREQ;
466  if (eweight == Enquire::Internal::EXPAND_BO1) {
468  Bo1EWeight bo1eweight(db, rset.size(), use_exact_termfreq);
469  eset.internal->expand(maxitems, db, rset, edecider.get(), bo1eweight,
470  min_weight);
471  } else {
474  ProbEWeight probeweight(db, rset.size(), use_exact_termfreq, expand_k);
475  eset.internal->expand(maxitems, db, rset, edecider.get(), probeweight,
476  min_weight);
477  }
478 
479  return eset;
480 }
481 
482 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
Xapian::MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, Xapian::Weight::Internal &stats, const Xapian::Weight &wtscheme, const Xapian::MatchDecider *mdecider, const Xapian::KeyMaker *sorter, Xapian::valueno collapse_key, Xapian::doccount collapse_max, int percent_threshold, double weight_threshold, Xapian::Enquire::docid_order order, Xapian::valueno sort_key, Xapian::Enquire::Internal::sort_setting sort_by, bool sort_val_reverse, double time_limit, const std::vector< opt_ptr_spy > &matchspies)
Run the match and produce an MSet object.
Definition: matcher.cc:577
This class stores a list of terms.
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:1050
An indexed database of documents.
Definition: database.h:75
Class representing a list of query expansion terms.
Definition: eset.h:42
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: eset.h:47
Internal(const Database &db_)
Definition: enquire.cc:253
MSet get_mset(doccount first, doccount maxitems, doccount checkatleast, const RSet *rset, const MatchDecider *mdecider) const
Definition: enquire.cc:257
ESet get_eset(termcount maxitems, const RSet &rset, int flags, const ExpandDecider *edecider_, double min_weight) const
Definition: enquire.cc:434
TermIterator get_matching_terms_begin(docid did) const
Definition: enquire.cc:344
Querying session.
Definition: enquire.h:57
void set_weighting_scheme(const Weight &weight)
Set the weighting scheme to use.
Definition: enquire.cc:85
static const int USE_EXACT_TERMFREQ
Flag telling get_eset() to always use the exact term frequency.
Definition: enquire.h:479
void add_matchspy(MatchSpy *spy) XAPIAN_NONNULL()
Add a matchspy.
Definition: enquire.cc:179
void set_sort_by_key(KeyMaker *sorter, bool reverse) XAPIAN_NONNULL()
Set the sorting to be by key generated from values only.
Definition: enquire.cc:112
MSet get_mset(doccount first, doccount maxitems, doccount checkatleast=0, const RSet *rset=NULL, const MatchDecider *mdecider=NULL) const
Run the query.
Definition: enquire.cc:200
void clear_matchspies()
Remove all the matchspies.
Definition: enquire.cc:188
TermIterator get_matching_terms_begin(docid did) const
Iterate query terms matching a document.
Definition: enquire.cc:210
void set_time_limit(double time_limit)
Set a time limit for the match.
Definition: enquire.cc:194
void set_sort_by_value_then_relevance(valueno sort_key, bool reverse)
Set the sorting to be by value, then by relevance for documents with the same value.
Definition: enquire.cc:123
void set_cutoff(int percent_threshold, double weight_threshold=0)
Set lower bounds on percentage and/or weight.
Definition: enquire.cc:172
void set_expansion_scheme(std::string_view eweightname, double expand_k=1.0) const
Set the weighting scheme to use for expansion.
Definition: enquire.cc:216
void set_query(const Query &query, termcount query_length=0)
Set the query.
Definition: enquire.cc:72
ESet get_eset(termcount maxitems, const RSet &rset, int flags=0, const ExpandDecider *edecider=NULL, double min_weight=0.0) const
Perform query expansion.
Definition: enquire.cc:231
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: enquire.h:60
void set_sort_by_relevance_then_key(KeyMaker *sorter, bool reverse) XAPIAN_NONNULL()
Set the sorting to be by relevance, then by keys generated from values.
Definition: enquire.cc:153
void set_sort_by_relevance_then_value(valueno sort_key, bool reverse)
Set the sorting to be by relevance then value.
Definition: enquire.cc:144
std::string get_description() const
Return a string describing this object.
Definition: enquire.cc:241
~Enquire()
Destructor.
Definition: enquire.cc:69
void set_sort_by_relevance()
Set the sorting to be by relevance only.
Definition: enquire.cc:97
void set_sort_by_value(valueno sort_key, bool reverse)
Set the sorting to be by value only.
Definition: enquire.cc:103
const Query & get_query() const
Get the currently set query.
Definition: enquire.cc:79
void set_collapse_key(valueno collapse_key, doccount collapse_max=1)
Control collapsing of results.
Definition: enquire.cc:165
void set_docid_order(docid_order order)
Set sort order for document IDs.
Definition: enquire.cc:91
void set_sort_by_key_then_relevance(KeyMaker *sorter, bool reverse) XAPIAN_NONNULL()
Set the sorting to be by keys generated from values, then by relevance for documents with identical keys.
Definition: enquire.cc:132
docid_order
Ordering of docids.
Definition: enquire.h:130
static const int INCLUDE_QUERY_TERMS
Flag telling get_eset() to allow query terms in Xapian::ESet.
Definition: enquire.h:469
ExpandDecider subclass which rejects terms using two ExpandDeciders.
Definition: expanddecider.h:88
ExpandDecider subclass which rejects terms in a specified list.
Virtual base class for expand decider functor.
Definition: expanddecider.h:38
This class implements the Bo1 scheme for query expansion.
Definition: expandweight.h:233
This class implements the probabilistic scheme for query expansion.
Definition: expandweight.h:200
A smart pointer that optionally uses intrusive reference counting.
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Virtual base class for key making functors.
Definition: keymaker.h:44
Class representing a list of search results.
Definition: mset.h:46
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: mset.h:78
Abstract base class for match deciders.
Definition: matchdecider.h:37
Abstract base class for match spies.
Definition: matchspy.h:50
Class representing a query.
Definition: query.h:45
const TermIterator get_terms_begin() const
Begin iterator for terms in the query object.
Definition: query.cc:198
const TermIterator get_terms_end() const noexcept
End iterator for terms in the query object.
Definition: query.h:639
bool empty() const noexcept
Check if this query is Xapian::Query::MatchNothing.
Definition: query.h:661
Xapian::termcount get_length() const noexcept
Return the length of this query object.
Definition: query.cc:250
Class representing a set of documents judged as relevant.
Definition: rset.h:39
Xapian::doccount size() const
Return number of documents in this RSet object.
Definition: rset.cc:49
bool empty() const
Return true if this RSet object is empty.
Definition: rset.h:81
Class for iterating over a list of terms.
Definition: termiterator.h:41
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Class to hold statistics for a given collection.
Abstract base class for weighting schemes.
Definition: weight.h:38
virtual Weight * clone() const =0
Clone this object.
An indexed database of documents.
string term
Xapian::termpos pos
static void throw_invalid_arg(const char *msg)
Definition: enquire.cc:51
Querying session.
Xapian::Enquire internals.
Hierarchy of classes which Xapian can throw as exceptions.
Xapian::ESet::Internal class.
Allow rejection of terms during ESet generation.
Collate statistics and calculate the term weights for the ESet.
Build key strings for MSet ordering or collapsing.
static constexpr auto VAL_REL
Definition: matcher.cc:69
static constexpr auto VAL
Definition: matcher.cc:68
Matcher class.
MatchSpy implementation.
Xapian::MSet internals.
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
bool operator!=(const ESetIterator &a, const ESetIterator &b) noexcept
Inequality test for ESetIterator objects.
Definition: eset.h:278
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
Definition: query.h:827
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
Xapian::Query API class.
Set of documents judged as relevant.
A vector-like container of terms which can be iterated.
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.