xapian-core  1.4.27
multimatch.cc
Go to the documentation of this file.
1 /* multimatch.cc
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2001,2002 Ananova Ltd
5  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
6  * Copyright 2003 Orange PCS Ltd
7  * Copyright 2003 Sam Liddicott
8  * Copyright 2007,2008,2009 Lemur Consulting Ltd
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License as
12  * published by the Free Software Foundation; either version 2 of the
13  * License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23  * USA
24  */
25 
26 #include <config.h>
27 
28 #include "multimatch.h"
29 
30 #include "autoptr.h"
31 #include "collapser.h"
32 #include "debuglog.h"
33 #include "submatch.h"
34 #include "localsubmatch.h"
35 #include "omassert.h"
36 #include "api/omenquireinternal.h"
37 #include "realtime.h"
38 
39 #include "api/emptypostlist.h"
40 #include "branchpostlist.h"
41 #include "mergepostlist.h"
42 
43 #include "backends/document.h"
44 
45 #include "msetcmp.h"
46 
47 #include "valuestreamdocument.h"
48 #include "weight/weightinternal.h"
49 
50 #include <xapian/error.h>
51 #include <xapian/matchspy.h>
52 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
53 
54 #ifdef XAPIAN_HAS_REMOTE_BACKEND
55 #include "remotesubmatch.h"
57 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
58 
59 #include <algorithm>
60 #include <cfloat> // For DBL_EPSILON.
61 #include <climits> // For UINT_MAX.
62 #include <vector>
63 #include <map>
64 #include <set>
65 
66 #ifdef HAVE_TIMER_CREATE
67 #include <signal.h>
68 #include <time.h>
69 #include "safeunistd.h" // For _POSIX_* feature test macros.
70 
71 extern "C" {
72 
73 static void
74 set_timeout_flag(union sigval sv)
75 {
76  *(reinterpret_cast<volatile bool*>(sv.sival_ptr)) = true;
77 }
78 
79 }
80 
81 // The monotonic clock is the better basis for timeouts, but not always
82 // available.
83 
84 #ifndef _POSIX_MONOTONIC_CLOCK
85 const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
86 #elif defined __sun
87 // Solaris defines CLOCK_MONOTONIC, but "man timer_create" doesn't mention it
88 // and when using it timer_create() fails with "EPERM" (perhaps you need to be
89 // root to use it? I can't test that).
90 //
91 // Solaris defines _POSIX_MONOTONIC_CLOCK so we need to special case.
92 const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
93 #elif defined __CYGWIN__
94 // https://cygwin.com/cygwin-api/std-notes.html currently (2023-04-18) says:
95 //
96 // clock_nanosleep currently supports only CLOCK_REALTIME and
97 // CLOCK_MONOTONIC. clock_setres, clock_settime, and timer_create
98 // currently support only CLOCK_REALTIME.
99 //
100 // So CLOCK_MONOTONIC is defined, but not supported by timer_create().
101 const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
102 #else
103 const clockid_t TIMEOUT_CLOCK = CLOCK_MONOTONIC;
104 #endif
105 
106 class TimeOut {
107  struct sigevent sev;
108  timer_t timerid;
109  volatile bool expired;
110 
111  public:
112  explicit TimeOut(double limit) : expired(false) {
113  if (limit > 0) {
114  sev.sigev_notify = SIGEV_THREAD;
115  sev.sigev_notify_function = set_timeout_flag;
116  sev.sigev_notify_attributes = NULL;
117  sev.sigev_value.sival_ptr =
118  static_cast<void*>(const_cast<bool*>(&expired));
119  if (usual(timer_create(TIMEOUT_CLOCK, &sev, &timerid) == 0)) {
120  struct itimerspec interval;
121  interval.it_interval.tv_sec = 0;
122  interval.it_interval.tv_nsec = 0;
123  RealTime::to_timespec(limit, &interval.it_value);
124  if (usual(timer_settime(timerid, 0, &interval, NULL) == 0)) {
125  // Timeout successfully set.
126  return;
127  }
128  timer_delete(timerid);
129  }
130  }
131  sev.sigev_notify = SIGEV_NONE;
132  }
133 
134  ~TimeOut() {
135  if (sev.sigev_notify != SIGEV_NONE) {
136  timer_delete(timerid);
137  sev.sigev_notify = SIGEV_NONE;
138  }
139  }
140 
141  bool timed_out() const { return expired; }
142 };
143 #else
144 class TimeOut {
145  public:
146  explicit TimeOut(double) { }
147  bool timed_out() const { return false; }
148 };
149 #endif
150 
151 using namespace std;
153 
160 #if 0 // VAL_REL isn't currently used so avoid compiler warnings.
163 #endif
164 
172 static void
174  Xapian::doccount number_of_subdbs,
175  vector<Xapian::RSet> & subrsets)
176 {
177  LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
178  if (rset && !rset->empty()) {
179  if (number_of_subdbs == 1) {
180  // The common case of a single database is easy to handle.
181  subrsets.push_back(*rset);
182  } else {
183  // Can't just use vector::resize() here, since that creates N
184  // copies of the same RSet!
185  subrsets.reserve(number_of_subdbs);
186  for (size_t i = 0; i < number_of_subdbs; ++i) {
187  subrsets.push_back(Xapian::RSet());
188  }
189 
190  const set<Xapian::docid> & rsetitems = rset->internal->get_items();
191  set<Xapian::docid>::const_iterator j;
192  for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
193  Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
194  Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
195  subrsets[subdatabase].add_document(local_docid);
196  }
197  }
198  } else {
199  // NB vector::resize() creates N copies of the same empty RSet.
200  subrsets.resize(number_of_subdbs);
201  }
202  Assert(subrsets.size() == number_of_subdbs);
203 }
204 
223 static void
225  Xapian::Weight::Internal & stats)
226 {
227  LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | stats);
228  // We use a vector<bool> to track which SubMatches we're already prepared.
229  vector<bool> prepared;
230  prepared.resize(leaves.size(), false);
231  size_t unprepared = leaves.size();
232  bool nowait = true;
233  while (unprepared) {
234  for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
235  if (prepared[leaf]) continue;
236  SubMatch * submatch = leaves[leaf].get();
237  if (!submatch || submatch->prepare_match(nowait, stats)) {
238  prepared[leaf] = true;
239  --unprepared;
240  }
241  }
242  // Use blocking IO on subsequent passes, so that we don't go into
243  // a tight loop.
244  nowait = false;
245  }
246 }
247 
250  private:
252  const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies;
253 
254  public:
255  explicit
257  : spies(spies_) {}
258 
263  void operator()(const Xapian::Document &doc, double wt);
264 };
265 
266 void
268  LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
269  for (auto i : spies) {
270  (*i)(doc, wt);
271  }
272 }
273 
275 // Initialisation and cleaning up //
278  const Xapian::Query & query_,
279  Xapian::termcount qlen,
280  const Xapian::RSet * omrset,
281  Xapian::doccount collapse_max_,
282  Xapian::valueno collapse_key_,
283  int percent_cutoff_, double weight_cutoff_,
285  Xapian::valueno sort_key_,
287  bool sort_value_forward_,
288  double time_limit_,
289  Xapian::Weight::Internal & stats,
290  const Xapian::Weight * weight_,
291  const vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & matchspies_,
292  bool have_sorter, bool have_mdecider)
293  : db(db_), query(query_),
294  collapse_max(collapse_max_), collapse_key(collapse_key_),
295  percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
296  order(order_),
297  sort_key(sort_key_), sort_by(sort_by_),
298  sort_value_forward(sort_value_forward_),
299  time_limit(time_limit_),
300  weight(weight_),
301  is_remote(db.internal.size()),
302  matchspies(matchspies_)
303 {
304  LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | time_limit_| stats | weight_ | matchspies_ | have_sorter | have_mdecider);
305 
306  if (query.empty()) return;
307 
308  Xapian::doccount number_of_subdbs = db.internal.size();
309  vector<Xapian::RSet> subrsets;
310  split_rset_by_db(omrset, number_of_subdbs, subrsets);
311 
312  for (size_t i = 0; i != number_of_subdbs; ++i) {
313  Xapian::Database::Internal *subdb = db.internal[i].get();
314  Assert(subdb);
316 
317  // There is currently only one special case, for network databases.
318 #ifdef XAPIAN_HAS_REMOTE_BACKEND
319  if (subdb->get_backend_info(NULL) == BACKEND_REMOTE) {
320  RemoteDatabase *rem_db = static_cast<RemoteDatabase*>(subdb);
321  if (have_sorter) {
322  throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
323  }
324  if (have_mdecider) {
325  throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
326  }
327  // FIXME: Remote handling for time_limit with multiple
328  // databases may need some work.
329  rem_db->set_query(query, qlen, collapse_max, collapse_key,
331  time_limit,
333  subrsets[i], matchspies);
334  bool decreasing_relevance =
335  (sort_by == REL || sort_by == REL_VAL);
336  smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
337  is_remote[i] = true;
338  } else {
339  smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight,
340  i);
341  subdb->readahead_for_query(query);
342  }
343 #else
344  // Avoid unused parameter warnings.
345  (void)have_sorter;
346  (void)have_mdecider;
347  smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight, i);
348 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
349  leaves.push_back(smatch);
350  }
351 
352  stats.set_query(query);
353  prepare_sub_matches(leaves, stats);
354  stats.set_bounds_from_db(db);
355 }
356 
357 double
359 {
360  LOGCALL(MATCH, double, "MultiMatch::getorrecalc_maxweight", pl);
361  double wt;
362  if (recalculate_w_max) {
363  LOGLINE(MATCH, "recalculating max weight");
364  wt = pl->recalc_maxweight();
365  recalculate_w_max = false;
366  } else {
367  wt = pl->get_maxweight();
368  LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
369  // FIXME: This fails for hoistnotbug1 under multi_remoteprog_glass:
370  // AssertionError: matcher/multimatch.cc:370: within_DBL_EPSILON(wt, pl->recalc_maxweight()) : values were 2.7075940282079162813 and 2.6966211268553141878
371  //
372  // AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
373  //
374  // Not sure why - adding code to call recalc_maxweight() doesn't seem
375  // to help. But the max weight not being as low as it could be is
376  // only a potential missed optimisation opportunity, not a correctness
377  // issue.
378  }
379  LOGLINE(MATCH, "max possible doc weight = " << wt);
380  RETURN(wt);
381 }
382 
383 void
385  Xapian::doccount check_at_least,
386  Xapian::MSet & mset,
387  Xapian::Weight::Internal & stats,
388  const Xapian::MatchDecider *mdecider,
389  const Xapian::KeyMaker *sorter)
390 {
391  LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("sorter"));
392  AssertRel(check_at_least,>=,first + maxitems);
393 
394  if (query.empty()) {
395  mset = Xapian::MSet();
396  mset.internal->firstitem = first;
397  return;
398  }
399 
400  Assert(!leaves.empty());
401 
403 
404 #ifdef XAPIAN_HAS_REMOTE_BACKEND
405  // If there's only one database and it's remote, we can just unserialise
406  // its MSet and return that.
407  if (leaves.size() == 1 && is_remote[0]) {
408  RemoteSubMatch * rem_match;
409  rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
410  rem_match->start_match(first, maxitems, check_at_least, stats);
411  rem_match->get_mset(mset);
412  return;
413  }
414 #endif
415 
416  // Start matchers.
417  for (size_t i = 0; i != leaves.size(); ++i) {
418  auto& leaf = leaves[i];
419 #ifdef XAPIAN_HAS_REMOTE_BACKEND
420  if (is_remote[i]) {
421  Xapian::doccount remote_maxitems = first + maxitems;
422  if (collapse_max != 0) {
423  // If collapsing we need to fetch all check_at_least items in
424  // order to satisfy the requirement that if there are <=
425  // check_at_least results then then estimated number of matches
426  // is exact.
427  remote_maxitems = check_at_least;
428  }
429  leaf->start_match(0, remote_maxitems, check_at_least, stats);
430  continue;
431  }
432 #endif
433  leaf->start_match(0, first + maxitems, check_at_least, stats);
434  }
435 
436  // Get postlists and term info
437  vector<PostList *> postlists;
438  Xapian::termcount total_subqs = 0;
439  // Keep a count of matches which we know exist, but we won't see. This
440  // occurs when a submatch is remote, and returns a lower bound on the
441  // number of matching documents which is higher than the number of
442  // documents it returns (because it wasn't asked for more documents).
443  Xapian::doccount definite_matches_not_seen = 0;
444 #ifdef XAPIAN_HAS_REMOTE_BACKEND
445  // Track these for calculating uncollapsed_upper_bound for the local.
446  size_t n_remotes = 0;
447  Xapian::doccount remote_uncollapsed_upper_bound = 0;
448 #endif
449  try {
450  for (size_t i = 0; i != leaves.size(); ++i) {
451  // Pick the highest total subqueries answer amongst the
452  // subdatabases, as the query to postlist conversion doesn't
453  // recurse into positional queries for shards that don't have
454  // positional data when at least one other shard does.
455  Xapian::termcount total_subqs_i = 0;
456  PostList* pl = leaves[i]->get_postlist(this, &total_subqs_i, stats);
457  total_subqs = max(total_subqs, total_subqs_i);
458 #ifdef XAPIAN_HAS_REMOTE_BACKEND
459  if (is_remote[i]) {
460  ++n_remotes;
461  RemoteSubMatch* rem_match =
462  static_cast<RemoteSubMatch*>(leaves[i].get());
463  remote_uncollapsed_upper_bound +=
464  rem_match->get_uncollapsed_upper_bound();
465  if (collapse_max == 0 &&
466  pl->get_termfreq_min() > first + maxitems) {
467  LOGLINE(MATCH, "Found " <<
468  pl->get_termfreq_min() - (first + maxitems)
469  << " definite matches in remote submatch "
470  "which aren't passed to local match");
471  definite_matches_not_seen += pl->get_termfreq_min();
472  definite_matches_not_seen -= first + maxitems;
473  }
474  }
475 #endif
476  postlists.push_back(pl);
477  }
478  } catch (...) {
479  for (auto pl : postlists) delete pl;
480  throw;
481  }
482  Assert(!postlists.empty());
483 
484  ValueStreamDocument vsdoc(db);
485  ++vsdoc._refs;
486  Xapian::Document doc(&vsdoc);
487 
488  // Get a single combined postlist
489  AutoPtr<PostList> pl;
490  if (postlists.size() == 1) {
491  pl.reset(postlists.front());
492  } else {
493  pl.reset(new MergePostList(postlists, this, vsdoc));
494  }
495 
496  LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
497 
498  // Empty result set
499  Xapian::doccount docs_matched = 0;
500  double greatest_wt = 0;
501  Xapian::termcount greatest_wt_subqs_matched = 0;
502 #ifdef XAPIAN_HAS_REMOTE_BACKEND
503  unsigned greatest_wt_subqs_db_num = UINT_MAX;
504 #endif
505  vector<Xapian::Internal::MSetItem> items;
506 
507  // maximum weight a document could possibly have
508  const double max_possible = pl->recalc_maxweight();
509 
510  LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
511  recalculate_w_max = false;
512 
513  Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
514  Xapian::doccount matches_lower_bound = 0;
515  Xapian::doccount matches_estimated = pl->get_termfreq_est();
516 
517  if (mdecider == NULL) {
518  // If we have a match decider, the lower bound must be
519  // set to 0 as we could discard all hits. Otherwise set it to the
520  // minimum number of entries which the postlist could return.
521  matches_lower_bound = pl->get_termfreq_min();
522  }
523 
524  // Prepare the matchspy
525  Xapian::MatchSpy *matchspy = NULL;
526  MultipleMatchSpy multispy(matchspies);
527  if (!matchspies.empty()) {
528  if (matchspies.size() == 1) {
529  matchspy = matchspies[0].get();
530  } else {
531  matchspy = &multispy;
532  }
533  }
534 
535  // Check if any results have been asked for (might just be wanting
536  // maxweight).
537  if (check_at_least == 0) {
538  pl.reset(NULL);
539  Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
540  if (collapse_max) {
541  // Lower bound must be set to no more than collapse_max, since it's
542  // possible that all matching documents have the same collapse_key
543  // value and so are collapsed together.
544  if (matches_lower_bound > collapse_max)
545  matches_lower_bound = collapse_max;
546  }
547 
548  mset.internal = new Xapian::MSet::Internal(
549  first,
550  matches_upper_bound,
551  matches_lower_bound,
552  matches_estimated,
553  matches_upper_bound,
554  uncollapsed_lower_bound,
555  matches_estimated,
556  max_possible, greatest_wt, items,
557  0);
558  return;
559  }
560 
561  // Number of documents considered by a decider.
562  Xapian::doccount decider_considered = 0;
563  // Number of documents denied by the decider.
564  Xapian::doccount decider_denied = 0;
565 
566  // Set max number of results that we want - this is used to decide
567  // when to throw away unwanted items.
568  Xapian::doccount max_msize = first + maxitems;
569  items.reserve(max_msize + 1);
570 
571  // Tracks the minimum item currently eligible for the MSet - we compare
572  // candidate items against this.
573  Xapian::Internal::MSetItem min_item(0.0, 0);
574 
575  // Minimum weight an item must have to be worth considering.
576  double min_weight = weight_cutoff;
577 
578  // Factor to multiply maximum weight seen by to get the cutoff weight.
579  double percent_cutoff_factor = percent_cutoff / 100.0;
580  // Corresponding correction to that in omenquire.cc to account for excess
581  // precision on x86.
582  percent_cutoff_factor -= DBL_EPSILON;
583 
584  // Object to handle collapsing.
585  Collapser collapser(collapse_key, collapse_max);
586 
588  bool sort_forward = (order != Xapian::Enquire::DESCENDING);
590 
591  // Perform query
592 
593  // We form the mset in two stages. In the first we fill up our working
594  // mset. Adding a new document does not remove another.
595  //
596  // In the second, we consider documents which rank higher than the current
597  // lowest ranking document in the mset. Each document added expels the
598  // current lowest ranking document.
599  //
600  // If a percentage cutoff is in effect, it can cause the matcher to return
601  // from the second stage from the first.
602 
603  // Is the mset a valid heap?
604  bool is_heap = false;
605 
606  while (true) {
607  bool pushback;
608 
609  if (rare(recalculate_w_max)) {
610  if (min_weight > 0.0) {
611  if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
612  LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
613  break;
614  }
615  }
616  }
617 
618  PostList * pl_copy = pl.get();
619  if (rare(next_handling_prune(pl_copy, min_weight, this))) {
620  (void)pl.release();
621  pl.reset(pl_copy);
622  LOGLINE(MATCH, "*** REPLACING ROOT");
623 
624  if (min_weight > 0.0) {
625  // No need for a full recalc (unless we've got to do one
626  // because of a prune elsewhere) - we're just switching to a
627  // subtree.
628  if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
629  LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
630  break;
631  }
632  }
633  }
634 
635  if (rare(pl->at_end())) {
636  LOGLINE(MATCH, "Reached end of potential matches");
637  break;
638  }
639 
640  // Only calculate the weight if we need it for mcmp, or there's a
641  // percentage or weight cutoff in effect. Otherwise we calculate it
642  // below if we haven't already rejected this candidate.
643  double wt = 0.0;
644  bool calculated_weight = false;
645  if (sort_by != VAL || min_weight > 0.0) {
646  wt = pl->get_weight();
647  if (wt < min_weight) {
648  LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
649  continue;
650  }
651  calculated_weight = true;
652  }
653 
654  Xapian::docid did = pl->get_docid();
655  vsdoc.set_document(did);
656  LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
657  Xapian::Internal::MSetItem new_item(wt, did);
658  if (check_at_least > first + maxitems && timeout.timed_out()) {
659  check_at_least = first + maxitems;
660  }
661 
662  if (sort_by != REL) {
663  const string * ptr = pl->get_sort_key();
664  if (ptr) {
665  new_item.sort_key = *ptr;
666  } else if (sorter) {
667  new_item.sort_key = (*sorter)(doc);
668  } else {
669  new_item.sort_key = vsdoc.get_value(sort_key);
670  }
671 
672  // We're sorting by value (in part at least), so compare the item
673  // against the lowest currently in the proto-mset. If sort_by is
674  // VAL, then new_item.wt won't yet be set, but that doesn't
675  // matter since it's not used by the sort function.
676  if (!mcmp(new_item, min_item)) {
677  if (mdecider == NULL && !collapser) {
678  // Document was definitely suitable for mset - no more
679  // processing needed.
680  LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
681  ++docs_matched;
682  if (!calculated_weight) wt = pl->get_weight();
683  if (matchspy) {
684  matchspy->operator()(doc, wt);
685  }
686  if (wt > greatest_wt) goto new_greatest_weight;
687  continue;
688  }
689  if (docs_matched >= check_at_least) {
690  // We've seen enough items - we can drop this one.
691  LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
692  // FIXME: hmm, match decider might have rejected this...
693  if (!calculated_weight) wt = pl->get_weight();
694  if (wt > greatest_wt) goto new_greatest_weight;
695  continue;
696  }
697  // We can't drop the item, because we need to test whether the
698  // mdecider would accept it and/or test whether it would be
699  // collapsed.
700  LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
701  }
702  }
703 
704  // Use the match spy and/or decision functors (if specified).
705  if (matchspy != NULL || mdecider != NULL) {
706  const unsigned int multiplier = db.internal.size();
707  Assert(multiplier != 0);
708  Xapian::doccount n = (did - 1) % multiplier; // which actual database
709  // If the results are from a remote database, then the functor will
710  // already have been applied there so we can skip this step.
711  if (!is_remote[n]) {
712  ++decider_considered;
713  if (mdecider && !mdecider->operator()(doc)) {
714  ++decider_denied;
715  continue;
716  }
717  if (matchspy) {
718  if (!calculated_weight) {
719  wt = pl->get_weight();
720  new_item.wt = wt;
721  calculated_weight = true;
722  }
723  matchspy->operator()(doc, wt);
724  }
725  }
726  }
727 
728  if (!calculated_weight) {
729  // we didn't calculate the weight above, but now we will need it
730  wt = pl->get_weight();
731  new_item.wt = wt;
732  }
733 
734  pushback = true;
735 
736  // Perform collapsing on key if requested.
737  if (collapser) {
738  collapse_result res;
739  res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
740  if (res == REJECTED) {
741  // If we're sorting by relevance primarily, then we throw away
742  // the lower weighted document anyway.
743  if (sort_by != REL && sort_by != REL_VAL) {
744  if (wt > greatest_wt) goto new_greatest_weight;
745  }
746  continue;
747  }
748 
749  if (res == REPLACED) {
750  // There was a previous item in the collapse tab so
751  // the MSet can't be empty.
752  Assert(!items.empty());
753 
754  const Xapian::Internal::MSetItem & old_item =
755  collapser.old_item;
756  // This is one of the best collapse_max potential MSet entries
757  // with this key which we've seen so far. Check if the
758  // entry with this key which it displaced might still be in the
759  // proto-MSet. If it might be, we need to check through for
760  // it.
761  double old_wt = old_item.wt;
762  if (old_wt >= min_weight && mcmp(old_item, min_item)) {
763  // Scan through (unsorted) MSet looking for entry.
764  // FIXME: more efficient way than just scanning?
765  Xapian::docid olddid = old_item.did;
766  vector<Xapian::Internal::MSetItem>::iterator i;
767  for (i = items.begin(); i != items.end(); ++i) {
768  if (i->did == olddid) {
769  LOGLINE(MATCH, "collapse: removing " <<
770  olddid << ": " <<
771  new_item.collapse_key);
772  // We can replace an arbitrary element in O(log N)
773  // but have to do it by hand (in this case the new
774  // elt is bigger, so we just swap down the tree).
775  // FIXME: implement this, and clean up is_heap
776  // handling
777  *i = new_item;
778  pushback = false;
779  is_heap = false;
780  break;
781  }
782  }
783  }
784  }
785  }
786 
787  // OK, actually add the item to the mset.
788  if (pushback) {
789  ++docs_matched;
790  if (items.size() >= max_msize) {
791  items.push_back(new_item);
792  if (!is_heap) {
793  is_heap = true;
794  make_heap(items.begin(), items.end(), mcmp);
795  } else {
796  push_heap<vector<Xapian::Internal::MSetItem>::iterator,
797  MSetCmp>(items.begin(), items.end(), mcmp);
798  }
799  pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
800  MSetCmp>(items.begin(), items.end(), mcmp);
801  items.pop_back();
802 
803  if (!items.empty()) {
804  // In the get_mset(0, 0, X) (X > 0) case just don't update min_item.
805  min_item = items.front();
806  }
807  if (sort_by == REL || sort_by == REL_VAL) {
808  if (docs_matched >= check_at_least) {
809  if (sort_by == REL) {
810  // We're done if this is a forward boolean match
811  // with only one database (bodgetastic, FIXME
812  // better if we can!)
813  if (rare(max_possible == 0 && sort_forward)) {
814  // In the multi database case, MergePostList
815  // currently processes each database
816  // sequentially (which actually may well be
817  // more efficient) so the docids in general
818  // won't arrive in order.
819  if (leaves.size() == 1) break;
820  }
821  }
822  if (min_item.wt > min_weight) {
823  LOGLINE(MATCH, "Setting min_weight to " <<
824  min_item.wt << " from " << min_weight);
825  min_weight = min_item.wt;
826  }
827  }
828  }
829  if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
830  LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
831  break;
832  }
833  } else {
834  items.push_back(new_item);
835  is_heap = false;
836  if (sort_by == REL && items.size() == max_msize) {
837  if (docs_matched >= check_at_least) {
838  // We're done if this is a forward boolean match
839  // with only one database (bodgetastic, FIXME
840  // better if we can!)
841  if (rare(max_possible == 0 && sort_forward)) {
842  // In the multi database case, MergePostList
843  // currently processes each database
844  // sequentially (which actually may well be
845  // more efficient) so the docids in general
846  // won't arrive in order.
847  if (leaves.size() == 1) break;
848  }
849  }
850  }
851  }
852  }
853 
854  // Keep a track of the greatest weight we've seen.
855  if (wt > greatest_wt) {
856 new_greatest_weight:
857  greatest_wt = wt;
858 #ifdef XAPIAN_HAS_REMOTE_BACKEND
859  const unsigned int multiplier = db.internal.size();
860  unsigned int db_num = (did - 1) % multiplier;
861  if (is_remote[db_num]) {
862  // Note that the greatest weighted document came from a remote
863  // database, and which one.
864  greatest_wt_subqs_db_num = db_num;
865  } else
866 #endif
867  {
868  greatest_wt_subqs_matched = pl->count_matching_subqs();
869 #ifdef XAPIAN_HAS_REMOTE_BACKEND
870  greatest_wt_subqs_db_num = UINT_MAX;
871 #endif
872  }
873  if (percent_cutoff) {
874  double w = wt * percent_cutoff_factor;
875  if (w > min_weight) {
876  min_weight = w;
877  if (!is_heap) {
878  is_heap = true;
879  make_heap<vector<Xapian::Internal::MSetItem>::iterator,
880  MSetCmp>(items.begin(), items.end(), mcmp);
881  }
882  while (!items.empty() && items.front().wt < min_weight) {
883  pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
884  MSetCmp>(items.begin(), items.end(), mcmp);
885  Assert(items.back().wt < min_weight);
886  items.pop_back();
887  }
888 #ifdef XAPIAN_ASSERTIONS_PARANOID
889  vector<Xapian::Internal::MSetItem>::const_iterator i;
890  for (i = items.begin(); i != items.end(); ++i) {
891  Assert(i->wt >= min_weight);
892  }
893 #endif
894  }
895  }
896  }
897  }
898 
899  // done with posting list tree
900  pl.reset(NULL);
901 
902  double percent_scale = 0;
903  if (!items.empty() && greatest_wt > 0) {
904 #ifdef XAPIAN_HAS_REMOTE_BACKEND
905  if (greatest_wt_subqs_db_num != UINT_MAX) {
906  const unsigned int n = greatest_wt_subqs_db_num;
907  RemoteSubMatch * rem_match;
908  rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
909  percent_scale = rem_match->get_percent_factor() / 100.0;
910  } else
911 #endif
912  {
913  percent_scale = greatest_wt_subqs_matched / double(total_subqs);
914  percent_scale /= greatest_wt;
915  }
916  Assert(percent_scale > 0);
917  if (percent_cutoff) {
918  // FIXME: better to sort and binary chop maybe?
919  // Or we could just use a linear scan here instead.
920 
921  // trim the mset to the correct answer...
922  double min_wt = percent_cutoff_factor / percent_scale;
923  if (!is_heap) {
924  is_heap = true;
925  make_heap<vector<Xapian::Internal::MSetItem>::iterator,
926  MSetCmp>(items.begin(), items.end(), mcmp);
927  }
928  while (!items.empty() && items.front().wt < min_wt) {
929  pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
930  MSetCmp>(items.begin(), items.end(), mcmp);
931  Assert(items.back().wt < min_wt);
932  items.pop_back();
933  }
934 #ifdef XAPIAN_ASSERTIONS_PARANOID
935  vector<Xapian::Internal::MSetItem>::const_iterator j;
936  for (j = items.begin(); j != items.end(); ++j) {
937  Assert(j->wt >= min_wt);
938  }
939 #endif
940  }
941  }
942 
943  LOGLINE(MATCH,
944  "docs_matched = " << docs_matched <<
945  ", definite_matches_not_seen = " << definite_matches_not_seen <<
946  ", matches_lower_bound = " << matches_lower_bound <<
947  ", matches_estimated = " << matches_estimated <<
948  ", matches_upper_bound = " << matches_upper_bound);
949 
950  // Adjust docs_matched to take account of documents which matched remotely
951  // but weren't sent across.
952  docs_matched += definite_matches_not_seen;
953 
954  Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
955  Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
956  Xapian::doccount uncollapsed_estimated = matches_estimated;
957 #ifdef XAPIAN_HAS_REMOTE_BACKEND
958  if (collapser && n_remotes) {
959  // We need to adjust uncollapsed_upper_bound if there are multiple
960  // shards and some or all are remote. The lower bound and estimate
961  // will be valid, though potentially could be better, but this
962  // doesn't seem worth addressing in 1.4.x - the code on master
963  // handles this correctly via merging MSet objects.
964  if (n_remotes == leaves.size()) {
965  // All shards are remote so we just use the sum of
966  // uncollapsed_upper_bound over the remotes.
967  uncollapsed_upper_bound = remote_uncollapsed_upper_bound;
968  } else {
969  // Sum and clamp to the number of documents. This is crude but
970  // at least gives a valid answer.
971  Xapian::doccount num_docs = db.get_doccount();
972  uncollapsed_upper_bound += remote_uncollapsed_upper_bound;
973  if (uncollapsed_upper_bound < remote_uncollapsed_upper_bound ||
974  uncollapsed_upper_bound > num_docs) {
975  uncollapsed_upper_bound = num_docs;
976  }
977  }
978  }
979 #endif
980  if (items.size() < max_msize) {
981  // We have fewer items in the mset than we tried to get for it, so we
982  // must have all the matches in it.
983  LOGLINE(MATCH, "items.size() = " << items.size() <<
984  ", max_msize = " << max_msize << ", setting bounds equal");
985  Assert(definite_matches_not_seen == 0);
986  Assert(percent_cutoff || docs_matched == items.size());
987  matches_lower_bound = matches_upper_bound = matches_estimated
988  = items.size();
989  if (collapser && matches_lower_bound > uncollapsed_lower_bound)
990  uncollapsed_lower_bound = matches_lower_bound;
991  } else if (!collapser && docs_matched < check_at_least) {
992  // We have seen fewer matches than we checked for, so we must have seen
993  // all the matches.
994  LOGLINE(MATCH, "Setting bounds equal");
995  matches_lower_bound = matches_upper_bound = matches_estimated
996  = docs_matched;
997  if (collapser && matches_lower_bound > uncollapsed_lower_bound)
998  uncollapsed_lower_bound = matches_lower_bound;
999  } else {
1000  AssertRel(matches_estimated,>=,matches_lower_bound);
1001  AssertRel(matches_estimated,<=,matches_upper_bound);
1002 
1003  // We can end up scaling the estimate more than once, so collect
1004  // the scale factors and apply them in one go to avoid rounding
1005  // more than once.
1006  double estimate_scale = 1.0;
1007  double unique_rate = 1.0;
1008 
1009  if (collapser) {
1010  LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
1011  matches_lower_bound = collapser.get_matches_lower_bound();
1012  if (matches_lower_bound > uncollapsed_lower_bound)
1013  uncollapsed_lower_bound = matches_lower_bound;
1014 
1015  // The estimate for the number of hits can be modified by
1016  // multiplying it by the rate at which we've been finding
1017  // unique documents.
1018  Xapian::doccount docs_considered = collapser.get_docs_considered();
1019  Xapian::doccount dups_ignored = collapser.get_dups_ignored();
1020  if (docs_considered > 0) {
1021  double unique = double(docs_considered - dups_ignored);
1022  unique_rate = unique / double(docs_considered);
1023  }
1024 
1025  // We can safely reduce the upper bound by the number of duplicates
1026  // we've ignored.
1027  matches_upper_bound -= dups_ignored;
1028 
1029  LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
1030  ", matches_estimated=" << matches_estimated <<
1031  "*" << estimate_scale << "*" << unique_rate <<
1032  ", matches_upper_bound=" << matches_upper_bound);
1033  }
1034 
1035  if (mdecider) {
1036  if (!percent_cutoff) {
1037  if (!collapser) {
1038  // We're not collapsing or doing a percentage cutoff, so
1039  // docs_matched is a lower bound on the total number of
1040  // matches.
1041  matches_lower_bound = max(docs_matched, matches_lower_bound);
1042  }
1043  }
1044 
1045  // The estimate for the number of hits can be modified by
1046  // multiplying it by the rate at which the match decider has
1047  // been accepting documents.
1048  if (decider_considered > 0) {
1049  double accept = double(decider_considered - decider_denied);
1050  double accept_rate = accept / double(decider_considered);
1051  estimate_scale *= accept_rate;
1052  }
1053 
1054  // If a document is denied by a match decider, it is not possible
1055  // for it to found to be a duplicate, so it is safe to also reduce
1056  // the upper bound by the number of documents denied by a match
1057  // decider.
1058  matches_upper_bound -= decider_denied;
1059  if (collapser) uncollapsed_upper_bound -= decider_denied;
1060  }
1061 
1062  if (percent_cutoff) {
1063  estimate_scale *= (1.0 - percent_cutoff_factor);
1064  // another approach:
1065  // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
1066  // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
1067 
1068  // Very likely an underestimate, but we can't really do better
1069  // without checking further matches... Only possibility would be
1070  // to track how many docs made the min weight test but didn't make
1071  // the candidate set since the last greatest_wt change, which we
1072  // could use if the top documents matched all the prob terms.
1073  matches_lower_bound = items.size();
1074  if (collapser) uncollapsed_lower_bound = matches_lower_bound;
1075 
1076  // matches_upper_bound could be reduced by the number of documents
1077  // which fail the min weight test (FIXME)
1078 
1079  LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
1080  percent_cutoff << "): now have matches_estimated=" <<
1081  matches_estimated << ", matches_lower_bound=" <<
1082  matches_lower_bound);
1083  }
1084 
1085  if (collapser && estimate_scale != 1.0) {
1086  uncollapsed_estimated =
1087  Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
1088  }
1089 
1090  estimate_scale *= unique_rate;
1091 
1092  if (estimate_scale != 1.0) {
1093  matches_estimated =
1094  Xapian::doccount(matches_estimated * estimate_scale + 0.5);
1095  if (matches_estimated < matches_lower_bound)
1096  matches_estimated = matches_lower_bound;
1097  }
1098 
1099  if (collapser || mdecider) {
1100  LOGLINE(MATCH, "Clamping estimate between bounds: "
1101  "matches_lower_bound = " << matches_lower_bound <<
1102  ", matches_estimated = " << matches_estimated <<
1103  ", matches_upper_bound = " << matches_upper_bound);
1104  AssertRel(matches_lower_bound,<=,matches_upper_bound);
1105  matches_estimated = max(matches_estimated, matches_lower_bound);
1106  matches_estimated = min(matches_estimated, matches_upper_bound);
1107  } else if (!percent_cutoff) {
1108  AssertRel(docs_matched,<=,matches_upper_bound);
1109  if (docs_matched > matches_lower_bound)
1110  matches_lower_bound = docs_matched;
1111  if (docs_matched > matches_estimated)
1112  matches_estimated = docs_matched;
1113  }
1114 
1115  if (collapser && !mdecider && !percent_cutoff) {
1116  AssertRel(docs_matched,<=,uncollapsed_upper_bound);
1117  if (docs_matched > uncollapsed_lower_bound)
1118  uncollapsed_lower_bound = docs_matched;
1119  }
1120  }
1121 
1122  if (collapser) {
1123  AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
1124  if (uncollapsed_estimated < uncollapsed_lower_bound) {
1125  uncollapsed_estimated = uncollapsed_lower_bound;
1126  } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
1127  uncollapsed_estimated = uncollapsed_upper_bound;
1128  }
1129  } else {
1130  // We're not collapsing, so the bounds must be the same.
1131  uncollapsed_lower_bound = matches_lower_bound;
1132  uncollapsed_upper_bound = matches_upper_bound;
1133  uncollapsed_estimated = matches_estimated;
1134  }
1135 
1136  LOGLINE(MATCH, items.size() << " items in potential mset");
1137 
1138  if (first > 0) {
1139  // Remove unwanted leading entries
1140  if (items.size() <= first) {
1141  items.clear();
1142  } else {
1143  LOGLINE(MATCH, "finding " << first << "th");
1144  // We perform nth_element() on reverse iterators so that the
1145  // unwanted elements end up at the end of items, which means
1146  // that the call to erase() to remove them doesn't have to copy
1147  // any elements.
1148  vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
1149  nth = items.rbegin() + first;
1150  nth_element(items.rbegin(), nth, items.rend(), mcmp);
1151  // Erase the trailing "first" elements
1152  items.erase(items.begin() + items.size() - first, items.end());
1153  }
1154  }
1155 
1156  LOGLINE(MATCH, "sorting " << items.size() << " entries");
1157 
1158  // We could use std::sort_heap if is_heap is true, but profiling
1159  // suggests that's actually slower. Cast to void to suppress
1160  // compiler warnings that the last set value of is_heap is never
1161  // used.
1162  (void)is_heap;
1163 
1164  // Need a stable sort, but this is provided by comparison operator
1165  sort(items.begin(), items.end(), mcmp);
1166 
1167  if (!items.empty()) {
1168  LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
1169  LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
1170  }
1171 
1172  AssertRel(matches_estimated,>=,matches_lower_bound);
1173  AssertRel(matches_estimated,<=,matches_upper_bound);
1174 
1175  AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
1176  AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
1177 
1178  // We may need to qualify any collapse_count to see if the highest weight
1179  // collapsed item would have qualified percent_cutoff
1180  // We WILL need to restore collapse_count to the mset by taking from
1181  // collapse_tab; this is what comes of copying around whole objects
1182  // instead of taking references, we find it hard to update collapse_count
1183  // of an item that has already been pushed-back as we don't know where it
1184  // is any more. If we keep or find references we won't need to mess with
1185  // is_heap so much maybe?
1186  if (!items.empty() && collapser && !collapser.empty()) {
1187  // Nicked this formula from above.
1188  double min_wt = 0.0;
1189  if (percent_scale > 0.0)
1190  min_wt = percent_cutoff_factor / percent_scale;
1191  Xapian::doccount entries = collapser.entries();
1192  vector<Xapian::Internal::MSetItem>::iterator i;
1193  for (i = items.begin(); i != items.end(); ++i) {
1194  // Skip entries without a collapse key.
1195  if (i->collapse_key.empty()) continue;
1196 
1197  // Set collapse_count appropriately.
1198  i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
1199  if (--entries == 0) {
1200  // Stop once we've processed all items with collapse keys.
1201  break;
1202  }
1203  }
1204  }
1205 
1206  mset.internal = new Xapian::MSet::Internal(
1207  first,
1208  matches_upper_bound,
1209  matches_lower_bound,
1210  matches_estimated,
1211  uncollapsed_upper_bound,
1212  uncollapsed_lower_bound,
1213  uncollapsed_estimated,
1214  max_possible, greatest_wt, items,
1215  percent_scale * 100.0);
1216 }
Xapian::Query query
Definition: multimatch.h:42
#define RETURN(A)
Definition: debuglog.h:493
TimeOut(double)
Definition: multimatch.cc:146
#define Assert(COND)
Definition: omassert.h:122
void get_mset(Xapian::MSet &mset)
Short-cut for single remote match.
MultiMatch(const MultiMatch &)
Copying is not permitted.
collapse_result process(Xapian::Internal::MSetItem &item, PostList *postlist, Xapian::Document::Internal &vsdoc, MSetCmp mcmp)
Handle a new MSetItem.
Definition: collapser.cc:70
MultipleMatchSpy(const std::vector< Xapian::Internal::opt_intrusive_ptr< Xapian::MatchSpy >> &spies_)
Definition: multimatch.cc:256
Abstract base class for postlists.
Definition: postlist.h:37
double time_limit
Definition: multimatch.h:60
Class which applies several match spies in turn.
Definition: multimatch.cc:249
This class is used to access a database, or a group of databases.
Definition: database.h:68
#define LOGCALL_STATIC_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:492
unsigned timeout
A timeout value in milliseconds.
Definition: types.h:100
class for performing a match
Xapian::Enquire::Internal::sort_setting sort_by
Definition: multimatch.h:56
#define AssertRel(A, REL, B)
Definition: omassert.h:123
A PostList which contains no entries.
Xapian::valueno collapse_key
Definition: multimatch.h:46
Abstract base class for match spies.
Definition: matchspy.h:49
double weight
The weight of a document or term.
Definition: types.h:122
#define usual(COND)
Definition: config.h:576
A document which gets its values from a ValueStreamManager.
Base class for databases.
Definition: database.h:57
const vector< Xapian::Internal::opt_intrusive_ptr< Xapian::MatchSpy > > & matchspies
The matchspies to use.
Definition: multimatch.h:74
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: mset.h:52
void set_query(const Xapian::Query &query_)
Xapian::valueno sort_key
Definition: multimatch.h:54
Class representing a list of search results.
Definition: mset.h:44
virtual double recalc_maxweight()=0
Recalculate the upper bound on what get_weight() can return.
Collapse documents with the same collapse key during the match.
RemoteDatabase is the baseclass for remote database implementations.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:488
STL namespace.
const Xapian::Weight * weight
Weighting scheme.
Definition: multimatch.h:63
vector< bool > is_remote
Is each sub-database remote?
Definition: multimatch.h:71
std::vector< Xapian::Internal::intrusive_ptr< SubMatch > > leaves
Vector of the items.
Definition: multimatch.h:38
#define false
Definition: header.h:9
std::vector< Xapian::Internal::intrusive_ptr< Internal > > internal
Definition: database.h:81
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
A postlist comprising postlists from different databases merged together.
Definition: mergepostlist.h:35
virtual bool prepare_match(bool nowait, Xapian::Weight::Internal &total_stats)=0
Fetch and collate statistics.
virtual void readahead_for_query(const Xapian::Query &query)
Definition: database.cc:54
const std::vector< Xapian::Internal::opt_intrusive_ptr< Xapian::MatchSpy > > & spies
List of match spies to call, in order.
Definition: multimatch.cc:252
#define rare(COND)
Definition: config.h:575
string sort_key
Used when sorting by value.
void operator()(const Xapian::Document &doc, double wt)
Implementation of virtual operator().
Definition: multimatch.cc:267
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: enquire.h:63
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
const Xapian::Database db
Definition: multimatch.h:40
Xapian::doccount get_dups_ignored() const
Definition: collapser.h:153
Definition: pretty.h:45
MatchSpy implementation.
bool(* MSetCmp)(const Xapian::Internal::MSetItem &, const Xapian::Internal::MSetItem &)
Definition: msetcmp.h:28
SubMatch class for a remote database.
virtual base class for branched types of postlist
merge postlists from different databases
Xapian::Weight::Internal class, holding database and term statistics.
Xapian::doccount get_uncollapsed_upper_bound() const
Get uncollapsed_upper_bound from the remote MSet.
Xapian::Enquire::docid_order order
Definition: multimatch.h:52
virtual Xapian::doccount get_termfreq_min() const =0
Get a lower bound on the number of documents indexed by this term.
base class for sub-matchers
double wt
Weight calculated.
Class to hold statistics for a given collection.
string collapse_key
Value which was used to collapse upon.
void set_query(const Xapian::Query &query, Xapian::termcount qlen, Xapian::doccount collapse_max, Xapian::valueno collapse_key, Xapian::Enquire::docid_order order, Xapian::valueno sort_key, Xapian::Enquire::Internal::sort_setting sort_by, bool sort_value_forward, double time_limit, int percent_cutoff, double weight_cutoff, const Xapian::Weight *wtscheme, const Xapian::RSet &omrset, const vector< Xapian::Internal::opt_intrusive_ptr< Xapian::MatchSpy >> &matchspies)
Set the query.
An item resulting from a query.
const Xapian::Enquire::Internal::sort_setting VAL
Definition: multimatch.cc:158
virtual int get_backend_info(string *path) const =0
Get backend information about this database.
void get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, Xapian::MSet &mset, Xapian::Weight::Internal &stats, const Xapian::MatchDecider *mdecider, const Xapian::KeyMaker *sorter)
Run the match and generate an MSet object.
Definition: multimatch.cc:384
bool empty() const
Definition: collapser.h:159
RemoteDatabase is the baseclass for remote database implementations.
SubMatch class for a local database.
static void prepare_sub_matches(vector< intrusive_ptr< SubMatch > > &leaves, Xapian::Weight::Internal &stats)
Prepare some SubMatches.
Definition: multimatch.cc:224
int percent_cutoff
Definition: multimatch.h:48
A document which gets its values from a ValueStreamManager.
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:489
void set_bounds_from_db(const Xapian::Database &db_)
Set the "bounds" stats from Database db.
virtual double get_maxweight() const =0
Return an upper bound on what get_weight() can return.
Define preprocessor symbols for the library version.
const Xapian::Enquire::Internal::sort_setting REL_VAL
Definition: multimatch.cc:156
collapse_result
Enumeration reporting how a document was handled by the Collapser.
Definition: collapser.h:33
double get_percent_factor() const
Get percentage factor - only valid after get_postlist().
bool sort_value_forward
Definition: multimatch.h:58
Base class for matcher decision functor.
Definition: enquire.h:118
bool recalculate_w_max
Internal flag to note that w_max needs to be recalculated while query is running. ...
Definition: multimatch.h:68
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
Xapian::doccount get_collapse_count(const std::string &collapse_key, int percent_cutoff, double min_weight) const
Definition: collapser.cc:111
static void split_rset_by_db(const Xapian::RSet *rset, Xapian::doccount number_of_subdbs, vector< Xapian::RSet > &subrsets)
Split an RSet into several sub rsets, one for each database.
Definition: multimatch.cc:173
bool next_handling_prune(PostList *&pl, double w_min, MultiMatch *matcher)
virtual std::string get_description() const =0
Return a string description of this object.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
double getorrecalc_maxweight(PostList *pl)
get the maxweight that the postlist pl may return, calling recalc_maxweight if recalculate_w_max is s...
Definition: multimatch.cc:358
MSetItem comparison functions.
MSetCmp get_msetcmp_function(Xapian::Enquire::Internal::sort_setting sort_by, bool sort_forward, bool sort_val_reverse)
Select the appropriate msetcmp function.
Definition: msetcmp.cc:117
double weight_cutoff
Definition: multimatch.h:50
bool empty() const
Check if this query is Xapian::Query::MatchNothing.
Definition: query.h:524
The Collapser class tracks collapse keys and the documents they match.
Definition: collapser.h:87
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
Xapian::doccount entries() const
Definition: collapser.h:155
void start_match(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, Xapian::Weight::Internal &total_stats)
Start the match.
const Xapian::Enquire::Internal::sort_setting REL
Definition: multimatch.cc:154
<unistd.h>, but with compat.
Xapian::doccount get_matches_lower_bound() const
Definition: collapser.cc:137
Various assertion macros.
Functions for handling a time or time interval in a double.
#define LOGLINE(a, b)
Definition: debuglog.h:494
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Class representing a query.
Definition: query.h:46
Xapian::Internal::MSetItem old_item
Replaced item when REPLACED is returned by collapse().
Definition: collapser.h:122
bool empty() const
Test if this R-Set is empty.
Definition: omenquire.cc:98
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:81
docid_order
Ordering of docids.
Definition: enquire.h:326
Xapian::doccount collapse_max
Definition: multimatch.h:44
bool timed_out() const
Definition: multimatch.cc:147
A handle representing a document in a Xapian database.
Definition: document.h:61
Class for performing matching on a remote database.
Wrapper around standard unique_ptr template.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487
Virtual base class for key making functors.
Definition: keymaker.h:41
A relevance set (R-Set).
Definition: enquire.h:60
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
Xapian::doccount get_docs_considered() const
Definition: collapser.h:151
Abstract base class for weighting schemes.
Definition: weight.h:35
docids sort in descending order.
Definition: enquire.h:330