xapian-core  2.0.0
matcher.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2026 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "matcher.h"
24 
25 #include "api/enquireinternal.h"
26 #include "api/msetinternal.h"
27 #include "api/rsetinternal.h"
29 #include "deciderpostlist.h"
30 #include "localsubmatch.h"
31 #include "msetcmp.h"
32 #include "omassert.h"
33 #include "postlisttree.h"
34 #include "protomset.h"
35 #include "spymaster.h"
36 #include "valuestreamdocument.h"
37 #include "weight/weightinternal.h"
38 
39 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
40 
41 #ifdef XAPIAN_HAS_REMOTE_BACKEND
43 # include "remotesubmatch.h"
44 # include "socket_utils.h"
45 #endif
46 
47 #include <algorithm>
48 #include <cerrno>
49 #include <cfloat> // For DBL_EPSILON.
50 #include <vector>
51 
52 #ifdef HAVE_POLL_H
53 # include <poll.h>
54 #else
55 # include "safesysselect.h"
56 #endif
57 
58 #ifdef __WIN32__
59 # include "msvcignoreinvalidparam.h"
60 #endif
61 
62 using namespace std;
64 
65 static constexpr auto DOCID = Xapian::Enquire::Internal::DOCID;
66 static constexpr auto REL = Xapian::Enquire::Internal::REL;
68 static constexpr auto VAL = Xapian::Enquire::Internal::VAL;
70 
71 #ifdef XAPIAN_HAS_REMOTE_BACKEND
72 [[noreturn]]
73 static void unimplemented(const char* msg)
74 {
75  throw Xapian::UnimplementedError(msg);
76 }
77 
78 template<typename Action>
79 inline void
81 {
82 #ifdef HAVE_POLL
83  size_t n_remotes = remotes.size();
84  if (n_remotes <= 1) {
85  // We only need to use poll() when there are at least 2 remote
86  // databases we need to wait for.
87  if (n_remotes == 1) {
88  // Just execute action and block if it's not ready.
89  action(remotes[0].get());
90  }
91  return;
92  }
93 
94  unique_ptr<struct pollfd[]> fds(new struct pollfd[n_remotes]);
95  for (size_t i = 0; i != n_remotes; ++i) {
96  fds[i].fd = remotes[i]->get_read_fd();
97  fds[i].events = POLLIN;
98  fds[i].revents = 0;
99  }
100  do {
101  int r = poll(fds.get(), n_remotes, -1);
102  if (r <= 0) {
103  // We shouldn't get a timeout, but if we do retry.
104  if (r == 0 || errno == EINTR || errno == EAGAIN) {
105  continue;
106  }
107  throw Xapian::NetworkError("poll() failed waiting for remotes",
108  errno);
109  }
110  size_t i = 0;
111  while (i != n_remotes) {
112  if (fds[i].revents) {
113  action(remotes[i].get());
114  // Swap such that entries we still need to handle are first.
115  swap(remotes[i], remotes[--n_remotes]);
116  fds[i] = fds[n_remotes];
117  // r is number of ready fds.
118  if (--r == 0) break;
119  } else {
120  ++i;
121  }
122  }
123  } while (n_remotes > 1);
124 
125  // If there's only one remote left just execute action and block if it's
126  // not ready.
127  if (n_remotes == 1) {
128  action(remotes[0].get());
129  }
130 #else
131  size_t n_remotes = first_nonselectable;
132  fd_set fds;
133  while (n_remotes > 1) {
134  int nfds = 0;
135  FD_ZERO(&fds);
136  for (size_t i = 0; i != n_remotes; ++i) {
137  int fd = remotes[i]->get_read_fd();
138  FD_SET(fd, &fds);
139  if (fd >= nfds) nfds = fd + 1;
140  }
141 
142  int r = select(nfds, &fds, NULL, NULL, NULL);
143  if (r <= 0) {
144  int eno = socket_errno();
145  // We shouldn't get a timeout, but if we do retry.
146  if (r == 0 || eno == EINTR || eno == EAGAIN) {
147  continue;
148  }
149  throw Xapian::NetworkError("select() failed waiting for remotes",
150  eno);
151  }
152  size_t i = 0;
153  while (i != n_remotes) {
154  int fd = remotes[i]->get_read_fd();
155  if (FD_ISSET(fd, &fds)) {
156  action(remotes[i].get());
157  // Swap such that entries we still need to handle are first.
158  swap(remotes[i], remotes[--n_remotes]);
159  // r is number of ready fds.
160  if (--r == 0) break;
161  } else {
162  ++i;
163  }
164  }
165  }
166 
167  // If there's only one remote left just execute action and block if it's
168  // not ready.
169  if (n_remotes == 1) {
170  action(remotes[0].get());
171  }
172 
173  // Handle any remotes which we couldn't pass to select().
174  for (size_t i = first_nonselectable; i != remotes.size(); ++i) {
175  action(remotes[i].get());
176  }
177 #endif
178 }
179 #endif
180 
182  const Xapian::Query& query,
183  Xapian::termcount query_length,
184  const Xapian::RSet* rset,
186  const Xapian::Weight& wtscheme,
187  bool have_mdecider,
188  Xapian::valueno collapse_key,
189  Xapian::doccount collapse_max,
190  int percent_threshold,
191  double weight_threshold,
193  Xapian::valueno sort_key,
195  bool sort_val_reverse,
196  double time_limit,
197  const vector<opt_intrusive_ptr<Xapian::MatchSpy>>& matchspies)
198  : db(db_)
199 {
200  // An empty query should get handled higher up.
201  Assert(!query.empty());
202 
203  Xapian::doccount n_shards = db.internal->size();
204  vector<Xapian::RSet> subrsets;
205  if (rset && rset->internal) {
206  rset->internal->shard(n_shards, subrsets);
207  } else {
208  subrsets.resize(n_shards);
209  }
210 
211  for (Xapian::doccount i = 0; i != n_shards; ++i) {
212  const Xapian::Database::Internal *subdb = db.internal.get();
213  if (n_shards > 1) {
214  auto multidb = static_cast<const MultiDatabase*>(subdb);
215  subdb = multidb->shards[i];
216  }
217  Assert(subdb);
218 #ifdef XAPIAN_HAS_REMOTE_BACKEND
219  if (subdb->get_backend_info(NULL) == BACKEND_REMOTE) {
220  auto as_rem = static_cast<const RemoteDatabase*>(subdb);
221  if (have_mdecider) {
222  unimplemented("Xapian::MatchDecider not supported by the "
223  "remote backend");
224  }
225  as_rem->set_query(query, query_length,
226  collapse_key, collapse_max,
227  order, sort_key, sort_by, sort_val_reverse,
228  time_limit,
229  n_shards == 1 ? percent_threshold : 0,
230  weight_threshold,
231  wtscheme,
232  subrsets[i], matchspies);
233  remotes.emplace_back(new RemoteSubMatch(as_rem, i));
234  continue;
235  }
236 #else
237  // Avoid unused parameter warnings.
238  (void)have_mdecider;
239  (void)collapse_key;
240  (void)collapse_max;
241  (void)percent_threshold;
242  (void)weight_threshold;
243  (void)order;
244  (void)sort_key;
245  (void)sort_by;
246  (void)sort_val_reverse;
247  (void)time_limit;
248  (void)matchspies;
249 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
250  if (locals.size() != i)
251  locals.resize(i);
252  locals.emplace_back(new LocalSubMatch(subdb, query, query_length,
253  wtscheme,
254  i));
255  subdb->readahead_for_query(query);
256  }
257 
258  if (!locals.empty() && locals.size() != n_shards)
259  locals.resize(n_shards);
260 
261 #ifdef XAPIAN_HAS_REMOTE_BACKEND
262 # ifndef HAVE_POLL
263 # ifndef __WIN32__
264  {
265  // Unfortunately POSIX select() can't monitor fds >= FD_SETSIZE, so
266  // swap those to the end here and then handle those last letting them
267  // just block if not ready.
268  first_nonselectable = remotes.size();
269  size_t i = 0;
270  while (i != first_nonselectable) {
271  int fd = remotes[i]->get_read_fd();
272  if (fd >= FD_SETSIZE) {
273  swap(remotes[i], remotes[--first_nonselectable]);
274  } else {
275  ++i;
276  }
277  }
278  }
279 # else
280  {
281  // We can only use select() on sockets under __WIN32__, but fds for
282  // remote prog databases aren't sockets, so go through and check if
283  // each fd is a socket or not, and swap the non-sockets to the end here
284  // and then handle those last letting them just block if not ready.
285  //
286  // FIXME: Perhaps we should use WaitForMultipleObjects() to allow
287  // waiting in parallel for prog databases too, but that seems a bit
288  // tricky to hook up as it probably needs an async ReadFile() to be
289  // active.
290  MSVCIgnoreInvalidParameter invalid_handle_value_is_ok;
291  first_nonselectable = remotes.size();
292  size_t i = 0;
293  while (i != first_nonselectable) {
294  int fd = remotes[i]->get_read_fd();
295  HANDLE handle = (HANDLE)_get_osfhandle(fd);
296  if (handle != INVALID_HANDLE_VALUE) {
297  // This fd isn't a socket.
298  swap(remotes[i], remotes[--first_nonselectable]);
299  } else {
300  ++i;
301  // On __WIN32__ FD_SETSIZE is the maximum number of sockets
302  // which can be added to an fd_set. It seems to be 64, so
303  // it's a case that's possible to trigger.
304  if (i == FD_SETSIZE) {
306  }
307  }
308  }
309  }
310 # endif
311 # endif
312 #endif
313 
314  stats.set_query(query);
315 
316  /* To improve overall performance in the case of searches over a mix of
317  * local and remote shards we set the queries for remote shards above,
318  * then prepare local shards here, then finish preparing remote shards
319  * below.
320  */
321 
322  if (!locals.empty()) {
323  // Prepare local matches.
324  for (Xapian::doccount i = 0; i != n_shards; ++i) {
325  auto submatch = locals[i].get();
326  if (submatch) {
327  submatch->prepare_match(subrsets[i], stats);
328  }
329  }
330  }
331 
332 #ifdef XAPIAN_HAS_REMOTE_BACKEND
334  [&](RemoteSubMatch* submatch) {
335  submatch->prepare_match(stats);
336  });
337 #endif
338 }
339 
342  Xapian::doccount maxitems,
343  Xapian::doccount check_at_least,
344  const Xapian::Weight& wtscheme,
345  const Xapian::MatchDecider* mdecider,
346  const Xapian::KeyMaker* sorter,
347  Xapian::valueno collapse_key,
348  Xapian::doccount collapse_max,
349  int percent_threshold,
350  double percent_threshold_factor,
351  double weight_threshold,
353  Xapian::valueno sort_key,
355  bool sort_val_reverse,
356  double time_limit,
357  const vector<opt_ptr_spy>& matchspies)
358 {
359  Assert(!locals.empty());
360 
361  ValueStreamDocument vsdoc(db);
362  ++vsdoc._refs;
363  Xapian::Document doc(&vsdoc);
364 
365  vector<PostList*> postlists;
366  postlists.reserve(locals.size());
367  PostListTree pltree(vsdoc, db, wtscheme);
368  Xapian::termcount total_subqs = 0;
375  Xapian::VecUniquePtr<EstimateOp> estimates(locals.size());
376  try {
377  bool all_null = true;
378  for (size_t i = 0; i != locals.size(); ++i) {
379  if (!locals[i]) {
380  postlists.push_back(nullptr);
381  estimates.push_back(nullptr);
382  continue;
383  }
384  // Pick the highest total subqueries answer amongst the
385  // subdatabases, as the query to postlist conversion doesn't
386  // recurse into positional queries for shards that don't have
387  // positional data when at least one other shard does.
388  Xapian::termcount total_subqs_i = 0;
389  PostListAndEstimate plest = locals[i]->get_postlist(&pltree,
390  &total_subqs_i);
391  total_subqs = max(total_subqs, total_subqs_i);
392  if (plest.pl != nullptr) {
393  all_null = false;
394  if (mdecider) {
395  plest.est.reset(new EstimateOp(EstimateOp::DECIDER,
396  plest.est.release()));
397  if (check_at_least) {
398  // No point creating the DeciderPostList if we aren't
399  // actually going to run the match.
400  plest.pl = new DeciderPostList(plest.pl,
401  plest.est.get(),
402  mdecider, &vsdoc,
403  &pltree);
404  }
405  }
406  }
407  postlists.push_back(plest.pl);
408  estimates.push_back(plest.est.release());
409  }
410  Assert(!postlists.empty());
411 
412  if (all_null) {
413  vector<Result> dummy;
414  return Xapian::MSet(new Xapian::MSet::Internal(first, 0, 0, 0, 0,
415  0, 0, 0.0, 0.0,
416  std::move(dummy),
417  0));
418  }
419  } catch (...) {
420  for (auto pl : postlists) delete pl;
421  throw;
422  }
423 
424  Xapian::doccount n_shards = postlists.size();
425  pltree.set_postlists(&postlists[0], n_shards);
426 
427  // The highest weight a document could get in this match.
428  const double max_possible = pltree.recalc_maxweight();
429 
430  if (max_possible == 0.0) {
431  // All the weights are zero.
432  if (sort_by == REL) {
433  // We're only sorting by DOCID.
434  sort_by = DOCID;
435  } else if (sort_by == REL_VAL || sort_by == VAL_REL) {
436  // Normalise REL_VAL and VAL_REL to VAL, to avoid needlessly
437  // fetching and comparing weights.
438  sort_by = VAL;
439  }
440  // All percentages will be 100% so turn off any percentage cut-off.
441  percent_threshold = 0;
442  percent_threshold_factor = 0.0;
443  }
444 
445  // Check if any results have been asked for (might just be wanting
446  // maxweight).
447  if (check_at_least == 0) {
448  // Explicitly delete all PostList objects so they report any stats to
449  // the EstimateOp objects.
450  pltree.delete_postlists();
451  Xapian::doccount matches_lower_bound = 0;
452  Xapian::doccount matches_estimated = 0;
453  Xapian::doccount matches_upper_bound = 0;
454  for (size_t i = 0; i != estimates.size(); ++i) {
455  if (estimates[i]) {
456  Assert(locals[i].get());
457  Estimates e = locals[i]->resolve(estimates[i]);
458  matches_lower_bound += e.min;
459  matches_estimated += e.est;
460  matches_upper_bound += e.max;
461  }
462  }
463 
464  if (mdecider) {
465  matches_lower_bound = 0;
466  }
467 
468  Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
469  if (collapse_max) {
470  // Lower bound must be set to no more than collapse_max, since it's
471  // possible that all matching documents have the same collapse_key
472  // value and so are collapsed together.
473  if (matches_lower_bound > collapse_max)
474  matches_lower_bound = collapse_max;
475  }
476 
477  vector<Result> dummy;
478  return Xapian::MSet(new Xapian::MSet::Internal(first,
479  matches_upper_bound,
480  matches_lower_bound,
481  matches_estimated,
482  matches_upper_bound,
483  uncollapsed_lower_bound,
484  matches_estimated,
485  max_possible,
486  0.0,
487  std::move(dummy),
488  0));
489  }
490 
491  SpyMaster spymaster(&matchspies);
492 
493  bool sort_forward = (order != Xapian::Enquire::DESCENDING);
494  auto mcmp = get_msetcmp_function(sort_by, sort_forward, sort_val_reverse);
495 
496  // Can we stop once the ProtoMSet is full?
497  bool stop_once_full = (sort_forward &&
498  n_shards == 1 &&
499  sort_by == DOCID);
500 
501  ProtoMSet proto_mset(first, maxitems, check_at_least,
502  mcmp, sort_by, total_subqs,
503  pltree,
504  collapse_key, collapse_max,
505  percent_threshold, percent_threshold_factor,
506  max_possible,
507  stop_once_full,
508  time_limit);
509  proto_mset.set_new_min_weight(weight_threshold);
510 
511  while (true) {
512  double min_weight = proto_mset.get_min_weight();
513  if (!pltree.next(min_weight)) {
514  break;
515  }
516 
517  // The weight calculation can be expensive enough that it's worth being
518  // lazy and only calculating it once we know we need to. If sort_by
519  // is DOCID then all weights are zero.
520  double weight = 0.0;
521  bool calculated_weight = (sort_by == DOCID);
522  if (!calculated_weight) {
523  if (sort_by != VAL || min_weight > 0.0) {
524  weight = pltree.get_weight();
525  if (weight < min_weight) {
526  continue;
527  }
528  calculated_weight = true;
529  }
530  }
531 
532  Xapian::docid did = pltree.get_docid();
533  vsdoc.set_document(did);
534  Result new_item(weight, did);
535 
536  if (sort_by != DOCID && sort_by != REL) {
537  if (sorter) {
538  new_item.set_sort_key((*sorter)(doc));
539  } else {
540  new_item.set_sort_key(vsdoc.get_value(sort_key));
541  }
542 
543  if (proto_mset.early_reject(new_item, calculated_weight, spymaster,
544  doc))
545  continue;
546  }
547 
548  // Apply any MatchSpy objects.
549  if (spymaster) {
550  if (!calculated_weight) {
551  weight = pltree.get_weight();
552  new_item.set_weight(weight);
553  calculated_weight = true;
554  }
555  spymaster(doc, weight);
556  }
557 
558  if (!calculated_weight) {
559  weight = pltree.get_weight();
560  new_item.set_weight(weight);
561  }
562 
563  if (!proto_mset.process(std::move(new_item), vsdoc))
564  break;
565  }
566 
567  // Explicitly delete all PostList objects so they report any stats to
568  // the EstimateOp objects.
569  pltree.delete_postlists();
570 
571  return proto_mset.finalise(mdecider,
572  locals,
573  estimates);
574 }
575 
578  Xapian::doccount maxitems,
579  Xapian::doccount check_at_least,
581  const Xapian::Weight& wtscheme,
582  const Xapian::MatchDecider* mdecider,
583  const Xapian::KeyMaker* sorter,
584  Xapian::valueno collapse_key,
585  Xapian::doccount collapse_max,
586  int percent_threshold,
587  double weight_threshold,
589  Xapian::valueno sort_key,
591  bool sort_val_reverse,
592  double time_limit,
593  const vector<opt_intrusive_ptr<Xapian::MatchSpy>>& matchspies)
594 {
595  AssertRel(check_at_least, >=, first + maxitems);
596 
597 #ifdef XAPIAN_HAS_REMOTE_BACKEND
598  if (locals.empty() && remotes.size() == 1) {
599  // Short cut for a single remote database.
600  Assert(remotes[0]);
601  remotes[0]->start_match(first, maxitems, check_at_least, sorter,
602  stats);
603  return remotes[0]->get_mset(matchspies);
604  }
605 #endif
606 
607  // Factor to multiply maximum weight seen by to get the minimum weight we
608  // need to consider.
609  double percent_threshold_factor = percent_threshold / 100.0;
610  // Corresponding correction to that in api/mset.cc to account for excess
611  // precision on x86.
612  percent_threshold_factor -= DBL_EPSILON;
613 
614 #ifdef XAPIAN_HAS_REMOTE_BACKEND
615  for (auto&& submatch : remotes) {
616  Assert(submatch);
617  // We need to fetch the first "first" results too, as merging may push
618  // those down into the part of the merged MSet we care about.
619  Xapian::doccount remote_maxitems = first + maxitems;
620  if (collapse_max != 0) {
621  // If collapsing we need to fetch all check_at_least items in order
622  // to satisfy the requirement that if there are <= check_at_least
623  // results then then estimated number of matches is exact.
624  AssertRel(check_at_least, >=, first + maxitems);
625  remote_maxitems = check_at_least;
626  }
627  submatch->start_match(0, remote_maxitems, check_at_least, sorter,
628  stats);
629  }
630 #endif
631 
632  Xapian::MSet local_mset;
633  if (!locals.empty()) {
634  for (auto&& submatch : locals) {
635  if (submatch)
636  submatch->start_match(stats);
637  }
638 
639  Xapian::doccount local_first = first;
640  Xapian::doccount local_maxitems = maxitems;
641  double local_percent_threshold_factor = percent_threshold_factor;
642 #ifdef XAPIAN_HAS_REMOTE_BACKEND
643  if (!remotes.empty()) {
644  // We need to fetch the first "first" results too, as merging may
645  // push those down into the part of the merged MSet we care about.
646  local_first = 0;
647  local_maxitems = first + maxitems;
648  if (collapse_max != 0) {
649  // If collapsing we need to fetch all check_at_least items in
650  // order to satisfy the requirement that if there are <=
651  // check_at_least results then then estimated number of matches
652  // is exact. FIXME: Can we avoid this for the local shard by
653  // making use of information in the Collapser?
654  AssertRel(check_at_least, >=, first + maxitems);
655  local_maxitems = check_at_least;
656  }
657  local_percent_threshold_factor = 0.0;
658  }
659 #endif
660  local_mset = get_local_mset(local_first, local_maxitems, check_at_least,
661  wtscheme, mdecider,
662  sorter, collapse_key, collapse_max,
663  percent_threshold,
664  local_percent_threshold_factor,
665  weight_threshold, order, sort_key, sort_by,
666  sort_val_reverse, time_limit, matchspies);
667  }
668 
669 #ifdef XAPIAN_HAS_REMOTE_BACKEND
670  if (remotes.empty()) {
671  // Another easy case - only local databases.
672  return local_mset;
673  }
674 
675  // We need to merge MSet objects. We only need the number of remote shards
676  // + 1 if there are any local shards, so reserving n_shards may be more
677  // than we need.
678  vector<pair<Xapian::MSet, Xapian::doccount>> msets;
679  Xapian::MSet merged_mset;
681  [&](RemoteSubMatch* submatch) {
682  Xapian::MSet remote_mset = submatch->get_mset(matchspies);
683  merged_mset.internal->merge_stats(remote_mset.internal.get(),
684  collapse_max != 0);
685  auto& merged_stats = merged_mset.internal->stats;
686  if (!merged_stats) {
687  merged_stats = std::move(remote_mset.internal->stats);
688  } else {
689  merged_stats->merge(*(remote_mset.internal->stats));
690  }
691  if (remote_mset.empty()) {
692  return;
693  }
694  remote_mset.internal->unshard_docids(submatch->get_shard(),
695  db.internal->size());
696  msets.push_back({remote_mset, 0});
697  });
698 
699  if (!locals.empty()) {
700  if (!local_mset.empty())
701  msets.push_back({local_mset, 0});
702  merged_mset.internal->merge_stats(local_mset.internal.get(),
703  collapse_max != 0);
704  merged_mset.internal->stats->merge(stats);
705  }
706 
707  if (merged_mset.internal->max_possible == 0.0) {
708  // All the weights are zero.
709  if (sort_by == REL) {
710  // We're only sorting by DOCID.
711  sort_by = DOCID;
712  } else if (sort_by == REL_VAL || sort_by == VAL_REL) {
713  // Normalise REL_VAL and VAL_REL to VAL, to avoid needlessly
714  // fetching and comparing weights.
715  sort_by = VAL;
716  }
717  // All percentages will be 100% so turn off any percentage cut-off.
718  percent_threshold = 0;
719  percent_threshold_factor = 0.0;
720  }
721 
722  bool sort_forward = (order != Xapian::Enquire::DESCENDING);
723  auto mcmp = get_msetcmp_function(sort_by, sort_forward, sort_val_reverse);
724  auto heap_cmp =
725  [&](const pair<Xapian::MSet, Xapian::doccount>& a,
726  const pair<Xapian::MSet, Xapian::doccount>& b) {
727  return mcmp(b.first.internal->items[b.second],
728  a.first.internal->items[a.second]);
729  };
730 
731  Heap::make(msets.begin(), msets.end(), heap_cmp);
732 
733  double min_weight = 0.0;
734  if (percent_threshold) {
735  min_weight = percent_threshold_factor * 100.0 /
736  merged_mset.internal->percent_scale_factor;
737  }
738 
739  CollapserLite collapser(collapse_max);
740  merged_mset.internal->first = first;
741  while (!msets.empty() && merged_mset.size() != maxitems) {
742  auto& front = msets.front();
743  auto& result = front.first.internal->items[front.second];
744  if (percent_threshold) {
745  if (result.get_weight() < min_weight) {
746  // FIXME: This will need adjusting if we ever support
747  // percentage thresholds when sorting primarily by value.
748  break;
749  }
750  }
751  if (!collapser || collapser.add(result.get_collapse_key())) {
752  if (first) {
753  // Skip the first "first" results from the merge - we had to
754  // also fetch the first "first" results from each shard, as
755  // merging may push those down into the part of the merged MSet
756  // we care about.
757  --first;
758  } else {
759  merged_mset.internal->items.push_back(std::move(result));
760  }
761  }
762  auto n = front.second + 1;
763  if (n == front.first.size()) {
764  Heap::pop(msets.begin(), msets.end(), heap_cmp);
765  msets.resize(msets.size() - 1);
766  } else {
767  front.second = n;
768  Heap::replace(msets.begin(), msets.end(), heap_cmp);
769  }
770  }
771 
772  if (collapser) {
773  auto todo = check_at_least - maxitems;
774  if (merged_mset.size() != maxitems) {
775  todo = 0;
776  }
777  for ( ; !msets.empty() && todo; --todo) {
778  auto& front = msets.front();
779  auto& result = front.first.internal->items[front.second];
780  if (percent_threshold) {
781  if (result.get_weight() < min_weight) {
782  // FIXME: This will need adjusting if we ever support
783  // percentage thresholds when sorting primarily by value.
784  break;
785  }
786  }
787  (void)collapser.add(result.get_collapse_key());
788  auto n = front.second + 1;
789  if (n == front.first.size()) {
790  Heap::pop(msets.begin(), msets.end(), heap_cmp);
791  msets.resize(msets.size() - 1);
792  } else {
793  front.second = n;
794  Heap::replace(msets.begin(), msets.end(), heap_cmp);
795  }
796  }
797 
798  auto mseti = merged_mset.internal;
799  collapser.finalise(mseti->items, percent_threshold);
800 
801  if (check_at_least > 0) {
802  // Each input MSet object to the merge has already been collapsed
803  // and merge_stats() above will have set mset->matches_lower_bound
804  // to the maximum matches_lower_bound of any input, which provides
805  // a lower bound.
806  //
807  // In some cases, the collapser can provide a better lower bound.
808  auto collapser_lb = collapser.get_matches_lower_bound();
809  if (mseti->matches_upper_bound <= check_at_least) {
810  mseti->matches_lower_bound = collapser_lb;
811  mseti->matches_estimated = collapser_lb;
812  mseti->matches_upper_bound = collapser_lb;
813  return merged_mset;
814  }
815 
816  mseti->matches_lower_bound = max(mseti->matches_lower_bound,
817  collapser_lb);
818  }
819 
820  double unique_rate = 1.0;
821 
822  Xapian::doccount docs_considered = collapser.get_docs_considered();
823  Xapian::doccount dups_ignored = collapser.get_dups_ignored();
824  if (docs_considered > 0) {
825  // Scale the estimate by the rate at which we've been finding
826  // unique documents while merging MSet objects.
827  double unique = double(docs_considered - dups_ignored);
828  unique_rate = unique / double(docs_considered);
829  }
830 
831  // We can safely reduce the upper bound by the number of duplicates
832  // we've seen while merging MSet objects.
833  mseti->matches_upper_bound -= collapser.get_dups_ignored();
834 
835  double estimate_scale = unique_rate;
836 
837  if (estimate_scale != 1.0) {
838  auto l = mseti->matches_lower_bound;
839  auto u = mseti->matches_upper_bound;
840  auto e = l + Xapian::doccount((u - l) * estimate_scale + 0.5);
841  mseti->matches_estimated = e;
842  }
843 
844  // Clamp the estimate the range given by the bounds.
845  AssertRel(mseti->matches_lower_bound, <=, mseti->matches_upper_bound);
846  mseti->matches_estimated = std::clamp(mseti->matches_estimated,
847  mseti->matches_lower_bound,
848  mseti->matches_upper_bound);
849  }
850 
851  return merged_mset;
852 #else
853  return local_mset;
854 #endif
855 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
if(!(properties &BACKEND))
Definition: api_collated.h:3
@ BACKEND_REMOTE
Definition: backends.h:27
Simpler version of Collapser used when merging MSet objects.
Definition: collapser.h:260
PostList which applies a MatchDecider.
Class for estimating the total number of matching documents.
Definition: estimateop.h:64
Xapian::MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, Xapian::Weight::Internal &stats, const Xapian::Weight &wtscheme, const Xapian::MatchDecider *mdecider, const Xapian::KeyMaker *sorter, Xapian::valueno collapse_key, Xapian::doccount collapse_max, int percent_threshold, double weight_threshold, Xapian::Enquire::docid_order order, Xapian::valueno sort_key, Xapian::Enquire::Internal::sort_setting sort_by, bool sort_val_reverse, double time_limit, const std::vector< opt_ptr_spy > &matchspies)
Run the match and produce an MSet object.
Definition: matcher.cc:577
std::vector< std::unique_ptr< LocalSubMatch > > locals
LocalSubMatch objects for local databases.
Definition: matcher.h:56
Xapian::Database db
Definition: matcher.h:49
std::size_t first_nonselectable
Partition point in remotes.
Definition: matcher.h:86
Xapian::MSet get_local_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, const Xapian::Weight &wtscheme, const Xapian::MatchDecider *mdecider, const Xapian::KeyMaker *sorter, Xapian::valueno collapse_key, Xapian::doccount collapse_max, int percent_threshold, double percent_threshold_factor, double weight_threshold, Xapian::Enquire::docid_order order, Xapian::valueno sort_key, Xapian::Enquire::Internal::sort_setting sort_by, bool sort_val_reverse, double time_limit, const std::vector< opt_ptr_spy > &matchspies)
Definition: matcher.cc:341
std::vector< std::unique_ptr< RemoteSubMatch > > remotes
RemoteSubMatch objects for remote databases.
Definition: matcher.h:78
void for_all_remotes(Action action)
Perform action on remotes as they become ready using poll() or select().
Definition: matcher.cc:80
Matcher(const Matcher &)=delete
Sharded database backend.
void set_postlists(PostList **pls, Xapian::doccount n_shards_)
Definition: postlisttree.h:94
Xapian::docid get_docid() const
Definition: postlisttree.h:129
double recalc_maxweight()
Definition: postlisttree.h:111
bool next(double w_min)
Return false if we're done.
Definition: postlisttree.h:144
double get_weight() const
Definition: postlisttree.h:137
void delete_postlists()
Delete all the PostList objects.
Definition: postlisttree.h:77
bool process(Result &&new_item, ValueStreamDocument &vsdoc)
Process new_item.
Definition: protomset.h:301
bool early_reject(Result &new_item, bool calculated_weight, SpyMaster &spymaster, const Xapian::Document &doc)
Definition: protomset.h:250
void set_new_min_weight(double min_wt)
Definition: protomset.h:432
double get_min_weight() const
Definition: protomset.h:174
Xapian::MSet finalise(const Xapian::MatchDecider *mdecider, const std::vector< std::unique_ptr< LocalSubMatch >> &locals, const Xapian::VecUniquePtr< EstimateOp > &estimates)
Definition: protomset.h:491
RemoteDatabase is the baseclass for remote database implementations.
Class for performing matching on a remote database.
void start_match(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, const Xapian::KeyMaker *sorter, const Xapian::Weight::Internal &total_stats)
Start the match.
Xapian::doccount get_shard() const
Return the index of the corresponding Database shard.
Xapian::MSet get_mset(const std::vector< opt_ptr_spy > &matchspies)
Get MSet.
void prepare_match(Xapian::Weight::Internal &total_stats)
Fetch and collate statistics.
A result in an MSet.
Definition: result.h:30
void set_weight(double weight_)
Definition: result.h:78
void set_sort_key(const std::string &k)
Definition: result.h:84
A document which gets its values from a ValueStreamManager.
void set_document(Xapian::docid did_)
std::string get_value(Xapian::valueno slot) const
Virtual base class for Database internals.
virtual int get_backend_info(std::string *path) const =0
Get backend information about this database.
virtual void readahead_for_query(const Query &query) const
An indexed database of documents.
Definition: database.h:75
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: database.h:95
Class representing a document.
Definition: document.h:64
docid_order
Ordering of docids.
Definition: enquire.h:130
@ DESCENDING
docids sort in descending order.
Definition: enquire.h:134
unsigned _refs
Reference count.
Definition: intrusive_ptr.h:74
A smart pointer that optionally uses intrusive reference counting.
Virtual base class for key making functors.
Definition: keymaker.h:44
Xapian::MSet internals.
Definition: msetinternal.h:44
Class representing a list of search results.
Definition: mset.h:46
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: mset.h:78
bool empty() const
Return true if this MSet object is empty.
Definition: mset.h:467
Abstract base class for match deciders.
Definition: matchdecider.h:37
Indicates a problem communicating with a remote database.
Definition: error.h:791
Class representing a query.
Definition: query.h:45
bool empty() const noexcept
Check if this query is Xapian::Query::MatchNothing.
Definition: query.h:661
Class representing a set of documents judged as relevant.
Definition: rset.h:39
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: rset.h:42
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Suitable for "simple" type T.
Definition: smallvector.h:62
void push_back(T elt)
Definition: smallvector.h:190
size_type size() const
Definition: smallvector.h:135
Class to hold statistics for a given collection.
void set_query(const Xapian::Query &query_)
Abstract base class for weighting schemes.
Definition: weight.h:38
PostList which applies a MatchDecider.
Xapian::Enquire internals.
SubMatch class for a local database.
static void unimplemented(const char *msg)
Definition: matcher.cc:73
static constexpr auto DOCID
Definition: matcher.cc:65
static constexpr auto REL
Definition: matcher.cc:66
static constexpr auto VAL_REL
Definition: matcher.cc:69
static constexpr auto REL_VAL
Definition: matcher.cc:67
static constexpr auto VAL
Definition: matcher.cc:68
Matcher class.
MSetCmp get_msetcmp_function(Xapian::Enquire::Internal::sort_setting sort_by, bool sort_forward, bool sort_val_reverse)
Select the appropriate msetcmp function.
Definition: msetcmp.cc:100
Result comparison functions.
Xapian::MSet internals.
Work around MSVC's unhelpful non-standard invalid parameter handling.
Sharded database backend.
void pop(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:213
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Class for managing a tree of PostList objects.
ProtoMSet class.
RemoteDatabase is the baseclass for remote database implementations.
SubMatch class for a remote database.
Set of documents judged as relevant.
include <sys/select.h> with portability workarounds.
Socket handling utilities.
int socket_errno()
Definition: socket_utils.h:121
Class for managing MatchSpy objects during the match.
Xapian::doccount est
Definition: estimateop.h:33
Xapian::doccount min
Definition: estimateop.h:33
Xapian::doccount max
Definition: estimateop.h:33
std::unique_ptr< EstimateOp > est
Definition: estimateop.h:219
A document which gets its values from a ValueStreamManager.
Define preprocessor symbols for the library version.
const char * dummy[]
Definition: version_h.cc:7
Xapian::Weight::Internal class, holding database and term statistics.