00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <config.h>
00027
00028 #include "multimatch.h"
00029
00030 #include "autoptr.h"
00031 #include "collapser.h"
00032 #include "debuglog.h"
00033 #include "submatch.h"
00034 #include "localsubmatch.h"
00035 #include "omassert.h"
00036 #include "omenquireinternal.h"
00037
00038 #include "emptypostlist.h"
00039 #include "branchpostlist.h"
00040 #include "mergepostlist.h"
00041
00042 #include "document.h"
00043 #include "omqueryinternal.h"
00044
00045 #include "submatch.h"
00046
00047 #include "msetcmp.h"
00048
00049 #include "valuestreamdocument.h"
00050 #include "weightinternal.h"
00051
00052 #include <xapian/errorhandler.h>
00053 #include <xapian/matchspy.h>
00054 #include <xapian/version.h>
00055
00056 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00057 #include "remotesubmatch.h"
00058 #include "remote-database.h"
00059 #endif
00060
00061 #include <algorithm>
00062 #include <cfloat>
00063 #include <climits>
00064 #include <vector>
00065 #include <map>
00066 #include <set>
00067
00068 using namespace std;
00069
00070 const Xapian::Enquire::Internal::sort_setting REL =
00071 Xapian::Enquire::Internal::REL;
00072 const Xapian::Enquire::Internal::sort_setting REL_VAL =
00073 Xapian::Enquire::Internal::REL_VAL;
00074 const Xapian::Enquire::Internal::sort_setting VAL =
00075 Xapian::Enquire::Internal::VAL;
00076 #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
00077 const Xapian::Enquire::Internal::sort_setting VAL_REL =
00078 Xapian::Enquire::Internal::VAL_REL;
00079 #endif
00080
00088 static void
00089 split_rset_by_db(const Xapian::RSet * rset,
00090 Xapian::doccount number_of_subdbs,
00091 vector<Xapian::RSet> & subrsets)
00092 {
00093 LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
00094 if (rset) {
00095 if (number_of_subdbs == 1) {
00096
00097 subrsets.push_back(*rset);
00098 } else {
00099
00100
00101 subrsets.reserve(number_of_subdbs);
00102 for (size_t i = 0; i < number_of_subdbs; ++i) {
00103 subrsets.push_back(Xapian::RSet());
00104 }
00105
00106 const set<Xapian::docid> & rsetitems = rset->internal->get_items();
00107 set<Xapian::docid>::const_iterator j;
00108 for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
00109 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
00110 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
00111 subrsets[subdatabase].add_document(local_docid);
00112 }
00113 }
00114 } else {
00115
00116 subrsets.resize(number_of_subdbs);
00117 }
00118 Assert(subrsets.size() == number_of_subdbs);
00119 }
00120
00139 static void
00140 prepare_sub_matches(vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
00141 Xapian::ErrorHandler * errorhandler,
00142 Xapian::Weight::Internal & stats)
00143 {
00144 LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | errorhandler | stats);
00145
00146 vector<bool> prepared;
00147 prepared.resize(leaves.size(), false);
00148 size_t unprepared = leaves.size();
00149 bool nowait = true;
00150 while (unprepared) {
00151 for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
00152 if (prepared[leaf]) continue;
00153 try {
00154 SubMatch * submatch = leaves[leaf].get();
00155 if (!submatch || submatch->prepare_match(nowait, stats)) {
00156 prepared[leaf] = true;
00157 --unprepared;
00158 }
00159 } catch (Xapian::Error & e) {
00160 if (!errorhandler) throw;
00161
00162 LOGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
00163 (*errorhandler)(e);
00164
00165 leaves[leaf] = NULL;
00166 prepared[leaf] = true;
00167 --unprepared;
00168 }
00169 }
00170
00171
00172 nowait = false;
00173 }
00174 }
00175
00177 class MultipleMatchSpy : public Xapian::MatchSpy {
00178 private:
00180 const std::vector<Xapian::MatchSpy *> & spies;
00181
00182 public:
00183 MultipleMatchSpy(const std::vector<Xapian::MatchSpy *> & spies_)
00184 : spies(spies_) {}
00185
00190 void operator()(const Xapian::Document &doc, Xapian::weight wt);
00191 };
00192
00193 void
00194 MultipleMatchSpy::operator()(const Xapian::Document &doc, Xapian::weight wt) {
00195 LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
00196 vector<Xapian::MatchSpy *>::const_iterator i;
00197 for (i = spies.begin(); i != spies.end(); ++i) {
00198 (**i)(doc, wt);
00199 }
00200 }
00201
00203
00205 MultiMatch::MultiMatch(const Xapian::Database &db_,
00206 const Xapian::Query::Internal * query_,
00207 Xapian::termcount qlen,
00208 const Xapian::RSet * omrset,
00209 Xapian::doccount collapse_max_,
00210 Xapian::valueno collapse_key_,
00211 int percent_cutoff_, Xapian::weight weight_cutoff_,
00212 Xapian::Enquire::docid_order order_,
00213 Xapian::valueno sort_key_,
00214 Xapian::Enquire::Internal::sort_setting sort_by_,
00215 bool sort_value_forward_,
00216 Xapian::ErrorHandler * errorhandler_,
00217 Xapian::Weight::Internal & stats,
00218 const Xapian::Weight * weight_,
00219 const vector<Xapian::MatchSpy *> & matchspies_,
00220 bool have_sorter, bool have_mdecider)
00221 : db(db_), query(query_),
00222 collapse_max(collapse_max_), collapse_key(collapse_key_),
00223 percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
00224 order(order_),
00225 sort_key(sort_key_), sort_by(sort_by_),
00226 sort_value_forward(sort_value_forward_),
00227 errorhandler(errorhandler_), weight(weight_),
00228 is_remote(db.internal.size()),
00229 matchspies(matchspies_)
00230 {
00231 LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | errorhandler_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);
00232
00233 if (!query) return;
00234 query->validate_query();
00235
00236 Xapian::doccount number_of_subdbs = db.internal.size();
00237 vector<Xapian::RSet> subrsets;
00238 split_rset_by_db(omrset, number_of_subdbs, subrsets);
00239
00240 for (size_t i = 0; i != number_of_subdbs; ++i) {
00241 Xapian::Database::Internal *subdb = db.internal[i].get();
00242 Assert(subdb);
00243 Xapian::Internal::RefCntPtr<SubMatch> smatch;
00244 try {
00245
00246 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00247 RemoteDatabase *rem_db = subdb->as_remotedatabase();
00248 if (rem_db) {
00249 if (have_sorter) {
00250 throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
00251 }
00252 if (have_mdecider) {
00253 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
00254 }
00255 rem_db->set_query(query, qlen, collapse_max, collapse_key,
00256 order, sort_key, sort_by, sort_value_forward,
00257 percent_cutoff, weight_cutoff, weight,
00258 subrsets[i], matchspies);
00259 bool decreasing_relevance =
00260 (sort_by == REL || sort_by == REL_VAL);
00261 smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
00262 is_remote[i] = true;
00263 } else {
00264 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00265 }
00266 #else
00267
00268 (void)have_sorter;
00269 (void)have_mdecider;
00270 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00271 #endif
00272 } catch (Xapian::Error & e) {
00273 if (!errorhandler) throw;
00274 LOGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
00275 (*errorhandler)(e);
00276
00277 smatch = NULL;
00278 }
00279 leaves.push_back(smatch);
00280 }
00281
00282 stats.mark_wanted_terms(*query);
00283 prepare_sub_matches(leaves, errorhandler, stats);
00284 stats.set_bounds_from_db(db);
00285 }
00286
00287 Xapian::weight
00288 MultiMatch::getorrecalc_maxweight(PostList *pl)
00289 {
00290 LOGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
00291 Xapian::weight wt;
00292 if (recalculate_w_max) {
00293 LOGLINE(MATCH, "recalculating max weight");
00294 wt = pl->recalc_maxweight();
00295 recalculate_w_max = false;
00296 } else {
00297 wt = pl->get_maxweight();
00298 LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00299 AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
00300 }
00301 LOGLINE(MATCH, "max possible doc weight = " << wt);
00302 RETURN(wt);
00303 }
00304
00305 void
00306 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00307 Xapian::doccount check_at_least,
00308 Xapian::MSet & mset,
00309 const Xapian::Weight::Internal & stats,
00310 const Xapian::MatchDecider *mdecider,
00311 const Xapian::MatchDecider *matchspy_legacy,
00312 const Xapian::KeyMaker *sorter)
00313 {
00314 LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("matchspy_legacy") | Literal("sorter"));
00315 AssertRel(check_at_least,>=,maxitems);
00316
00317 if (!query) {
00318 mset = Xapian::MSet(new Xapian::MSet::Internal());
00319 mset.internal->firstitem = first;
00320 return;
00321 }
00322
00323 Assert(!leaves.empty());
00324
00325 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00326
00327
00328 if (leaves.size() == 1 && is_remote[0]) {
00329 RemoteSubMatch * rem_match;
00330 rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
00331 rem_match->start_match(first, maxitems, check_at_least, stats);
00332 rem_match->get_mset(mset);
00333 return;
00334 }
00335 #endif
00336
00337
00338 {
00339 vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
00340 for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
00341 if (!(*leaf).get()) continue;
00342 try {
00343 (*leaf)->start_match(0, first + maxitems,
00344 first + check_at_least, stats);
00345 } catch (Xapian::Error & e) {
00346 if (!errorhandler) throw;
00347 LOGLINE(EXCEPTION, "Calling error handler for "
00348 "start_match() on a SubMatch.");
00349 (*errorhandler)(e);
00350
00351 *leaf = NULL;
00352 }
00353 }
00354 }
00355
00356
00357 vector<PostList *> postlists;
00358 map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
00359 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
00360 termfreqandwts_ptr = &termfreqandwts;
00361
00362 Xapian::termcount total_subqs = 0;
00363
00364
00365
00366
00367 Xapian::doccount definite_matches_not_seen = 0;
00368 for (size_t i = 0; i != leaves.size(); ++i) {
00369 PostList *pl;
00370 try {
00371 pl = leaves[i]->get_postlist_and_term_info(this,
00372 termfreqandwts_ptr,
00373 &total_subqs);
00374 if (termfreqandwts_ptr && !termfreqandwts.empty())
00375 termfreqandwts_ptr = NULL;
00376 if (is_remote[i]) {
00377 if (pl->get_termfreq_min() > first + maxitems) {
00378 LOGLINE(MATCH, "Found " <<
00379 pl->get_termfreq_min() - (first + maxitems)
00380 << " definite matches in remote submatch "
00381 "which aren't passed to local match");
00382 definite_matches_not_seen += pl->get_termfreq_min();
00383 definite_matches_not_seen -= first + maxitems;
00384 }
00385 }
00386 } catch (Xapian::Error & e) {
00387 if (!errorhandler) throw;
00388 LOGLINE(EXCEPTION, "Calling error handler for "
00389 "get_term_info() on a SubMatch.");
00390 (*errorhandler)(e);
00391
00392
00393 leaves[i] = NULL;
00394 pl = new EmptyPostList;
00395 }
00396 postlists.push_back(pl);
00397 }
00398 Assert(!postlists.empty());
00399
00400 ValueStreamDocument vsdoc(db);
00401 ++vsdoc.ref_count;
00402 Xapian::Document doc(&vsdoc);
00403
00404
00405 AutoPtr<PostList> pl;
00406 if (postlists.size() == 1) {
00407 pl.reset(postlists.front());
00408 } else {
00409 pl.reset(new MergePostList(postlists, this, vsdoc, errorhandler));
00410 }
00411
00412 LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00413
00414 #ifdef XAPIAN_DEBUG_LOG
00415 {
00416 map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator tfwi;
00417 for (tfwi = termfreqandwts.begin(); tfwi != termfreqandwts.end(); ++tfwi) {
00418 LOGLINE(MATCH, "termfreqandwts[" << tfwi->first << "] = " << tfwi->second.termfreq << ", " << tfwi->second.termweight);
00419 }
00420 }
00421 #endif
00422
00423
00424 Xapian::doccount docs_matched = 0;
00425 Xapian::weight greatest_wt = 0;
00426 Xapian::termcount greatest_wt_subqs_matched = 0;
00427 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00428 unsigned greatest_wt_subqs_db_num = UINT_MAX;
00429 #endif
00430 vector<Xapian::Internal::MSetItem> items;
00431
00432
00433 const Xapian::weight max_possible = pl->recalc_maxweight();
00434
00435 LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00436 recalculate_w_max = false;
00437
00438 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
00439 Xapian::doccount matches_lower_bound = 0;
00440 Xapian::doccount matches_estimated = pl->get_termfreq_est();
00441
00442 if (mdecider == NULL && matchspy_legacy == NULL) {
00443
00444
00445
00446 matches_lower_bound = pl->get_termfreq_min();
00447 }
00448
00449
00450 Xapian::MatchSpy *matchspy = NULL;
00451 MultipleMatchSpy multispy(matchspies);
00452 if (!matchspies.empty()) {
00453 if (matchspies.size() == 1) {
00454 matchspy = matchspies[0];
00455 } else {
00456 matchspy = &multispy;
00457 }
00458 }
00459
00460
00461
00462 if (check_at_least == 0) {
00463 pl.reset(NULL);
00464 Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
00465 if (collapse_max) {
00466
00467
00468
00469 if (matches_lower_bound > collapse_max)
00470 matches_lower_bound = collapse_max;
00471 }
00472
00473 mset = Xapian::MSet(new Xapian::MSet::Internal(
00474 first,
00475 matches_upper_bound,
00476 matches_lower_bound,
00477 matches_estimated,
00478 matches_upper_bound,
00479 uncollapsed_lower_bound,
00480 matches_estimated,
00481 max_possible, greatest_wt, items,
00482 termfreqandwts,
00483 0));
00484 return;
00485 }
00486
00487
00488 Xapian::doccount decider_considered = 0;
00489
00490 Xapian::doccount decider_denied = 0;
00491
00492
00493
00494 Xapian::doccount max_msize = first + maxitems;
00495 items.reserve(max_msize + 1);
00496
00497
00498
00499 Xapian::Internal::MSetItem min_item(0.0, 0);
00500
00501
00502 Xapian::weight min_weight = weight_cutoff;
00503
00504
00505 Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
00506
00507
00508 percent_cutoff_factor -= DBL_EPSILON;
00509
00510
00511 Collapser collapser(collapse_key, collapse_max);
00512
00514 bool sort_forward = (order != Xapian::Enquire::DESCENDING);
00515 MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530 bool is_heap = false;
00531
00532 while (true) {
00533 bool pushback;
00534
00535 if (rare(recalculate_w_max)) {
00536 if (min_weight > 0.0) {
00537 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
00538 LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
00539 break;
00540 }
00541 }
00542 }
00543
00544 PostList * pl_copy = pl.get();
00545 if (rare(next_handling_prune(pl_copy, min_weight, this))) {
00546 (void)pl.release();
00547 pl.reset(pl_copy);
00548 LOGLINE(MATCH, "*** REPLACING ROOT");
00549
00550 if (min_weight > 0.0) {
00551
00552
00553
00554 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
00555 LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
00556 break;
00557 }
00558 }
00559 }
00560
00561 if (rare(pl->at_end())) {
00562 LOGLINE(MATCH, "Reached end of potential matches");
00563 break;
00564 }
00565
00566
00567
00568
00569 Xapian::weight wt = 0.0;
00570 bool calculated_weight = false;
00571 if (sort_by != VAL || min_weight > 0.0) {
00572 wt = pl->get_weight();
00573 if (wt < min_weight) {
00574 LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
00575 continue;
00576 }
00577 calculated_weight = true;
00578 }
00579
00580 Xapian::docid did = pl->get_docid();
00581 vsdoc.set_document(did);
00582 LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
00583 Xapian::Internal::MSetItem new_item(wt, did);
00584 if (sort_by != REL) {
00585 if (sorter) {
00586 new_item.sort_key = (*sorter)(doc);
00587 } else {
00588 new_item.sort_key = vsdoc.get_value(sort_key);
00589 }
00590
00591
00592
00593
00594
00595 if (!mcmp(new_item, min_item)) {
00596 if (mdecider == NULL && !collapser && matchspy_legacy == NULL) {
00597
00598
00599 LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
00600 ++docs_matched;
00601 if (!calculated_weight) wt = pl->get_weight();
00602 if (matchspy) {
00603 matchspy->operator()(doc, wt);
00604 }
00605 if (wt > greatest_wt) goto new_greatest_weight;
00606 continue;
00607 }
00608 if (docs_matched >= check_at_least) {
00609
00610 LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
00611
00612 if (!calculated_weight) wt = pl->get_weight();
00613 if (wt > greatest_wt) goto new_greatest_weight;
00614 continue;
00615 }
00616
00617
00618
00619 LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
00620 }
00621 }
00622
00623
00624 if (matchspy != NULL || mdecider != NULL || matchspy_legacy != NULL) {
00625 const unsigned int multiplier = db.internal.size();
00626 Assert(multiplier != 0);
00627 Xapian::doccount n = (did - 1) % multiplier;
00628
00629
00630 if (!is_remote[n]) {
00631 ++decider_considered;
00632 if (matchspy_legacy && !matchspy_legacy->operator()(doc)) {
00633 ++decider_denied;
00634 continue;
00635 }
00636 if (mdecider && !mdecider->operator()(doc)) {
00637 ++decider_denied;
00638 continue;
00639 }
00640 if (matchspy) {
00641 if (!calculated_weight) {
00642 wt = pl->get_weight();
00643 new_item.wt = wt;
00644 calculated_weight = true;
00645 }
00646 matchspy->operator()(doc, wt);
00647 }
00648 }
00649 }
00650
00651 if (!calculated_weight) {
00652
00653 wt = pl->get_weight();
00654 new_item.wt = wt;
00655 }
00656
00657 pushback = true;
00658
00659
00660 if (collapser) {
00661 collapse_result res;
00662 res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
00663 if (res == REJECTED) {
00664
00665
00666 if (sort_by != REL && sort_by != REL_VAL) {
00667 if (wt > greatest_wt) goto new_greatest_weight;
00668 }
00669 continue;
00670 }
00671
00672 if (res == REPLACED) {
00673
00674
00675 Assert(!items.empty());
00676
00677 const Xapian::Internal::MSetItem & old_item =
00678 collapser.old_item;
00679
00680
00681
00682
00683
00684 Xapian::weight old_wt = old_item.wt;
00685 if (old_wt >= min_weight && mcmp(old_item, min_item)) {
00686
00687
00688 Xapian::docid olddid = old_item.did;
00689 vector<Xapian::Internal::MSetItem>::iterator i;
00690 for (i = items.begin(); i != items.end(); ++i) {
00691 if (i->did == olddid) {
00692 LOGLINE(MATCH, "collapse: removing " <<
00693 olddid << ": " <<
00694 new_item.collapse_key);
00695
00696
00697
00698
00699
00700 *i = new_item;
00701 pushback = false;
00702 is_heap = false;
00703 break;
00704 }
00705 }
00706 }
00707 }
00708 }
00709
00710
00711 if (pushback) {
00712 ++docs_matched;
00713 if (items.size() >= max_msize) {
00714 items.push_back(new_item);
00715 if (!is_heap) {
00716 is_heap = true;
00717 make_heap(items.begin(), items.end(), mcmp);
00718 } else {
00719 push_heap<vector<Xapian::Internal::MSetItem>::iterator,
00720 MSetCmp>(items.begin(), items.end(), mcmp);
00721 }
00722 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00723 MSetCmp>(items.begin(), items.end(), mcmp);
00724 items.pop_back();
00725
00726 min_item = items.front();
00727 if (sort_by == REL || sort_by == REL_VAL) {
00728 if (docs_matched >= check_at_least) {
00729 if (sort_by == REL) {
00730
00731
00732
00733 if (rare(max_possible == 0 && sort_forward)) {
00734
00735
00736
00737
00738
00739 if (leaves.size() == 1) break;
00740 }
00741 }
00742 if (min_item.wt > min_weight) {
00743 LOGLINE(MATCH, "Setting min_weight to " <<
00744 min_item.wt << " from " << min_weight);
00745 min_weight = min_item.wt;
00746 }
00747 }
00748 }
00749 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
00750 LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
00751 break;
00752 }
00753 } else {
00754 items.push_back(new_item);
00755 is_heap = false;
00756 if (sort_by == REL && items.size() == max_msize) {
00757 if (docs_matched >= check_at_least) {
00758
00759
00760
00761 if (rare(max_possible == 0 && sort_forward)) {
00762
00763
00764
00765
00766
00767 if (leaves.size() == 1) break;
00768 }
00769 }
00770 }
00771 }
00772 }
00773
00774
00775 if (wt > greatest_wt) {
00776 new_greatest_weight:
00777 greatest_wt = wt;
00778 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00779 const unsigned int multiplier = db.internal.size();
00780 unsigned int db_num = (did - 1) % multiplier;
00781 if (is_remote[db_num]) {
00782
00783
00784 greatest_wt_subqs_db_num = db_num;
00785 } else
00786 #endif
00787 {
00788 greatest_wt_subqs_matched = pl->count_matching_subqs();
00789 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00790 greatest_wt_subqs_db_num = UINT_MAX;
00791 #endif
00792 }
00793 if (percent_cutoff) {
00794 Xapian::weight w = wt * percent_cutoff_factor;
00795 if (w > min_weight) {
00796 min_weight = w;
00797 if (!is_heap) {
00798 is_heap = true;
00799 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00800 MSetCmp>(items.begin(), items.end(), mcmp);
00801 }
00802 while (!items.empty() && items.front().wt < min_weight) {
00803 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00804 MSetCmp>(items.begin(), items.end(), mcmp);
00805 Assert(items.back().wt < min_weight);
00806 items.pop_back();
00807 }
00808 #ifdef XAPIAN_ASSERTIONS_PARANOID
00809 vector<Xapian::Internal::MSetItem>::const_iterator i;
00810 for (i = items.begin(); i != items.end(); ++i) {
00811 Assert(i->wt >= min_weight);
00812 }
00813 #endif
00814 }
00815 }
00816 }
00817 }
00818
00819
00820 pl.reset(NULL);
00821
00822 double percent_scale = 0;
00823 if (!items.empty() && greatest_wt > 0) {
00824
00825
00826 vector<Xapian::Internal::MSetItem>::const_iterator best;
00827 best = min_element(items.begin(), items.end(), mcmp);
00828
00829 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00830 if (greatest_wt_subqs_db_num != UINT_MAX) {
00831 const unsigned int n = greatest_wt_subqs_db_num;
00832 RemoteSubMatch * rem_match;
00833 rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
00834 percent_scale = rem_match->get_percent_factor() / 100.0;
00835 } else
00836 #endif
00837 {
00838 percent_scale = greatest_wt_subqs_matched / double(total_subqs);
00839 percent_scale /= greatest_wt;
00840 }
00841 Assert(percent_scale > 0);
00842 if (percent_cutoff) {
00843
00844
00845
00846
00847 Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
00848 if (!is_heap) {
00849 is_heap = true;
00850 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00851 MSetCmp>(items.begin(), items.end(), mcmp);
00852 }
00853 while (!items.empty() && items.front().wt < min_wt) {
00854 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00855 MSetCmp>(items.begin(), items.end(), mcmp);
00856 Assert(items.back().wt < min_wt);
00857 items.pop_back();
00858 }
00859 #ifdef XAPIAN_ASSERTIONS_PARANOID
00860 vector<Xapian::Internal::MSetItem>::const_iterator j;
00861 for (j = items.begin(); j != items.end(); ++j) {
00862 Assert(j->wt >= min_wt);
00863 }
00864 #endif
00865 }
00866 percent_scale *= 100.0;
00867 }
00868
00869 LOGLINE(MATCH,
00870 "docs_matched = " << docs_matched <<
00871 ", definite_matches_not_seen = " << definite_matches_not_seen <<
00872 ", matches_lower_bound = " << matches_lower_bound <<
00873 ", matches_estimated = " << matches_estimated <<
00874 ", matches_upper_bound = " << matches_upper_bound);
00875
00876
00877
00878 docs_matched += definite_matches_not_seen;
00879
00880 Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
00881 Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
00882 Xapian::doccount uncollapsed_estimated = matches_estimated;
00883 if (items.size() < max_msize) {
00884
00885
00886 LOGLINE(MATCH, "items.size() = " << items.size() <<
00887 ", max_msize = " << max_msize << ", setting bounds equal");
00888 Assert(definite_matches_not_seen == 0);
00889 Assert(percent_cutoff || docs_matched == items.size());
00890 matches_lower_bound = matches_upper_bound = matches_estimated
00891 = items.size();
00892 if (collapser && matches_lower_bound > uncollapsed_lower_bound)
00893 uncollapsed_lower_bound = matches_lower_bound;
00894 } else if (!collapser && docs_matched < check_at_least) {
00895
00896
00897 LOGLINE(MATCH, "Setting bounds equal");
00898 matches_lower_bound = matches_upper_bound = matches_estimated
00899 = docs_matched;
00900 if (collapser && matches_lower_bound > uncollapsed_lower_bound)
00901 uncollapsed_lower_bound = matches_lower_bound;
00902 } else {
00903 AssertRel(matches_estimated,>=,matches_lower_bound);
00904 AssertRel(matches_estimated,<=,matches_upper_bound);
00905
00906
00907
00908
00909 double estimate_scale = 1.0;
00910 double unique_rate = 1.0;
00911
00912 if (collapser) {
00913 LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
00914 matches_lower_bound = collapser.get_matches_lower_bound();
00915 if (matches_lower_bound > uncollapsed_lower_bound)
00916 uncollapsed_lower_bound = matches_lower_bound;
00917
00918
00919
00920
00921 Xapian::doccount docs_considered = collapser.get_docs_considered();
00922 Xapian::doccount dups_ignored = collapser.get_dups_ignored();
00923 if (docs_considered > 0) {
00924 double unique = double(docs_considered - dups_ignored);
00925 unique_rate = unique / double(docs_considered);
00926 }
00927
00928
00929
00930 matches_upper_bound -= dups_ignored;
00931
00932 LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
00933 ", matches_estimated=" << matches_estimated <<
00934 "*" << estimate_scale << "*" << unique_rate <<
00935 ", matches_upper_bound=" << matches_upper_bound);
00936 }
00937
00938 if (mdecider || matchspy_legacy) {
00939 if (!percent_cutoff) {
00940 if (!collapser) {
00941
00942
00943
00944 matches_lower_bound = max(docs_matched, matches_lower_bound);
00945 }
00946 }
00947
00948
00949
00950
00951 if (decider_considered > 0) {
00952 double accept = double(decider_considered - decider_denied);
00953 double accept_rate = accept / double(decider_considered);
00954 estimate_scale *= accept_rate;
00955 }
00956
00957
00958
00959
00960
00961 matches_upper_bound -= decider_denied;
00962 if (collapser) uncollapsed_upper_bound -= decider_denied;
00963 }
00964
00965 if (percent_cutoff) {
00966 estimate_scale *= (1.0 - percent_cutoff_factor);
00967
00968
00969
00970
00971
00972
00973
00974
00975
00976 matches_lower_bound = items.size();
00977 if (collapser) uncollapsed_lower_bound = matches_lower_bound;
00978
00979
00980
00981
00982 LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
00983 percent_cutoff << "): now have matches_estimated=" <<
00984 matches_estimated << ", matches_lower_bound=" <<
00985 matches_lower_bound);
00986 }
00987
00988 if (collapser && estimate_scale != 1.0) {
00989 uncollapsed_estimated =
00990 Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
00991 }
00992
00993 estimate_scale *= unique_rate;
00994
00995 if (estimate_scale != 1.0) {
00996 matches_estimated =
00997 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
00998 if (matches_estimated < matches_lower_bound)
00999 matches_estimated = matches_lower_bound;
01000 }
01001
01002 if (collapser || mdecider || matchspy_legacy) {
01003 LOGLINE(MATCH, "Clamping estimate between bounds: "
01004 "matches_lower_bound = " << matches_lower_bound <<
01005 ", matches_estimated = " << matches_estimated <<
01006 ", matches_upper_bound = " << matches_upper_bound);
01007 AssertRel(matches_lower_bound,<=,matches_upper_bound);
01008 matches_estimated = max(matches_estimated, matches_lower_bound);
01009 matches_estimated = min(matches_estimated, matches_upper_bound);
01010 } else if (!percent_cutoff) {
01011 AssertRel(docs_matched,<=,matches_upper_bound);
01012 if (docs_matched > matches_lower_bound)
01013 matches_lower_bound = docs_matched;
01014 if (docs_matched > matches_estimated)
01015 matches_estimated = docs_matched;
01016 }
01017
01018 if (collapser && !mdecider && !percent_cutoff && !matchspy_legacy) {
01019 AssertRel(docs_matched,<=,uncollapsed_upper_bound);
01020 if (docs_matched > uncollapsed_lower_bound)
01021 uncollapsed_lower_bound = docs_matched;
01022 }
01023 }
01024
01025 if (collapser) {
01026 AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
01027 if (uncollapsed_estimated < uncollapsed_lower_bound) {
01028 uncollapsed_estimated = uncollapsed_lower_bound;
01029 } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
01030 uncollapsed_estimated = uncollapsed_upper_bound;
01031 }
01032 } else {
01033
01034 uncollapsed_lower_bound = matches_lower_bound;
01035 uncollapsed_upper_bound = matches_upper_bound;
01036 uncollapsed_estimated = matches_estimated;
01037 }
01038
01039 LOGLINE(MATCH, items.size() << " items in potential mset");
01040
01041 if (first > 0) {
01042
01043 if (items.size() <= first) {
01044 items.clear();
01045 } else {
01046 LOGLINE(MATCH, "finding " << first << "th");
01047
01048
01049
01050
01051 vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
01052 nth = items.rbegin() + first;
01053 nth_element(items.rbegin(), nth, items.rend(), mcmp);
01054
01055 items.erase(items.begin() + items.size() - first, items.end());
01056 }
01057 }
01058
01059 LOGLINE(MATCH, "sorting " << items.size() << " entries");
01060
01061
01062 sort(items.begin(), items.end(), mcmp);
01063
01064 if (!items.empty()) {
01065 LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
01066 LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
01067 }
01068
01069 AssertRel(matches_estimated,>=,matches_lower_bound);
01070 AssertRel(matches_estimated,<=,matches_upper_bound);
01071
01072 AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
01073 AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
01074
01075
01076
01077
01078
01079
01080
01081
01082
01083 if (!items.empty() && collapser && !collapser.empty()) {
01084
01085
01086 Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
01087 Xapian::doccount entries = collapser.entries();
01088 vector<Xapian::Internal::MSetItem>::iterator i;
01089 for (i = items.begin(); i != items.end(); ++i) {
01090
01091 if (i->collapse_key.empty()) continue;
01092
01093
01094 i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
01095 if (--entries == 0) {
01096
01097 break;
01098 }
01099 }
01100 }
01101
01102 mset = Xapian::MSet(new Xapian::MSet::Internal(
01103 first,
01104 matches_upper_bound,
01105 matches_lower_bound,
01106 matches_estimated,
01107 uncollapsed_upper_bound,
01108 uncollapsed_lower_bound,
01109 uncollapsed_estimated,
01110 max_possible, greatest_wt, items,
01111 termfreqandwts,
01112 percent_scale));
01113 }