00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <config.h>
00027
00028 #include "multimatch.h"
00029
00030 #include "autoptr.h"
00031 #include "collapser.h"
00032 #include "debuglog.h"
00033 #include "submatch.h"
00034 #include "localsubmatch.h"
00035 #include "omassert.h"
00036 #include "omenquireinternal.h"
00037
00038 #include "emptypostlist.h"
00039 #include "branchpostlist.h"
00040 #include "mergepostlist.h"
00041
00042 #include "document.h"
00043 #include "omqueryinternal.h"
00044
00045 #include "msetcmp.h"
00046
00047 #include "valuestreamdocument.h"
00048 #include "weightinternal.h"
00049
00050 #include <xapian/errorhandler.h>
00051 #include <xapian/matchspy.h>
00052 #include <xapian/version.h>
00053
00054 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00055 #include "remotesubmatch.h"
00056 #include "remote-database.h"
00057 #endif
00058
00059 #include <algorithm>
00060 #include <cfloat>
00061 #include <climits>
00062 #include <vector>
00063 #include <map>
00064 #include <set>
00065
00066 using namespace std;
00067
00068 const Xapian::Enquire::Internal::sort_setting REL =
00069 Xapian::Enquire::Internal::REL;
00070 const Xapian::Enquire::Internal::sort_setting REL_VAL =
00071 Xapian::Enquire::Internal::REL_VAL;
00072 const Xapian::Enquire::Internal::sort_setting VAL =
00073 Xapian::Enquire::Internal::VAL;
00074 #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
00075 const Xapian::Enquire::Internal::sort_setting VAL_REL =
00076 Xapian::Enquire::Internal::VAL_REL;
00077 #endif
00078
00086 static void
00087 split_rset_by_db(const Xapian::RSet * rset,
00088 Xapian::doccount number_of_subdbs,
00089 vector<Xapian::RSet> & subrsets)
00090 {
00091 LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
00092 if (rset) {
00093 if (number_of_subdbs == 1) {
00094
00095 subrsets.push_back(*rset);
00096 } else {
00097
00098
00099 subrsets.reserve(number_of_subdbs);
00100 for (size_t i = 0; i < number_of_subdbs; ++i) {
00101 subrsets.push_back(Xapian::RSet());
00102 }
00103
00104 const set<Xapian::docid> & rsetitems = rset->internal->get_items();
00105 set<Xapian::docid>::const_iterator j;
00106 for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
00107 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
00108 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
00109 subrsets[subdatabase].add_document(local_docid);
00110 }
00111 }
00112 } else {
00113
00114 subrsets.resize(number_of_subdbs);
00115 }
00116 Assert(subrsets.size() == number_of_subdbs);
00117 }
00118
00137 static void
00138 prepare_sub_matches(vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
00139 Xapian::ErrorHandler * errorhandler,
00140 Xapian::Weight::Internal & stats)
00141 {
00142 LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | errorhandler | stats);
00143
00144 vector<bool> prepared;
00145 prepared.resize(leaves.size(), false);
00146 size_t unprepared = leaves.size();
00147 bool nowait = true;
00148 while (unprepared) {
00149 for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
00150 if (prepared[leaf]) continue;
00151 try {
00152 SubMatch * submatch = leaves[leaf].get();
00153 if (!submatch || submatch->prepare_match(nowait, stats)) {
00154 prepared[leaf] = true;
00155 --unprepared;
00156 }
00157 } catch (Xapian::Error & e) {
00158 if (!errorhandler) throw;
00159
00160 LOGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
00161 (*errorhandler)(e);
00162
00163 leaves[leaf] = NULL;
00164 prepared[leaf] = true;
00165 --unprepared;
00166 }
00167 }
00168
00169
00170 nowait = false;
00171 }
00172 }
00173
00175 class MultipleMatchSpy : public Xapian::MatchSpy {
00176 private:
00178 const std::vector<Xapian::MatchSpy *> & spies;
00179
00180 public:
00181 MultipleMatchSpy(const std::vector<Xapian::MatchSpy *> & spies_)
00182 : spies(spies_) {}
00183
00188 void operator()(const Xapian::Document &doc, Xapian::weight wt);
00189 };
00190
00191 void
00192 MultipleMatchSpy::operator()(const Xapian::Document &doc, Xapian::weight wt) {
00193 LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
00194 vector<Xapian::MatchSpy *>::const_iterator i;
00195 for (i = spies.begin(); i != spies.end(); ++i) {
00196 (**i)(doc, wt);
00197 }
00198 }
00199
00201
00203 MultiMatch::MultiMatch(const Xapian::Database &db_,
00204 const Xapian::Query::Internal * query_,
00205 Xapian::termcount qlen,
00206 const Xapian::RSet * omrset,
00207 Xapian::doccount collapse_max_,
00208 Xapian::valueno collapse_key_,
00209 int percent_cutoff_, Xapian::weight weight_cutoff_,
00210 Xapian::Enquire::docid_order order_,
00211 Xapian::valueno sort_key_,
00212 Xapian::Enquire::Internal::sort_setting sort_by_,
00213 bool sort_value_forward_,
00214 Xapian::ErrorHandler * errorhandler_,
00215 Xapian::Weight::Internal & stats,
00216 const Xapian::Weight * weight_,
00217 const vector<Xapian::MatchSpy *> & matchspies_,
00218 bool have_sorter, bool have_mdecider)
00219 : db(db_), query(query_),
00220 collapse_max(collapse_max_), collapse_key(collapse_key_),
00221 percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
00222 order(order_),
00223 sort_key(sort_key_), sort_by(sort_by_),
00224 sort_value_forward(sort_value_forward_),
00225 errorhandler(errorhandler_), weight(weight_),
00226 is_remote(db.internal.size()),
00227 matchspies(matchspies_)
00228 {
00229 LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | errorhandler_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);
00230
00231 if (!query) return;
00232 query->validate_query();
00233
00234 Xapian::doccount number_of_subdbs = db.internal.size();
00235 vector<Xapian::RSet> subrsets;
00236 split_rset_by_db(omrset, number_of_subdbs, subrsets);
00237
00238 for (size_t i = 0; i != number_of_subdbs; ++i) {
00239 Xapian::Database::Internal *subdb = db.internal[i].get();
00240 Assert(subdb);
00241 Xapian::Internal::RefCntPtr<SubMatch> smatch;
00242 try {
00243
00244 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00245 RemoteDatabase *rem_db = subdb->as_remotedatabase();
00246 if (rem_db) {
00247 if (have_sorter) {
00248 throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
00249 }
00250 if (have_mdecider) {
00251 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
00252 }
00253 rem_db->set_query(query, qlen, collapse_max, collapse_key,
00254 order, sort_key, sort_by, sort_value_forward,
00255 percent_cutoff, weight_cutoff, weight,
00256 subrsets[i], matchspies);
00257 bool decreasing_relevance =
00258 (sort_by == REL || sort_by == REL_VAL);
00259 smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
00260 is_remote[i] = true;
00261 } else {
00262 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00263 }
00264 #else
00265
00266 (void)have_sorter;
00267 (void)have_mdecider;
00268 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00269 #endif
00270 } catch (Xapian::Error & e) {
00271 if (!errorhandler) throw;
00272 LOGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
00273 (*errorhandler)(e);
00274
00275 smatch = NULL;
00276 }
00277 leaves.push_back(smatch);
00278 }
00279
00280 stats.mark_wanted_terms(*query);
00281 prepare_sub_matches(leaves, errorhandler, stats);
00282 stats.set_bounds_from_db(db);
00283 }
00284
00285 Xapian::weight
00286 MultiMatch::getorrecalc_maxweight(PostList *pl)
00287 {
00288 LOGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
00289 Xapian::weight wt;
00290 if (recalculate_w_max) {
00291 LOGLINE(MATCH, "recalculating max weight");
00292 wt = pl->recalc_maxweight();
00293 recalculate_w_max = false;
00294 } else {
00295 wt = pl->get_maxweight();
00296 LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00297 AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
00298 }
00299 LOGLINE(MATCH, "max possible doc weight = " << wt);
00300 RETURN(wt);
00301 }
00302
00303 void
00304 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00305 Xapian::doccount check_at_least,
00306 Xapian::MSet & mset,
00307 const Xapian::Weight::Internal & stats,
00308 const Xapian::MatchDecider *mdecider,
00309 const Xapian::MatchDecider *matchspy_legacy,
00310 const Xapian::KeyMaker *sorter)
00311 {
00312 LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("matchspy_legacy") | Literal("sorter"));
00313 AssertRel(check_at_least,>=,maxitems);
00314
00315 if (!query) {
00316 mset = Xapian::MSet(new Xapian::MSet::Internal());
00317 mset.internal->firstitem = first;
00318 return;
00319 }
00320
00321 Assert(!leaves.empty());
00322
00323 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00324
00325
00326 if (leaves.size() == 1 && is_remote[0]) {
00327 RemoteSubMatch * rem_match;
00328 rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
00329 rem_match->start_match(first, maxitems, check_at_least, stats);
00330 rem_match->get_mset(mset);
00331 return;
00332 }
00333 #endif
00334
00335
00336 {
00337 vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
00338 for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
00339 if (!(*leaf).get()) continue;
00340 try {
00341 (*leaf)->start_match(0, first + maxitems,
00342 first + check_at_least, stats);
00343 } catch (Xapian::Error & e) {
00344 if (!errorhandler) throw;
00345 LOGLINE(EXCEPTION, "Calling error handler for "
00346 "start_match() on a SubMatch.");
00347 (*errorhandler)(e);
00348
00349 *leaf = NULL;
00350 }
00351 }
00352 }
00353
00354
00355 vector<PostList *> postlists;
00356 map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
00357 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
00358 termfreqandwts_ptr = &termfreqandwts;
00359
00360 Xapian::termcount total_subqs = 0;
00361
00362
00363
00364
00365 Xapian::doccount definite_matches_not_seen = 0;
00366 for (size_t i = 0; i != leaves.size(); ++i) {
00367 PostList *pl;
00368 try {
00369 pl = leaves[i]->get_postlist_and_term_info(this,
00370 termfreqandwts_ptr,
00371 &total_subqs);
00372 if (termfreqandwts_ptr && !termfreqandwts.empty())
00373 termfreqandwts_ptr = NULL;
00374 if (is_remote[i]) {
00375 if (pl->get_termfreq_min() > first + maxitems) {
00376 LOGLINE(MATCH, "Found " <<
00377 pl->get_termfreq_min() - (first + maxitems)
00378 << " definite matches in remote submatch "
00379 "which aren't passed to local match");
00380 definite_matches_not_seen += pl->get_termfreq_min();
00381 definite_matches_not_seen -= first + maxitems;
00382 }
00383 }
00384 } catch (Xapian::Error & e) {
00385 if (!errorhandler) throw;
00386 LOGLINE(EXCEPTION, "Calling error handler for "
00387 "get_term_info() on a SubMatch.");
00388 (*errorhandler)(e);
00389
00390
00391 leaves[i] = NULL;
00392 pl = new EmptyPostList;
00393 }
00394 postlists.push_back(pl);
00395 }
00396 Assert(!postlists.empty());
00397
00398 ValueStreamDocument vsdoc(db);
00399 ++vsdoc.ref_count;
00400 Xapian::Document doc(&vsdoc);
00401
00402
00403 AutoPtr<PostList> pl;
00404 if (postlists.size() == 1) {
00405 pl.reset(postlists.front());
00406 } else {
00407 pl.reset(new MergePostList(postlists, this, vsdoc, errorhandler));
00408 }
00409
00410 LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00411
00412 #ifdef XAPIAN_DEBUG_LOG
00413 {
00414 map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator tfwi;
00415 for (tfwi = termfreqandwts.begin(); tfwi != termfreqandwts.end(); ++tfwi) {
00416 LOGLINE(MATCH, "termfreqandwts[" << tfwi->first << "] = " << tfwi->second.termfreq << ", " << tfwi->second.termweight);
00417 }
00418 }
00419 #endif
00420
00421
00422 Xapian::doccount docs_matched = 0;
00423 Xapian::weight greatest_wt = 0;
00424 Xapian::termcount greatest_wt_subqs_matched = 0;
00425 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00426 unsigned greatest_wt_subqs_db_num = UINT_MAX;
00427 #endif
00428 vector<Xapian::Internal::MSetItem> items;
00429
00430
00431 const Xapian::weight max_possible = pl->recalc_maxweight();
00432
00433 LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00434 recalculate_w_max = false;
00435
00436 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
00437 Xapian::doccount matches_lower_bound = 0;
00438 Xapian::doccount matches_estimated = pl->get_termfreq_est();
00439
00440 if (mdecider == NULL && matchspy_legacy == NULL) {
00441
00442
00443
00444 matches_lower_bound = pl->get_termfreq_min();
00445 }
00446
00447
00448 Xapian::MatchSpy *matchspy = NULL;
00449 MultipleMatchSpy multispy(matchspies);
00450 if (!matchspies.empty()) {
00451 if (matchspies.size() == 1) {
00452 matchspy = matchspies[0];
00453 } else {
00454 matchspy = &multispy;
00455 }
00456 }
00457
00458
00459
00460 if (check_at_least == 0) {
00461 pl.reset(NULL);
00462 Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
00463 if (collapse_max) {
00464
00465
00466
00467 if (matches_lower_bound > collapse_max)
00468 matches_lower_bound = collapse_max;
00469 }
00470
00471 mset = Xapian::MSet(new Xapian::MSet::Internal(
00472 first,
00473 matches_upper_bound,
00474 matches_lower_bound,
00475 matches_estimated,
00476 matches_upper_bound,
00477 uncollapsed_lower_bound,
00478 matches_estimated,
00479 max_possible, greatest_wt, items,
00480 termfreqandwts,
00481 0));
00482 return;
00483 }
00484
00485
00486 Xapian::doccount decider_considered = 0;
00487
00488 Xapian::doccount decider_denied = 0;
00489
00490
00491
00492 Xapian::doccount max_msize = first + maxitems;
00493 items.reserve(max_msize + 1);
00494
00495
00496
00497 Xapian::Internal::MSetItem min_item(0.0, 0);
00498
00499
00500 Xapian::weight min_weight = weight_cutoff;
00501
00502
00503 Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
00504
00505
00506 percent_cutoff_factor -= DBL_EPSILON;
00507
00508
00509 Collapser collapser(collapse_key, collapse_max);
00510
00512 bool sort_forward = (order != Xapian::Enquire::DESCENDING);
00513 MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528 bool is_heap = false;
00529
00530 while (true) {
00531 bool pushback;
00532
00533 if (rare(recalculate_w_max)) {
00534 if (min_weight > 0.0) {
00535 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
00536 LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
00537 break;
00538 }
00539 }
00540 }
00541
00542 PostList * pl_copy = pl.get();
00543 if (rare(next_handling_prune(pl_copy, min_weight, this))) {
00544 (void)pl.release();
00545 pl.reset(pl_copy);
00546 LOGLINE(MATCH, "*** REPLACING ROOT");
00547
00548 if (min_weight > 0.0) {
00549
00550
00551
00552 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
00553 LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
00554 break;
00555 }
00556 }
00557 }
00558
00559 if (rare(pl->at_end())) {
00560 LOGLINE(MATCH, "Reached end of potential matches");
00561 break;
00562 }
00563
00564
00565
00566
00567 Xapian::weight wt = 0.0;
00568 bool calculated_weight = false;
00569 if (sort_by != VAL || min_weight > 0.0) {
00570 wt = pl->get_weight();
00571 if (wt < min_weight) {
00572 LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
00573 continue;
00574 }
00575 calculated_weight = true;
00576 }
00577
00578 Xapian::docid did = pl->get_docid();
00579 vsdoc.set_document(did);
00580 LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
00581 Xapian::Internal::MSetItem new_item(wt, did);
00582 if (sort_by != REL) {
00583 if (sorter) {
00584 new_item.sort_key = (*sorter)(doc);
00585 } else {
00586 new_item.sort_key = vsdoc.get_value(sort_key);
00587 }
00588
00589
00590
00591
00592
00593 if (!mcmp(new_item, min_item)) {
00594 if (mdecider == NULL && !collapser && matchspy_legacy == NULL) {
00595
00596
00597 LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
00598 ++docs_matched;
00599 if (!calculated_weight) wt = pl->get_weight();
00600 if (matchspy) {
00601 matchspy->operator()(doc, wt);
00602 }
00603 if (wt > greatest_wt) goto new_greatest_weight;
00604 continue;
00605 }
00606 if (docs_matched >= check_at_least) {
00607
00608 LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
00609
00610 if (!calculated_weight) wt = pl->get_weight();
00611 if (wt > greatest_wt) goto new_greatest_weight;
00612 continue;
00613 }
00614
00615
00616
00617 LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
00618 }
00619 }
00620
00621
00622 if (matchspy != NULL || mdecider != NULL || matchspy_legacy != NULL) {
00623 const unsigned int multiplier = db.internal.size();
00624 Assert(multiplier != 0);
00625 Xapian::doccount n = (did - 1) % multiplier;
00626
00627
00628 if (!is_remote[n]) {
00629 ++decider_considered;
00630 if (matchspy_legacy && !matchspy_legacy->operator()(doc)) {
00631 ++decider_denied;
00632 continue;
00633 }
00634 if (mdecider && !mdecider->operator()(doc)) {
00635 ++decider_denied;
00636 continue;
00637 }
00638 if (matchspy) {
00639 if (!calculated_weight) {
00640 wt = pl->get_weight();
00641 new_item.wt = wt;
00642 calculated_weight = true;
00643 }
00644 matchspy->operator()(doc, wt);
00645 }
00646 }
00647 }
00648
00649 if (!calculated_weight) {
00650
00651 wt = pl->get_weight();
00652 new_item.wt = wt;
00653 }
00654
00655 pushback = true;
00656
00657
00658 if (collapser) {
00659 collapse_result res;
00660 res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
00661 if (res == REJECTED) {
00662
00663
00664 if (sort_by != REL && sort_by != REL_VAL) {
00665 if (wt > greatest_wt) goto new_greatest_weight;
00666 }
00667 continue;
00668 }
00669
00670 if (res == REPLACED) {
00671
00672
00673 Assert(!items.empty());
00674
00675 const Xapian::Internal::MSetItem & old_item =
00676 collapser.old_item;
00677
00678
00679
00680
00681
00682 Xapian::weight old_wt = old_item.wt;
00683 if (old_wt >= min_weight && mcmp(old_item, min_item)) {
00684
00685
00686 Xapian::docid olddid = old_item.did;
00687 vector<Xapian::Internal::MSetItem>::iterator i;
00688 for (i = items.begin(); i != items.end(); ++i) {
00689 if (i->did == olddid) {
00690 LOGLINE(MATCH, "collapse: removing " <<
00691 olddid << ": " <<
00692 new_item.collapse_key);
00693
00694
00695
00696
00697
00698 *i = new_item;
00699 pushback = false;
00700 is_heap = false;
00701 break;
00702 }
00703 }
00704 }
00705 }
00706 }
00707
00708
00709 if (pushback) {
00710 ++docs_matched;
00711 if (items.size() >= max_msize) {
00712 items.push_back(new_item);
00713 if (!is_heap) {
00714 is_heap = true;
00715 make_heap(items.begin(), items.end(), mcmp);
00716 } else {
00717 push_heap<vector<Xapian::Internal::MSetItem>::iterator,
00718 MSetCmp>(items.begin(), items.end(), mcmp);
00719 }
00720 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00721 MSetCmp>(items.begin(), items.end(), mcmp);
00722 items.pop_back();
00723
00724 min_item = items.front();
00725 if (sort_by == REL || sort_by == REL_VAL) {
00726 if (docs_matched >= check_at_least) {
00727 if (sort_by == REL) {
00728
00729
00730
00731 if (rare(max_possible == 0 && sort_forward)) {
00732
00733
00734
00735
00736
00737 if (leaves.size() == 1) break;
00738 }
00739 }
00740 if (min_item.wt > min_weight) {
00741 LOGLINE(MATCH, "Setting min_weight to " <<
00742 min_item.wt << " from " << min_weight);
00743 min_weight = min_item.wt;
00744 }
00745 }
00746 }
00747 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
00748 LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
00749 break;
00750 }
00751 } else {
00752 items.push_back(new_item);
00753 is_heap = false;
00754 if (sort_by == REL && items.size() == max_msize) {
00755 if (docs_matched >= check_at_least) {
00756
00757
00758
00759 if (rare(max_possible == 0 && sort_forward)) {
00760
00761
00762
00763
00764
00765 if (leaves.size() == 1) break;
00766 }
00767 }
00768 }
00769 }
00770 }
00771
00772
00773 if (wt > greatest_wt) {
00774 new_greatest_weight:
00775 greatest_wt = wt;
00776 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00777 const unsigned int multiplier = db.internal.size();
00778 unsigned int db_num = (did - 1) % multiplier;
00779 if (is_remote[db_num]) {
00780
00781
00782 greatest_wt_subqs_db_num = db_num;
00783 } else
00784 #endif
00785 {
00786 greatest_wt_subqs_matched = pl->count_matching_subqs();
00787 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00788 greatest_wt_subqs_db_num = UINT_MAX;
00789 #endif
00790 }
00791 if (percent_cutoff) {
00792 Xapian::weight w = wt * percent_cutoff_factor;
00793 if (w > min_weight) {
00794 min_weight = w;
00795 if (!is_heap) {
00796 is_heap = true;
00797 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00798 MSetCmp>(items.begin(), items.end(), mcmp);
00799 }
00800 while (!items.empty() && items.front().wt < min_weight) {
00801 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00802 MSetCmp>(items.begin(), items.end(), mcmp);
00803 Assert(items.back().wt < min_weight);
00804 items.pop_back();
00805 }
00806 #ifdef XAPIAN_ASSERTIONS_PARANOID
00807 vector<Xapian::Internal::MSetItem>::const_iterator i;
00808 for (i = items.begin(); i != items.end(); ++i) {
00809 Assert(i->wt >= min_weight);
00810 }
00811 #endif
00812 }
00813 }
00814 }
00815 }
00816
00817
00818 pl.reset(NULL);
00819
00820 double percent_scale = 0;
00821 if (!items.empty() && greatest_wt > 0) {
00822 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00823 if (greatest_wt_subqs_db_num != UINT_MAX) {
00824 const unsigned int n = greatest_wt_subqs_db_num;
00825 RemoteSubMatch * rem_match;
00826 rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
00827 percent_scale = rem_match->get_percent_factor() / 100.0;
00828 } else
00829 #endif
00830 {
00831 percent_scale = greatest_wt_subqs_matched / double(total_subqs);
00832 percent_scale /= greatest_wt;
00833 }
00834 Assert(percent_scale > 0);
00835 if (percent_cutoff) {
00836
00837
00838
00839
00840 Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
00841 if (!is_heap) {
00842 is_heap = true;
00843 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00844 MSetCmp>(items.begin(), items.end(), mcmp);
00845 }
00846 while (!items.empty() && items.front().wt < min_wt) {
00847 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00848 MSetCmp>(items.begin(), items.end(), mcmp);
00849 Assert(items.back().wt < min_wt);
00850 items.pop_back();
00851 }
00852 #ifdef XAPIAN_ASSERTIONS_PARANOID
00853 vector<Xapian::Internal::MSetItem>::const_iterator j;
00854 for (j = items.begin(); j != items.end(); ++j) {
00855 Assert(j->wt >= min_wt);
00856 }
00857 #endif
00858 }
00859 }
00860
00861 LOGLINE(MATCH,
00862 "docs_matched = " << docs_matched <<
00863 ", definite_matches_not_seen = " << definite_matches_not_seen <<
00864 ", matches_lower_bound = " << matches_lower_bound <<
00865 ", matches_estimated = " << matches_estimated <<
00866 ", matches_upper_bound = " << matches_upper_bound);
00867
00868
00869
00870 docs_matched += definite_matches_not_seen;
00871
00872 Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
00873 Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
00874 Xapian::doccount uncollapsed_estimated = matches_estimated;
00875 if (items.size() < max_msize) {
00876
00877
00878 LOGLINE(MATCH, "items.size() = " << items.size() <<
00879 ", max_msize = " << max_msize << ", setting bounds equal");
00880 Assert(definite_matches_not_seen == 0);
00881 Assert(percent_cutoff || docs_matched == items.size());
00882 matches_lower_bound = matches_upper_bound = matches_estimated
00883 = items.size();
00884 if (collapser && matches_lower_bound > uncollapsed_lower_bound)
00885 uncollapsed_lower_bound = matches_lower_bound;
00886 } else if (!collapser && docs_matched < check_at_least) {
00887
00888
00889 LOGLINE(MATCH, "Setting bounds equal");
00890 matches_lower_bound = matches_upper_bound = matches_estimated
00891 = docs_matched;
00892 if (collapser && matches_lower_bound > uncollapsed_lower_bound)
00893 uncollapsed_lower_bound = matches_lower_bound;
00894 } else {
00895 AssertRel(matches_estimated,>=,matches_lower_bound);
00896 AssertRel(matches_estimated,<=,matches_upper_bound);
00897
00898
00899
00900
00901 double estimate_scale = 1.0;
00902 double unique_rate = 1.0;
00903
00904 if (collapser) {
00905 LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
00906 matches_lower_bound = collapser.get_matches_lower_bound();
00907 if (matches_lower_bound > uncollapsed_lower_bound)
00908 uncollapsed_lower_bound = matches_lower_bound;
00909
00910
00911
00912
00913 Xapian::doccount docs_considered = collapser.get_docs_considered();
00914 Xapian::doccount dups_ignored = collapser.get_dups_ignored();
00915 if (docs_considered > 0) {
00916 double unique = double(docs_considered - dups_ignored);
00917 unique_rate = unique / double(docs_considered);
00918 }
00919
00920
00921
00922 matches_upper_bound -= dups_ignored;
00923
00924 LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
00925 ", matches_estimated=" << matches_estimated <<
00926 "*" << estimate_scale << "*" << unique_rate <<
00927 ", matches_upper_bound=" << matches_upper_bound);
00928 }
00929
00930 if (mdecider || matchspy_legacy) {
00931 if (!percent_cutoff) {
00932 if (!collapser) {
00933
00934
00935
00936 matches_lower_bound = max(docs_matched, matches_lower_bound);
00937 }
00938 }
00939
00940
00941
00942
00943 if (decider_considered > 0) {
00944 double accept = double(decider_considered - decider_denied);
00945 double accept_rate = accept / double(decider_considered);
00946 estimate_scale *= accept_rate;
00947 }
00948
00949
00950
00951
00952
00953 matches_upper_bound -= decider_denied;
00954 if (collapser) uncollapsed_upper_bound -= decider_denied;
00955 }
00956
00957 if (percent_cutoff) {
00958 estimate_scale *= (1.0 - percent_cutoff_factor);
00959
00960
00961
00962
00963
00964
00965
00966
00967
00968 matches_lower_bound = items.size();
00969 if (collapser) uncollapsed_lower_bound = matches_lower_bound;
00970
00971
00972
00973
00974 LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
00975 percent_cutoff << "): now have matches_estimated=" <<
00976 matches_estimated << ", matches_lower_bound=" <<
00977 matches_lower_bound);
00978 }
00979
00980 if (collapser && estimate_scale != 1.0) {
00981 uncollapsed_estimated =
00982 Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
00983 }
00984
00985 estimate_scale *= unique_rate;
00986
00987 if (estimate_scale != 1.0) {
00988 matches_estimated =
00989 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
00990 if (matches_estimated < matches_lower_bound)
00991 matches_estimated = matches_lower_bound;
00992 }
00993
00994 if (collapser || mdecider || matchspy_legacy) {
00995 LOGLINE(MATCH, "Clamping estimate between bounds: "
00996 "matches_lower_bound = " << matches_lower_bound <<
00997 ", matches_estimated = " << matches_estimated <<
00998 ", matches_upper_bound = " << matches_upper_bound);
00999 AssertRel(matches_lower_bound,<=,matches_upper_bound);
01000 matches_estimated = max(matches_estimated, matches_lower_bound);
01001 matches_estimated = min(matches_estimated, matches_upper_bound);
01002 } else if (!percent_cutoff) {
01003 AssertRel(docs_matched,<=,matches_upper_bound);
01004 if (docs_matched > matches_lower_bound)
01005 matches_lower_bound = docs_matched;
01006 if (docs_matched > matches_estimated)
01007 matches_estimated = docs_matched;
01008 }
01009
01010 if (collapser && !mdecider && !percent_cutoff && !matchspy_legacy) {
01011 AssertRel(docs_matched,<=,uncollapsed_upper_bound);
01012 if (docs_matched > uncollapsed_lower_bound)
01013 uncollapsed_lower_bound = docs_matched;
01014 }
01015 }
01016
01017 if (collapser) {
01018 AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
01019 if (uncollapsed_estimated < uncollapsed_lower_bound) {
01020 uncollapsed_estimated = uncollapsed_lower_bound;
01021 } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
01022 uncollapsed_estimated = uncollapsed_upper_bound;
01023 }
01024 } else {
01025
01026 uncollapsed_lower_bound = matches_lower_bound;
01027 uncollapsed_upper_bound = matches_upper_bound;
01028 uncollapsed_estimated = matches_estimated;
01029 }
01030
01031 LOGLINE(MATCH, items.size() << " items in potential mset");
01032
01033 if (first > 0) {
01034
01035 if (items.size() <= first) {
01036 items.clear();
01037 } else {
01038 LOGLINE(MATCH, "finding " << first << "th");
01039
01040
01041
01042
01043 vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
01044 nth = items.rbegin() + first;
01045 nth_element(items.rbegin(), nth, items.rend(), mcmp);
01046
01047 items.erase(items.begin() + items.size() - first, items.end());
01048 }
01049 }
01050
01051 LOGLINE(MATCH, "sorting " << items.size() << " entries");
01052
01053
01054 sort(items.begin(), items.end(), mcmp);
01055
01056 if (!items.empty()) {
01057 LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
01058 LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
01059 }
01060
01061 AssertRel(matches_estimated,>=,matches_lower_bound);
01062 AssertRel(matches_estimated,<=,matches_upper_bound);
01063
01064 AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
01065 AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
01066
01067
01068
01069
01070
01071
01072
01073
01074
01075 if (!items.empty() && collapser && !collapser.empty()) {
01076
01077 Xapian::weight min_wt = 0.0;
01078 if (percent_scale > 0.0)
01079 min_wt = percent_cutoff_factor / percent_scale;
01080 Xapian::doccount entries = collapser.entries();
01081 vector<Xapian::Internal::MSetItem>::iterator i;
01082 for (i = items.begin(); i != items.end(); ++i) {
01083
01084 if (i->collapse_key.empty()) continue;
01085
01086
01087 i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
01088 if (--entries == 0) {
01089
01090 break;
01091 }
01092 }
01093 }
01094
01095 mset = Xapian::MSet(new Xapian::MSet::Internal(
01096 first,
01097 matches_upper_bound,
01098 matches_lower_bound,
01099 matches_estimated,
01100 uncollapsed_upper_bound,
01101 uncollapsed_lower_bound,
01102 uncollapsed_estimated,
01103 max_possible, greatest_wt, items,
01104 termfreqandwts,
01105 percent_scale * 100.0));
01106 }