00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <config.h>
00027
00028 #include "multimatch.h"
00029 #include "submatch.h"
00030 #include "localmatch.h"
00031 #include "omdebug.h"
00032 #include "omenquireinternal.h"
00033
00034 #include "emptypostlist.h"
00035 #include "branchpostlist.h"
00036 #include "mergepostlist.h"
00037
00038 #include "document.h"
00039 #include "omqueryinternal.h"
00040
00041 #include "submatch.h"
00042 #include "stats.h"
00043
00044 #include "msetcmp.h"
00045
00046 #include <xapian/errorhandler.h>
00047 #include <xapian/version.h>
00048
00049 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00050 #include "remotesubmatch.h"
00051 #include "remote-database.h"
00052 #endif
00053
00054 #include <algorithm>
00055 #include <cfloat>
00056 #include <climits>
00057 #include <vector>
00058 #include <map>
00059 #include <set>
00060
00061 using namespace std;
00062
00063 const Xapian::Enquire::Internal::sort_setting REL =
00064 Xapian::Enquire::Internal::REL;
00065 const Xapian::Enquire::Internal::sort_setting REL_VAL =
00066 Xapian::Enquire::Internal::REL_VAL;
00067 const Xapian::Enquire::Internal::sort_setting VAL =
00068 Xapian::Enquire::Internal::VAL;
00069 #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
00070 const Xapian::Enquire::Internal::sort_setting VAL_REL =
00071 Xapian::Enquire::Internal::VAL_REL;
00072 #endif
00073
00081 static void
00082 split_rset_by_db(const Xapian::RSet * rset,
00083 Xapian::doccount number_of_subdbs,
00084 vector<Xapian::RSet> & subrsets)
00085 {
00086 DEBUGCALL_STATIC(MATCH, void, "split_rset_by_db",
00087 (rset ? *rset : Xapian::RSet()) <<
00088 ", " << number_of_subdbs <<
00089 ", [subrsets(size=" << subrsets.size() << "]");
00090 if (rset) {
00091 if (number_of_subdbs == 1) {
00092
00093 subrsets.push_back(*rset);
00094 } else {
00095
00096
00097 subrsets.reserve(number_of_subdbs);
00098 for (size_t i = 0; i < number_of_subdbs; ++i) {
00099 subrsets.push_back(Xapian::RSet());
00100 }
00101
00102 const set<Xapian::docid> & rsetitems = rset->internal->get_items();
00103 set<Xapian::docid>::const_iterator j;
00104 for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
00105 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
00106 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
00107 subrsets[subdatabase].add_document(local_docid);
00108 }
00109 }
00110 } else {
00111
00112 subrsets.resize(number_of_subdbs);
00113 }
00114 Assert(subrsets.size() == number_of_subdbs);
00115 }
00116
00135 static void
00136 prepare_sub_matches(vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
00137 Xapian::ErrorHandler * errorhandler,
00138 Stats & stats)
00139 {
00140 DEBUGCALL_STATIC(MATCH, void, "prepare_sub_matches",
00141 "[leaves(size=" << leaves.size() << ")], " <<
00142 errorhandler << ", " << stats);
00143
00144 vector<bool> prepared;
00145 prepared.resize(leaves.size(), false);
00146 size_t unprepared = leaves.size();
00147 bool nowait = true;
00148 while (unprepared) {
00149 for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
00150 if (prepared[leaf]) continue;
00151 try {
00152 SubMatch * submatch = leaves[leaf].get();
00153 if (!submatch || submatch->prepare_match(nowait, stats)) {
00154 prepared[leaf] = true;
00155 --unprepared;
00156 }
00157 } catch (Xapian::Error & e) {
00158 if (!errorhandler) throw;
00159
00160 DEBUGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
00161 (*errorhandler)(e);
00162
00163 leaves[leaf] = NULL;
00164 prepared[leaf] = true;
00165 --unprepared;
00166 }
00167 }
00168
00169
00170 nowait = false;
00171 }
00172 }
00173
00175
00177 MultiMatch::MultiMatch(const Xapian::Database &db_,
00178 const Xapian::Query::Internal * query_,
00179 Xapian::termcount qlen,
00180 const Xapian::RSet * omrset,
00181 Xapian::valueno collapse_key_,
00182 int percent_cutoff_, Xapian::weight weight_cutoff_,
00183 Xapian::Enquire::docid_order order_,
00184 Xapian::valueno sort_key_,
00185 Xapian::Enquire::Internal::sort_setting sort_by_,
00186 bool sort_value_forward_,
00187 Xapian::ErrorHandler * errorhandler_,
00188 Stats & stats,
00189 const Xapian::Weight * weight_,
00190 bool have_sorter, bool have_mdecider)
00191 : db(db_), query(query_),
00192 collapse_key(collapse_key_), percent_cutoff(percent_cutoff_),
00193 weight_cutoff(weight_cutoff_), order(order_),
00194 sort_key(sort_key_), sort_by(sort_by_),
00195 sort_value_forward(sort_value_forward_),
00196 errorhandler(errorhandler_), weight(weight_),
00197 is_remote(db.internal.size())
00198 {
00199 DEBUGCALL(MATCH, void, "MultiMatch", db_ << ", " << query_ << ", " <<
00200 qlen << ", " << (omrset ? *omrset : Xapian::RSet()) << ", " <<
00201 collapse_key_ << ", " <<
00202 percent_cutoff_ << ", " << weight_cutoff_ << ", " <<
00203 int(order_) << ", " << sort_key_ << ", " <<
00204 int(sort_by_) << ", " << sort_value_forward_ << ", " <<
00205 errorhandler_ << ", " << stats << ", [weight_], " <<
00206 have_sorter << ", " << have_mdecider);
00207
00208 if (!query) return;
00209 query->validate_query();
00210
00211 Xapian::doccount number_of_subdbs = db.internal.size();
00212 vector<Xapian::RSet> subrsets;
00213 split_rset_by_db(omrset, number_of_subdbs, subrsets);
00214
00215 for (size_t i = 0; i != number_of_subdbs; ++i) {
00216 Xapian::Database::Internal *subdb = db.internal[i].get();
00217 Assert(subdb);
00218 Xapian::Internal::RefCntPtr<SubMatch> smatch;
00219 try {
00220
00221 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00222 RemoteDatabase *rem_db = subdb->as_remotedatabase();
00223 if (rem_db) {
00224 if (have_sorter) {
00225 throw Xapian::UnimplementedError("Xapian::Sorter not supported for the remote backend");
00226 }
00227 if (have_mdecider) {
00228 throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
00229 }
00230 rem_db->set_query(query, qlen, collapse_key, order, sort_key,
00231 sort_by, sort_value_forward, percent_cutoff,
00232 weight_cutoff, weight, subrsets[i]);
00233 bool decreasing_relevance =
00234 (sort_by == REL || sort_by == REL_VAL);
00235 smatch = new RemoteSubMatch(rem_db, decreasing_relevance);
00236 is_remote[i] = true;
00237 } else {
00238 #endif
00239 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00240 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00241 }
00242 #endif
00243 } catch (Xapian::Error & e) {
00244 if (!errorhandler) throw;
00245 DEBUGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
00246 (*errorhandler)(e);
00247
00248 smatch = NULL;
00249 }
00250 leaves.push_back(smatch);
00251 }
00252
00253 prepare_sub_matches(leaves, errorhandler, stats);
00254 }
00255
00256 string
00257 MultiMatch::get_collapse_key(PostList *pl, Xapian::docid did,
00258 Xapian::valueno keyno, Xapian::Internal::RefCntPtr<Xapian::Document::Internal> &doc)
00259 {
00260 DEBUGCALL(MATCH, string, "MultiMatch::get_collapse_key", pl << ", " << did << ", " << keyno << ", [doc]");
00261 const string *key = pl->get_collapse_key();
00262 if (key) RETURN(*key);
00263 if (doc.get() == 0) {
00264 unsigned int multiplier = db.internal.size();
00265 Assert(multiplier != 0);
00266 Xapian::doccount n = (did - 1) % multiplier;
00267 Xapian::docid m = (did - 1) / multiplier + 1;
00268
00269 doc = db.internal[n]->open_document(m, true);
00270 }
00271 RETURN(doc->get_value(keyno));
00272 }
00273
00274 Xapian::weight
00275 MultiMatch::getorrecalc_maxweight(PostList *pl)
00276 {
00277 DEBUGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
00278 Xapian::weight wt;
00279 if (recalculate_w_max) {
00280 DEBUGLINE(MATCH, "recalculating max weight");
00281 wt = pl->recalc_maxweight();
00282 recalculate_w_max = false;
00283 } else {
00284 wt = pl->get_maxweight();
00285 AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
00286 }
00287 DEBUGLINE(MATCH, "max possible doc weight = " << wt);
00288 RETURN(wt);
00289 }
00290
00291 void
00292 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00293 Xapian::doccount check_at_least,
00294 Xapian::MSet & mset,
00295 const Stats & stats,
00296 const Xapian::MatchDecider *mdecider,
00297 const Xapian::MatchDecider *matchspy,
00298 const Xapian::Sorter *sorter)
00299 {
00300 DEBUGCALL(MATCH, void, "MultiMatch::get_mset", first << ", " << maxitems
00301 << ", " << check_at_least << ", ...");
00302 AssertRel(check_at_least,>=,maxitems);
00303
00304 if (!query) {
00305 mset = Xapian::MSet();
00306 return;
00307 }
00308
00309 Assert(!leaves.empty());
00310
00311 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00312
00313
00314 if (leaves.size() == 1 && is_remote[0]) {
00315 RemoteSubMatch * rem_match;
00316 rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
00317 rem_match->start_match(first, maxitems, check_at_least, stats);
00318 rem_match->get_mset(mset);
00319 return;
00320 }
00321 #endif
00322
00323
00324 {
00325 vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
00326 for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
00327 if (!(*leaf).get()) continue;
00328 try {
00329 (*leaf)->start_match(0, first + maxitems,
00330 first + check_at_least, stats);
00331 } catch (Xapian::Error & e) {
00332 if (!errorhandler) throw;
00333 DEBUGLINE(EXCEPTION, "Calling error handler for "
00334 "start_match() on a SubMatch.");
00335 (*errorhandler)(e);
00336
00337 *leaf = NULL;
00338 }
00339 }
00340 }
00341
00342
00343 vector<PostList *> postlists;
00344 map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
00345 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
00346 termfreqandwts_ptr = &termfreqandwts;
00347
00348
00349
00350
00351
00352 Xapian::doccount definite_matches_not_seen = 0;
00353 for (size_t i = 0; i != leaves.size(); ++i) {
00354 PostList *pl;
00355 try {
00356 pl = leaves[i]->get_postlist_and_term_info(this,
00357 termfreqandwts_ptr);
00358 if (termfreqandwts_ptr && !termfreqandwts.empty())
00359 termfreqandwts_ptr = NULL;
00360 if (is_remote[i]) {
00361 if (pl->get_termfreq_min() > first + maxitems) {
00362 DEBUGLINE(MATCH, "Found " <<
00363 pl->get_termfreq_min() - (first + maxitems)
00364 << " definite matches in remote "
00365 "submatch which aren't passed to local "
00366 "match");
00367 definite_matches_not_seen += pl->get_termfreq_min();
00368 definite_matches_not_seen -= first + maxitems;
00369 }
00370 }
00371 } catch (Xapian::Error & e) {
00372 if (!errorhandler) throw;
00373 DEBUGLINE(EXCEPTION, "Calling error handler for "
00374 "get_term_info() on a SubMatch.");
00375 (*errorhandler)(e);
00376
00377
00378 leaves[i] = NULL;
00379 pl = new EmptyPostList;
00380 }
00381 postlists.push_back(pl);
00382 }
00383 Assert(!postlists.empty());
00384
00385
00386 PostList *pl;
00387 if (postlists.size() == 1) {
00388 pl = postlists.front();
00389 } else {
00390 pl = new MergePostList(postlists, this, errorhandler);
00391 }
00392
00393 DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00394
00395
00396 Xapian::doccount docs_matched = 0;
00397 Xapian::weight greatest_wt = 0;
00398 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00399 unsigned greatest_wt_subqs_db_num = UINT_MAX;
00400 #endif
00401 vector<Xapian::Internal::MSetItem> items;
00402
00403
00404 const Xapian::weight max_possible = pl->recalc_maxweight();
00405
00406 DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00407 recalculate_w_max = false;
00408
00409 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
00410 Xapian::doccount matches_lower_bound = 0;
00411 Xapian::doccount matches_estimated = pl->get_termfreq_est();
00412
00413 if (mdecider == NULL && matchspy == NULL) {
00414
00415
00416
00417 matches_lower_bound = pl->get_termfreq_min();
00418 }
00419
00420
00421
00422 if (check_at_least == 0) {
00423 delete pl;
00424 if (collapse_key != Xapian::BAD_VALUENO) {
00425
00426
00427 if (matches_lower_bound > 1) matches_lower_bound = 1;
00428 }
00429
00430 mset = Xapian::MSet(new Xapian::MSet::Internal(
00431 first,
00432 matches_upper_bound,
00433 matches_lower_bound,
00434 matches_estimated,
00435 max_possible, greatest_wt, items,
00436 termfreqandwts,
00437 0));
00438 return;
00439 }
00440
00441
00442
00443
00444 Xapian::doccount duplicates_found = 0;
00445 Xapian::doccount documents_considered = 0;
00446
00447
00448 Xapian::doccount decider_considered = 0;
00449
00450 Xapian::doccount decider_denied = 0;
00451
00452
00453
00454
00455 Xapian::doccount null_collapse_values = 0;
00456
00457
00458
00459 Xapian::doccount max_msize = first + maxitems;
00460 items.reserve(max_msize + 1);
00461
00462
00463
00464 Xapian::Internal::MSetItem min_item(0.0, 0);
00465
00466
00467 Xapian::weight min_weight = weight_cutoff;
00468
00469
00470 Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
00471
00472
00473 percent_cutoff_factor -= DBL_EPSILON;
00474
00475
00476 map<string, pair<Xapian::Internal::MSetItem,Xapian::weight> > collapse_tab;
00477
00479 bool sort_forward = (order != Xapian::Enquire::DESCENDING);
00480 MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495 bool is_heap = false;
00496
00497 while (true) {
00498 bool pushback;
00499
00500 if (rare(recalculate_w_max)) {
00501 if (min_weight > 0.0) {
00502 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00503 DEBUGLINE(MATCH, "*** TERMINATING EARLY (1)");
00504 break;
00505 }
00506 }
00507 }
00508
00509 if (rare(next_handling_prune(pl, min_weight, this))) {
00510 DEBUGLINE(MATCH, "*** REPLACING ROOT");
00511
00512 if (min_weight > 0.0) {
00513
00514
00515
00516 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00517 DEBUGLINE(MATCH, "*** TERMINATING EARLY (2)");
00518 break;
00519 }
00520 }
00521 }
00522
00523 if (rare(pl->at_end())) {
00524 DEBUGLINE(MATCH, "Reached end of potential matches");
00525 break;
00526 }
00527
00528
00529
00530
00531 Xapian::weight wt = 0.0;
00532 bool calculated_weight = false;
00533 if (sort_by != VAL || min_weight > 0.0) {
00534 wt = pl->get_weight();
00535 if (wt < min_weight) {
00536 DEBUGLINE(MATCH, "Rejecting potential match due to insufficient weight");
00537 continue;
00538 }
00539 calculated_weight = true;
00540 }
00541
00542 Xapian::Internal::RefCntPtr<Xapian::Document::Internal> doc;
00543 Xapian::docid did = pl->get_docid();
00544 DEBUGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
00545 Xapian::Internal::MSetItem new_item(wt, did);
00546 if (sort_by != REL) {
00547 const unsigned int multiplier = db.internal.size();
00548 Assert(multiplier != 0);
00549 Xapian::doccount n = (new_item.did - 1) % multiplier;
00550 Xapian::docid m = (new_item.did - 1) / multiplier + 1;
00551 doc = db.internal[n]->open_document(m, true);
00552 if (sorter) {
00553 new_item.sort_key = (*sorter)(Xapian::Document(doc.get()));
00554 } else {
00555 new_item.sort_key = doc->get_value(sort_key);
00556 }
00557
00558
00559
00560
00561
00562 if (!mcmp(new_item, min_item)) {
00563 if (matchspy == NULL && mdecider == NULL &&
00564 collapse_key == Xapian::BAD_VALUENO) {
00565
00566
00567 DEBUGLINE(MATCH, "Making note of match item which sorts lower than min_item");
00568 ++docs_matched;
00569 if (!calculated_weight) wt = pl->get_weight();
00570 if (wt > greatest_wt) goto new_greatest_weight;
00571 continue;
00572 }
00573 if (docs_matched >= check_at_least) {
00574
00575 DEBUGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
00576 continue;
00577 }
00578
00579
00580
00581 DEBUGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
00582 }
00583 }
00584
00585
00586 if (matchspy != NULL || mdecider != NULL) {
00587 const unsigned int multiplier = db.internal.size();
00588 Assert(multiplier != 0);
00589 Xapian::doccount n = (did - 1) % multiplier;
00590
00591
00592 if (!is_remote[n]) {
00593 if (doc.get() == 0) {
00594 Xapian::docid m = (did - 1) / multiplier + 1;
00595 doc = db.internal[n]->open_document(m, true);
00596 Assert(doc.get());
00597 }
00598 Xapian::Document mydoc(doc.get());
00599
00600 ++decider_considered;
00601 if (matchspy && !matchspy->operator()(mydoc)) {
00602 ++decider_denied;
00603 continue;
00604 }
00605 if (mdecider && !mdecider->operator()(mydoc)) {
00606 ++decider_denied;
00607 continue;
00608 }
00609 }
00610 }
00611
00612 if (!calculated_weight) {
00613
00614 wt = pl->get_weight();
00615 new_item.wt = wt;
00616 }
00617
00618 pushback = true;
00619 documents_considered++;
00620
00621
00622 if (collapse_key != Xapian::BAD_VALUENO) {
00623 new_item.collapse_key = get_collapse_key(pl, did, collapse_key,
00624 doc);
00625
00626
00627 if (new_item.collapse_key.empty()) {
00628 ++null_collapse_values;
00629 } else {
00630 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator oldkey;
00631 oldkey = collapse_tab.find(new_item.collapse_key);
00632 if (oldkey == collapse_tab.end()) {
00633 DEBUGLINE(MATCH, "collapsem: new key: " <<
00634 new_item.collapse_key);
00635
00636 collapse_tab.insert(make_pair(new_item.collapse_key,
00637 make_pair(new_item, Xapian::weight(0))));
00638 } else {
00639 ++duplicates_found;
00640 Xapian::Internal::MSetItem &old_item = oldkey->second.first;
00641
00642 if (mcmp(old_item, new_item)) {
00643 DEBUGLINE(MATCH, "collapsem: better exists: " <<
00644 new_item.collapse_key);
00645
00646 ++old_item.collapse_count;
00647
00648 if (new_item.wt > oldkey->second.second) {
00649 oldkey->second.second = new_item.wt;
00650 }
00651
00652
00653 if (sort_by != REL && sort_by != REL_VAL) {
00654 if (wt > greatest_wt) goto new_greatest_weight;
00655 }
00656 continue;
00657 }
00658
00659
00660 new_item.collapse_count = old_item.collapse_count + 1;
00661
00662
00663
00664 Assert(!items.empty());
00665
00666
00667
00668
00669
00670 Xapian::weight old_wt = old_item.wt;
00671 if (old_wt >= min_weight && mcmp(old_item, min_item)) {
00672
00673
00674 Xapian::docid olddid = old_item.did;
00675 vector<Xapian::Internal::MSetItem>::iterator i;
00676 for (i = items.begin(); i != items.end(); ++i) {
00677 if (i->did == olddid) {
00678 DEBUGLINE(MATCH, "collapsem: removing " <<
00679 olddid << ": " <<
00680 new_item.collapse_key);
00681
00682
00683
00684
00685
00686
00687 *i = new_item;
00688 pushback = false;
00689 is_heap = false;
00690 break;
00691 }
00692 }
00693 }
00694
00695
00696 oldkey->second = make_pair(new_item, old_wt);
00697 }
00698 }
00699 }
00700
00701
00702 if (pushback) {
00703 ++docs_matched;
00704 if (items.size() >= max_msize) {
00705 items.push_back(new_item);
00706 if (!is_heap) {
00707 is_heap = true;
00708 make_heap(items.begin(), items.end(), mcmp);
00709 } else {
00710 push_heap<vector<Xapian::Internal::MSetItem>::iterator,
00711 MSetCmp>(items.begin(), items.end(), mcmp);
00712 }
00713 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00714 MSetCmp>(items.begin(), items.end(), mcmp);
00715 items.pop_back();
00716
00717 min_item = items.front();
00718 if (sort_by == REL || sort_by == REL_VAL) {
00719 if (docs_matched >= check_at_least) {
00720 if (sort_by == REL) {
00721
00722
00723
00724 if (rare(max_possible == 0 && sort_forward)) {
00725
00726
00727
00728
00729
00730 if (leaves.size() == 1) break;
00731 }
00732 }
00733 if (min_item.wt > min_weight) {
00734 DEBUGLINE(MATCH, "Setting min_weight to " <<
00735 min_item.wt << " from " << min_weight);
00736 min_weight = min_item.wt;
00737 }
00738 }
00739 }
00740 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00741 DEBUGLINE(MATCH, "*** TERMINATING EARLY (3)");
00742 break;
00743 }
00744 } else {
00745 items.push_back(new_item);
00746 is_heap = false;
00747 if (sort_by == REL && items.size() == max_msize) {
00748 if (docs_matched >= check_at_least) {
00749
00750
00751
00752 if (rare(max_possible == 0 && sort_forward)) {
00753
00754
00755
00756
00757
00758 if (leaves.size() == 1) break;
00759 }
00760 }
00761 }
00762 }
00763 }
00764
00765
00766 if (wt > greatest_wt) {
00767 new_greatest_weight:
00768 greatest_wt = wt;
00769 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00770 const unsigned int multiplier = db.internal.size();
00771 unsigned int db_num = (did - 1) % multiplier;
00772 if (is_remote[db_num]) {
00773
00774
00775 greatest_wt_subqs_db_num = db_num;
00776 } else {
00777 greatest_wt_subqs_db_num = UINT_MAX;
00778 }
00779 #endif
00780 if (percent_cutoff) {
00781 Xapian::weight w = wt * percent_cutoff_factor;
00782 if (w > min_weight) {
00783 min_weight = w;
00784 if (!is_heap) {
00785 is_heap = true;
00786 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00787 MSetCmp>(items.begin(), items.end(), mcmp);
00788 }
00789 while (!items.empty() && items.front().wt < min_weight) {
00790 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00791 MSetCmp>(items.begin(), items.end(), mcmp);
00792 Assert(items.back().wt < min_weight);
00793 items.pop_back();
00794 }
00795 #ifdef XAPIAN_DEBUG_PARANOID
00796 vector<Xapian::Internal::MSetItem>::const_iterator i;
00797 for (i = items.begin(); i != items.end(); ++i) {
00798 Assert(i->wt >= min_weight);
00799 }
00800 #endif
00801 }
00802 }
00803 }
00804 }
00805
00806
00807 delete pl;
00808
00809 double percent_scale = 0;
00810 if (!items.empty() && greatest_wt > 0) {
00811
00812
00813 vector<Xapian::Internal::MSetItem>::const_iterator best;
00814 best = min_element(items.begin(), items.end(), mcmp);
00815
00816 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00817 if (greatest_wt_subqs_db_num != UINT_MAX) {
00818 const unsigned int n = greatest_wt_subqs_db_num;
00819 RemoteSubMatch * rem_match;
00820 rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
00821 percent_scale = rem_match->get_percent_factor() / 100.0;
00822 } else
00823 #endif
00824 if (termfreqandwts.size() > 1) {
00825 Xapian::termcount matching_terms = 0;
00826 map<string,
00827 Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator i;
00828
00829
00830 i = termfreqandwts.find(string());
00831 if (i != termfreqandwts.end()) {
00832 percent_scale += i->second.termweight;
00833 ++matching_terms;
00834 }
00835
00836 Xapian::TermIterator docterms = db.termlist_begin(best->did);
00837 Xapian::TermIterator docterms_end = db.termlist_end(best->did);
00838 while (docterms != docterms_end) {
00839 i = termfreqandwts.find(*docterms);
00840 if (i != termfreqandwts.end()) {
00841 percent_scale += i->second.termweight;
00842 ++matching_terms;
00843 if (matching_terms == termfreqandwts.size()) break;
00844 }
00845 ++docterms;
00846 }
00847
00848 if (matching_terms < termfreqandwts.size()) {
00849
00850 double denom = 0;
00851 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
00852 denom += i->second.termweight;
00853
00854 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
00855 Assert(percent_scale <= denom);
00856 denom *= greatest_wt;
00857 Assert(denom > 0);
00858 percent_scale /= denom;
00859 } else {
00860
00861 percent_scale = 1.0 / greatest_wt;
00862 }
00863 } else {
00864
00865
00866 percent_scale = 1.0 / greatest_wt;
00867 }
00868 Assert(percent_scale > 0);
00869 if (percent_cutoff) {
00870
00871
00872
00873
00874 Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
00875 if (!is_heap) {
00876 is_heap = true;
00877 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00878 MSetCmp>(items.begin(), items.end(), mcmp);
00879 }
00880 while (!items.empty() && items.front().wt < min_wt) {
00881 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00882 MSetCmp>(items.begin(), items.end(), mcmp);
00883 Assert(items.back().wt < min_wt);
00884 items.pop_back();
00885 }
00886 #ifdef XAPIAN_DEBUG_PARANOID
00887 vector<Xapian::Internal::MSetItem>::const_iterator j;
00888 for (j = items.begin(); j != items.end(); ++j) {
00889 Assert(j->wt >= min_wt);
00890 }
00891 #endif
00892 }
00893 percent_scale *= 100.0;
00894 }
00895
00896 DEBUGLINE(MATCH,
00897 "docs_matched = " << docs_matched <<
00898 ", definite_matches_not_seen = " << definite_matches_not_seen <<
00899 ", matches_lower_bound = " << matches_lower_bound <<
00900 ", matches_estimated = " << matches_estimated <<
00901 ", matches_upper_bound = " << matches_upper_bound);
00902
00903
00904
00905 docs_matched += definite_matches_not_seen;
00906
00907 if (items.size() < max_msize) {
00908
00909
00910 DEBUGLINE(MATCH, "items.size() = " << items.size() <<
00911 ", max_msize = " << max_msize << ", setting bounds equal");
00912 Assert(definite_matches_not_seen == 0);
00913 Assert(percent_cutoff || docs_matched == items.size());
00914 matches_lower_bound = matches_upper_bound = matches_estimated
00915 = items.size();
00916 } else if (collapse_key == Xapian::BAD_VALUENO && docs_matched < check_at_least) {
00917
00918
00919 DEBUGLINE(MATCH, "Setting bounds equal");
00920 matches_lower_bound = matches_upper_bound = matches_estimated
00921 = docs_matched;
00922 } else {
00923 AssertRel(matches_estimated,>=,matches_lower_bound);
00924 AssertRel(matches_estimated,<=,matches_upper_bound);
00925
00926
00927
00928
00929 double estimate_scale = 1.0;
00930
00931 if (collapse_key != Xapian::BAD_VALUENO) {
00932
00933
00934
00935
00936 DEBUGLINE(MATCH, "Adjusting bounds due to collapse_key");
00937 matches_lower_bound = null_collapse_values + collapse_tab.size();
00938
00939
00940
00941
00942 if (documents_considered > 0) {
00943 double unique = double(documents_considered - duplicates_found);
00944 double unique_rate = unique / double(documents_considered);
00945 estimate_scale *= unique_rate;
00946 }
00947
00948
00949
00950 matches_upper_bound -= duplicates_found;
00951
00952 DEBUGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
00953 ", matches_estimated=" << matches_estimated <<
00954 "*" << estimate_scale <<
00955 ", matches_upper_bound=" << matches_upper_bound);
00956 }
00957
00958 if (matchspy || mdecider) {
00959 if (collapse_key == Xapian::BAD_VALUENO && !percent_cutoff) {
00960
00961
00962 matches_lower_bound = max(docs_matched, matches_lower_bound);
00963 }
00964
00965
00966
00967
00968 if (decider_considered > 0) {
00969 double accept = double(decider_considered - decider_denied);
00970 double accept_rate = accept / double(decider_considered);
00971 estimate_scale *= accept_rate;
00972 }
00973
00974
00975
00976
00977
00978 matches_upper_bound -= decider_denied;
00979 }
00980
00981 if (percent_cutoff) {
00982 estimate_scale *= (1.0 - percent_cutoff_factor);
00983
00984
00985
00986
00987
00988
00989
00990
00991
00992 matches_lower_bound = items.size();
00993
00994
00995 DEBUGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
00996 percent_cutoff << "): now have matches_estimated=" <<
00997 matches_estimated << ", matches_lower_bound=" <<
00998 matches_lower_bound);
00999 }
01000
01001 if (estimate_scale != 1.0) {
01002 matches_estimated =
01003 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
01004 if (matches_estimated < matches_lower_bound)
01005 matches_estimated = matches_lower_bound;
01006 }
01007
01008 if (collapse_key != Xapian::BAD_VALUENO || matchspy || mdecider) {
01009 DEBUGLINE(MATCH, "Clamping estimate between bounds: "
01010 "matches_lower_bound = " << matches_lower_bound <<
01011 ", matches_estimated = " << matches_estimated <<
01012 ", matches_upper_bound = " << matches_upper_bound);
01013 AssertRel(matches_lower_bound,<=,matches_upper_bound);
01014 matches_estimated = max(matches_estimated, matches_lower_bound);
01015 matches_estimated = min(matches_estimated, matches_upper_bound);
01016 } else if (!percent_cutoff) {
01017 AssertRel(docs_matched,<=,matches_upper_bound);
01018 if (docs_matched > matches_lower_bound)
01019 matches_lower_bound = docs_matched;
01020 if (docs_matched > matches_estimated)
01021 matches_estimated = docs_matched;
01022 }
01023 }
01024
01025 DEBUGLINE(MATCH, items.size() << " items in potential mset");
01026
01027 if (first > 0) {
01028
01029 if (items.size() <= first) {
01030 items.clear();
01031 } else {
01032 DEBUGLINE(MATCH, "finding " << first << "th");
01033
01034
01035
01036
01037 vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
01038 nth = items.rbegin() + first;
01039 nth_element(items.rbegin(), nth, items.rend(), mcmp);
01040
01041 items.erase(items.begin() + items.size() - first, items.end());
01042 }
01043 }
01044
01045 DEBUGLINE(MATCH, "sorting " << items.size() << " entries");
01046
01047
01048 sort(items.begin(), items.end(), mcmp);
01049
01050 if (!items.empty()) {
01051 DEBUGLINE(MATCH, "min weight in mset = " << items.back().wt);
01052 DEBUGLINE(MATCH, "max weight in mset = " << items[0].wt);
01053 }
01054
01055 AssertRel(matches_estimated,>=,matches_lower_bound);
01056 AssertRel(matches_estimated,<=,matches_upper_bound);
01057
01058
01059
01060
01061
01062
01063
01064
01065
01066 if (collapse_key != Xapian::BAD_VALUENO && !items.empty() &&
01067 !collapse_tab.empty()) {
01068
01069
01070 Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
01071 vector<Xapian::Internal::MSetItem>::iterator i;
01072 for (i = items.begin(); i != items.end() && !collapse_tab.empty(); ++i) {
01073
01074 if ( !i->collapse_key.empty()) {
01075 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator key;
01076 key = collapse_tab.find(i->collapse_key);
01077
01078
01079
01080
01081 Assert(key != collapse_tab.end());
01082
01083
01084
01085 if (key->second.second < min_wt)
01086 i->collapse_count = 0;
01087 else
01088 i->collapse_count = key->second.first.collapse_count;
01089
01090
01091 collapse_tab.erase(key);
01092 }
01093 }
01094 }
01095
01096 mset = Xapian::MSet(new Xapian::MSet::Internal(
01097 first,
01098 matches_upper_bound,
01099 matches_lower_bound,
01100 matches_estimated,
01101 max_possible, greatest_wt, items,
01102 termfreqandwts,
01103 percent_scale));
01104 }
01105
01106
01107
01108 void
01109 MultiMatch::recalc_maxweight()
01110 {
01111 DEBUGCALL(MATCH, void, "MultiMatch::recalc_maxweight", "");
01112 recalculate_w_max = true;
01113 }