matcher/multimatch.cc

Go to the documentation of this file.
00001 /* multimatch.cc
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2001,2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2005,2006,2007,2008,2010 Olly Betts
00006  * Copyright 2003 Orange PCS Ltd
00007  * Copyright 2003 Sam Liddicott
00008  * Copyright 2007,2008 Lemur Consulting Ltd
00009  *
00010  * This program is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU General Public License as
00012  * published by the Free Software Foundation; either version 2 of the
00013  * License, or (at your option) any later version.
00014  *
00015  * This program is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  * GNU General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU General Public License
00021  * along with this program; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00023  * USA
00024  */
00025 
00026 #include <config.h>
00027 
00028 #include "multimatch.h"
00029 #include "submatch.h"
00030 #include "localmatch.h"
00031 #include "omdebug.h"
00032 #include "omenquireinternal.h"
00033 
00034 #include "emptypostlist.h"
00035 #include "branchpostlist.h"
00036 #include "mergepostlist.h"
00037 
00038 #include "document.h"
00039 #include "omqueryinternal.h"
00040 
00041 #include "submatch.h"
00042 #include "stats.h"
00043 
00044 #include "msetcmp.h"
00045 
00046 #include <xapian/errorhandler.h>
00047 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
00048 
00049 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00050 #include "remotesubmatch.h"
00051 #include "remote-database.h"
00052 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
00053 
00054 #include <algorithm>
00055 #include <cfloat> // For DBL_EPSILON.
00056 #include <climits> // For UINT_MAX.
00057 #include <vector>
00058 #include <map>
00059 #include <set>
00060 
00061 using namespace std;
00062 
00063 const Xapian::Enquire::Internal::sort_setting REL =
00064         Xapian::Enquire::Internal::REL;
00065 const Xapian::Enquire::Internal::sort_setting REL_VAL =
00066         Xapian::Enquire::Internal::REL_VAL;
00067 const Xapian::Enquire::Internal::sort_setting VAL =
00068         Xapian::Enquire::Internal::VAL;
00069 #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
00070 const Xapian::Enquire::Internal::sort_setting VAL_REL =
00071         Xapian::Enquire::Internal::VAL_REL;
00072 #endif
00073 
00081 static void
00082 split_rset_by_db(const Xapian::RSet * rset,
00083                  Xapian::doccount number_of_subdbs,
00084                  vector<Xapian::RSet> & subrsets)
00085 {
00086     DEBUGCALL_STATIC(MATCH, void, "split_rset_by_db",
00087                      (rset ? *rset : Xapian::RSet()) <<
00088                      ", " << number_of_subdbs <<
00089                      ", [subrsets(size=" << subrsets.size() << "]");
00090     if (rset) {
00091         if (number_of_subdbs == 1) {
00092             // The common case of a single database is easy to handle.
00093             subrsets.push_back(*rset);
00094         } else {
00095             // Can't just use vector::resize() here, since that creates N
00096             // copies of the same RSet!
00097             subrsets.reserve(number_of_subdbs);
00098             for (size_t i = 0; i < number_of_subdbs; ++i) {
00099                 subrsets.push_back(Xapian::RSet());
00100             }
00101 
00102             const set<Xapian::docid> & rsetitems = rset->internal->get_items();
00103             set<Xapian::docid>::const_iterator j;
00104             for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
00105                 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
00106                 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
00107                 subrsets[subdatabase].add_document(local_docid);
00108             }
00109         }
00110     } else {
00111         // NB vector::resize() creates N copies of the same empty RSet.
00112         subrsets.resize(number_of_subdbs);
00113     }
00114     Assert(subrsets.size() == number_of_subdbs);
00115 }
00116 
00135 static void
00136 prepare_sub_matches(vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
00137                     Xapian::ErrorHandler * errorhandler,
00138                     Stats & stats)
00139 {
00140     DEBUGCALL_STATIC(MATCH, void, "prepare_sub_matches",
00141                      "[leaves(size=" << leaves.size() << ")], " <<
00142                      errorhandler << ", " << stats);
00143     // We use a vector<bool> to track which SubMatches we're already prepared.
00144     vector<bool> prepared;
00145     prepared.resize(leaves.size(), false);
00146     size_t unprepared = leaves.size();
00147     bool nowait = true;
00148     while (unprepared) {
00149         for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
00150             if (prepared[leaf]) continue;
00151             try {
00152                 SubMatch * submatch = leaves[leaf].get();
00153                 if (!submatch || submatch->prepare_match(nowait, stats)) {
00154                     prepared[leaf] = true;
00155                     --unprepared;
00156                 }
00157             } catch (Xapian::Error & e) {
00158                 if (!errorhandler) throw;
00159 
00160                 DEBUGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
00161                 (*errorhandler)(e);
00162                 // Continue match without this sub-match.
00163                 leaves[leaf] = NULL;
00164                 prepared[leaf] = true;
00165                 --unprepared;
00166             }
00167         }
00168         // Use blocking IO on subsequent passes, so that we don't go into
00169         // a tight loop.
00170         nowait = false;
00171     }
00172 }
00173 
00175 // Initialisation and cleaning up //
00177 MultiMatch::MultiMatch(const Xapian::Database &db_,
00178                        const Xapian::Query::Internal * query_,
00179                        Xapian::termcount qlen,
00180                        const Xapian::RSet * omrset,
00181                        Xapian::valueno collapse_key_,
00182                        int percent_cutoff_, Xapian::weight weight_cutoff_,
00183                        Xapian::Enquire::docid_order order_,
00184                        Xapian::valueno sort_key_,
00185                        Xapian::Enquire::Internal::sort_setting sort_by_,
00186                        bool sort_value_forward_,
00187                        Xapian::ErrorHandler * errorhandler_,
00188                        Stats & stats,
00189                        const Xapian::Weight * weight_,
00190                        bool have_sorter, bool have_mdecider)
00191         : db(db_), query(query_),
00192           collapse_key(collapse_key_), percent_cutoff(percent_cutoff_),
00193           weight_cutoff(weight_cutoff_), order(order_),
00194           sort_key(sort_key_), sort_by(sort_by_),
00195           sort_value_forward(sort_value_forward_),
00196           errorhandler(errorhandler_), weight(weight_),
00197           is_remote(db.internal.size())
00198 {
00199     DEBUGCALL(MATCH, void, "MultiMatch", db_ << ", " << query_ << ", " <<
00200               qlen << ", " << (omrset ? *omrset : Xapian::RSet()) << ", " <<
00201               collapse_key_ << ", " <<
00202               percent_cutoff_ << ", " << weight_cutoff_ << ", " <<
00203               int(order_) << ", " << sort_key_ << ", " <<
00204               int(sort_by_) << ", " << sort_value_forward_ << ", " <<
00205               errorhandler_ << ", " << stats << ", [weight_], " <<
00206               have_sorter << ", " << have_mdecider);
00207 
00208     if (!query) return;
00209     query->validate_query();
00210 
00211     Xapian::doccount number_of_subdbs = db.internal.size();
00212     vector<Xapian::RSet> subrsets;
00213     split_rset_by_db(omrset, number_of_subdbs, subrsets);
00214 
00215     for (size_t i = 0; i != number_of_subdbs; ++i) {
00216         Xapian::Database::Internal *subdb = db.internal[i].get();
00217         Assert(subdb);
00218         Xapian::Internal::RefCntPtr<SubMatch> smatch;
00219         try {
00220             // There is currently only one special case, for network databases.
00221 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00222             RemoteDatabase *rem_db = subdb->as_remotedatabase();
00223             if (rem_db) {
00224                 if (have_sorter) {
00225                     throw Xapian::UnimplementedError("Xapian::Sorter not supported for the remote backend");
00226                 }
00227                 if (have_mdecider) {
00228                     throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
00229                 }
00230                 rem_db->set_query(query, qlen, collapse_key, order, sort_key,
00231                                   sort_by, sort_value_forward, percent_cutoff,
00232                                   weight_cutoff, weight, subrsets[i]);
00233                 bool decreasing_relevance =
00234                     (sort_by == REL || sort_by == REL_VAL);
00235                 smatch = new RemoteSubMatch(rem_db, decreasing_relevance);
00236                 is_remote[i] = true;
00237             } else {
00238 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
00239                 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00240 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00241             }
00242 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
00243         } catch (Xapian::Error & e) {
00244             if (!errorhandler) throw;
00245             DEBUGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
00246             (*errorhandler)(e);
00247             // Continue match without this sub-postlist.
00248             smatch = NULL;
00249         }
00250         leaves.push_back(smatch);
00251     }
00252 
00253     prepare_sub_matches(leaves, errorhandler, stats);
00254 }
00255 
00256 string
00257 MultiMatch::get_collapse_key(PostList *pl, Xapian::docid did,
00258                              Xapian::valueno keyno, Xapian::Internal::RefCntPtr<Xapian::Document::Internal> &doc)
00259 {
00260     DEBUGCALL(MATCH, string, "MultiMatch::get_collapse_key", pl << ", " << did << ", " << keyno << ", [doc]");
00261     const string *key = pl->get_collapse_key();
00262     if (key) RETURN(*key);
00263     if (doc.get() == 0) {
00264         unsigned int multiplier = db.internal.size();
00265         Assert(multiplier != 0);
00266         Xapian::doccount n = (did - 1) % multiplier; // which actual database
00267         Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
00268 
00269         doc = db.internal[n]->open_document(m, true);
00270     }
00271     RETURN(doc->get_value(keyno));
00272 }
00273 
00274 Xapian::weight
00275 MultiMatch::getorrecalc_maxweight(PostList *pl)
00276 {
00277     DEBUGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
00278     Xapian::weight wt;
00279     if (recalculate_w_max) {
00280         DEBUGLINE(MATCH, "recalculating max weight");
00281         wt = pl->recalc_maxweight();
00282         recalculate_w_max = false;
00283     } else {
00284         wt = pl->get_maxweight();
00285         AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
00286     }
00287     DEBUGLINE(MATCH, "max possible doc weight = " << wt);
00288     RETURN(wt);
00289 }
00290 
00291 void
00292 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00293                      Xapian::doccount check_at_least,
00294                      Xapian::MSet & mset,
00295                      const Stats & stats,
00296                      const Xapian::MatchDecider *mdecider,
00297                      const Xapian::MatchDecider *matchspy,
00298                      const Xapian::Sorter *sorter)
00299 {
00300     DEBUGCALL(MATCH, void, "MultiMatch::get_mset", first << ", " << maxitems
00301               << ", " << check_at_least << ", ...");
00302     AssertRel(check_at_least,>=,maxitems);
00303 
00304     if (!query) {
00305         mset = Xapian::MSet(); // FIXME: mset.get_firstitem() will return 0 not first
00306         return;
00307     }
00308 
00309     Assert(!leaves.empty());
00310 
00311 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00312     // If there's only one database and it's remote, we can just unserialise
00313     // its MSet and return that.
00314     if (leaves.size() == 1 && is_remote[0]) {
00315         RemoteSubMatch * rem_match;
00316         rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
00317         rem_match->start_match(first, maxitems, check_at_least, stats);
00318         rem_match->get_mset(mset);
00319         return;
00320     }
00321 #endif
00322 
00323     // Start matchers.
00324     {
00325         vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
00326         for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
00327             if (!(*leaf).get()) continue;
00328             try {
00329                 (*leaf)->start_match(0, first + maxitems,
00330                                      first + check_at_least, stats);
00331             } catch (Xapian::Error & e) {
00332                 if (!errorhandler) throw;
00333                 DEBUGLINE(EXCEPTION, "Calling error handler for "
00334                           "start_match() on a SubMatch.");
00335                 (*errorhandler)(e);
00336                 // Continue match without this sub-match.
00337                 *leaf = NULL;
00338             }
00339         }
00340     }
00341 
00342     // Get postlists and term info
00343     vector<PostList *> postlists;
00344     map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
00345     map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
00346     termfreqandwts_ptr = &termfreqandwts;
00347 
00348     // Keep a count of matches which we know exist, but we won't see.  This
00349     // occurs when a submatch is remote, and returns a lower bound on the
00350     // number of matching documents which is higher than the number of
00351     // documents it returns (because it wasn't asked for more documents).
00352     Xapian::doccount definite_matches_not_seen = 0;
00353     for (size_t i = 0; i != leaves.size(); ++i) {
00354         PostList *pl;
00355         try {
00356             pl = leaves[i]->get_postlist_and_term_info(this,
00357                                                        termfreqandwts_ptr);
00358             if (termfreqandwts_ptr && !termfreqandwts.empty())
00359                 termfreqandwts_ptr = NULL;
00360             if (is_remote[i]) {
00361                 if (pl->get_termfreq_min() > first + maxitems) {
00362                     DEBUGLINE(MATCH, "Found " <<
00363                               pl->get_termfreq_min() - (first + maxitems)
00364                               << " definite matches in remote "
00365                               "submatch which aren't passed to local "
00366                               "match");
00367                     definite_matches_not_seen += pl->get_termfreq_min();
00368                     definite_matches_not_seen -= first + maxitems;
00369                 }
00370             }
00371         } catch (Xapian::Error & e) {
00372             if (!errorhandler) throw;
00373             DEBUGLINE(EXCEPTION, "Calling error handler for "
00374                       "get_term_info() on a SubMatch.");
00375             (*errorhandler)(e);
00376             // FIXME: check if *ALL* the remote servers have failed!
00377             // Continue match without this sub-match.
00378             leaves[i] = NULL;
00379             pl = new EmptyPostList;
00380         }
00381         postlists.push_back(pl);
00382     }
00383     Assert(!postlists.empty());
00384 
00385     // Get a single combined postlist
00386     PostList *pl;
00387     if (postlists.size() == 1) {
00388         pl = postlists.front();
00389     } else {
00390         pl = new MergePostList(postlists, this, errorhandler);
00391     }
00392 
00393     DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00394 
00395     // Empty result set
00396     Xapian::doccount docs_matched = 0;
00397     Xapian::weight greatest_wt = 0;
00398 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00399     unsigned greatest_wt_subqs_db_num = UINT_MAX;
00400 #endif
00401     vector<Xapian::Internal::MSetItem> items;
00402 
00403     // maximum weight a document could possibly have
00404     const Xapian::weight max_possible = pl->recalc_maxweight();
00405 
00406     DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00407     recalculate_w_max = false;
00408 
00409     Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
00410     Xapian::doccount matches_lower_bound = 0;
00411     Xapian::doccount matches_estimated   = pl->get_termfreq_est();
00412 
00413     if (mdecider == NULL && matchspy == NULL) {
00414         // If we have a matcher decider or match spy, the lower bound must be
00415         // set to 0 as we could discard all hits.  Otherwise set it to the
00416         // minimum number of entries which the postlist could return.
00417         matches_lower_bound = pl->get_termfreq_min();
00418     }
00419 
00420     // Check if any results have been asked for (might just be wanting
00421     // maxweight).
00422     if (check_at_least == 0) {
00423         delete pl;
00424         if (collapse_key != Xapian::BAD_VALUENO) {
00425             // Lower bound must be set to no more than 1, since it's possible
00426             // that all hits will be collapsed to a single hit.
00427             if (matches_lower_bound > 1) matches_lower_bound = 1;
00428         }
00429 
00430         mset = Xapian::MSet(new Xapian::MSet::Internal(
00431                                            first,
00432                                            matches_upper_bound,
00433                                            matches_lower_bound,
00434                                            matches_estimated,
00435                                            max_possible, greatest_wt, items,
00436                                            termfreqandwts,
00437                                            0));
00438         return;
00439     }
00440 
00441     // duplicates_found and documents_considered are used solely to keep track
00442     // of how frequently collapses are occurring, so that the matches_estimated
00443     // can be adjusted accordingly.
00444     Xapian::doccount duplicates_found = 0;
00445     Xapian::doccount documents_considered = 0;
00446 
00447     // Number of documents considered by a decider or matchspy.
00448     Xapian::doccount decider_considered = 0;
00449     // Number of documents denied by the decider or matchspy.
00450     Xapian::doccount decider_denied = 0;
00451 
00452     // We keep track of the number of null collapse values because this allows
00453     // us to provide a better value for matches_lower_bound if there are null
00454     // collapse values.
00455     Xapian::doccount null_collapse_values = 0;
00456 
00457     // Set max number of results that we want - this is used to decide
00458     // when to throw away unwanted items.
00459     Xapian::doccount max_msize = first + maxitems;
00460     items.reserve(max_msize + 1);
00461 
00462     // Tracks the minimum item currently eligible for the MSet - we compare
00463     // candidate items against this.
00464     Xapian::Internal::MSetItem min_item(0.0, 0);
00465 
00466     // Minimum weight an item must have to be worth considering.
00467     Xapian::weight min_weight = weight_cutoff;
00468 
00469     // Factor to multiply maximum weight seen by to get the cutoff weight.
00470     Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
00471     // Corresponding correction to that in omenquire.cc to account for excess
00472     // precision on x86.
00473     percent_cutoff_factor -= DBL_EPSILON;
00474 
00475     // Table of keys which have been seen already, for collapsing.
00476     map<string, pair<Xapian::Internal::MSetItem,Xapian::weight> > collapse_tab;
00477 
00479     bool sort_forward = (order != Xapian::Enquire::DESCENDING);
00480     MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
00481 
00482     // Perform query
00483 
00484     // We form the mset in two stages.  In the first we fill up our working
00485     // mset.  Adding a new document does not remove another.
00486     //
00487     // In the second, we consider documents which rank higher than the current
00488     // lowest ranking document in the mset.  Each document added expels the
00489     // current lowest ranking document.
00490     //
00491     // If a percentage cutoff is in effect, it can cause the matcher to return
00492     // from the second stage from the first.
00493 
00494     // Is the mset a valid heap?
00495     bool is_heap = false;
00496 
00497     while (true) {
00498         bool pushback;
00499 
00500         if (rare(recalculate_w_max)) {
00501             if (min_weight > 0.0) {
00502                 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00503                     DEBUGLINE(MATCH, "*** TERMINATING EARLY (1)");
00504                     break;
00505                 }
00506             }
00507         }
00508 
00509         if (rare(next_handling_prune(pl, min_weight, this))) {
00510             DEBUGLINE(MATCH, "*** REPLACING ROOT");
00511 
00512             if (min_weight > 0.0) {
00513                 // No need for a full recalc (unless we've got to do one
00514                 // because of a prune elsewhere) - we're just switching to a
00515                 // subtree.
00516                 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00517                     DEBUGLINE(MATCH, "*** TERMINATING EARLY (2)");
00518                     break;
00519                 }
00520             }
00521         }
00522 
00523         if (rare(pl->at_end())) {
00524             DEBUGLINE(MATCH, "Reached end of potential matches");
00525             break;
00526         }
00527 
00528         // Only calculate the weight if we need it for mcmp, or there's a
00529         // percentage or weight cutoff in effect.  Otherwise we calculate it
00530         // below if we haven't already rejected this candidate.
00531         Xapian::weight wt = 0.0;
00532         bool calculated_weight = false;
00533         if (sort_by != VAL || min_weight > 0.0) {
00534             wt = pl->get_weight();
00535             if (wt < min_weight) {
00536                 DEBUGLINE(MATCH, "Rejecting potential match due to insufficient weight");
00537                 continue;
00538             }
00539             calculated_weight = true;
00540         }
00541 
00542         Xapian::Internal::RefCntPtr<Xapian::Document::Internal> doc;
00543         Xapian::docid did = pl->get_docid();
00544         DEBUGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
00545         Xapian::Internal::MSetItem new_item(wt, did);
00546         if (sort_by != REL) {
00547             const unsigned int multiplier = db.internal.size();
00548             Assert(multiplier != 0);
00549             Xapian::doccount n = (new_item.did - 1) % multiplier; // which actual database
00550             Xapian::docid m = (new_item.did - 1) / multiplier + 1; // real docid in that database
00551             doc = db.internal[n]->open_document(m, true);
00552             if (sorter) {
00553                 new_item.sort_key = (*sorter)(Xapian::Document(doc.get()));
00554             } else {
00555                 new_item.sort_key = doc->get_value(sort_key);
00556             }
00557 
00558             // We're sorting by value (in part at least), so compare the item
00559             // against the lowest currently in the proto-mset.  If sort_by is
00560             // VAL, then new_item.wt won't yet be set, but that doesn't
00561             // matter since it's not used by the sort function.
00562             if (!mcmp(new_item, min_item)) {
00563                 if (matchspy == NULL && mdecider == NULL &&
00564                     collapse_key == Xapian::BAD_VALUENO) {
00565                     // Document was definitely suitable for mset - no more
00566                     // processing needed.
00567                     DEBUGLINE(MATCH, "Making note of match item which sorts lower than min_item");
00568                     ++docs_matched;
00569                     if (!calculated_weight) wt = pl->get_weight();
00570                     if (wt > greatest_wt) goto new_greatest_weight;
00571                     continue;
00572                 }
00573                 if (docs_matched >= check_at_least) {
00574                     // We've seen enough items - we can drop this one.
00575                     DEBUGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
00576                     continue;
00577                 }
00578                 // We can't drop the item, because we need to show it
00579                 // to the matchspy, test whether the mdecider would
00580                 // accept it, and/or test whether it would be collapsed.
00581                 DEBUGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
00582             }
00583         }
00584 
00585         // Use the match spy and/or decision functors (if specified).
00586         if (matchspy != NULL || mdecider != NULL) {
00587             const unsigned int multiplier = db.internal.size();
00588             Assert(multiplier != 0);
00589             Xapian::doccount n = (did - 1) % multiplier; // which actual database
00590             // If the results are from a remote database, then the functor will
00591             // already have been applied there so we can skip this step.
00592             if (!is_remote[n]) {
00593                 if (doc.get() == 0) {
00594                     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
00595                     doc = db.internal[n]->open_document(m, true);
00596                     Assert(doc.get());
00597                 }
00598                 Xapian::Document mydoc(doc.get());
00599 
00600                 ++decider_considered;
00601                 if (matchspy && !matchspy->operator()(mydoc)) {
00602                     ++decider_denied;
00603                     continue;
00604                 }
00605                 if (mdecider && !mdecider->operator()(mydoc)) {
00606                     ++decider_denied;
00607                     continue;
00608                 }
00609             }
00610         }
00611 
00612         if (!calculated_weight) {
00613             // we didn't calculate the weight above, but now we will need it
00614             wt = pl->get_weight();
00615             new_item.wt = wt;
00616         }
00617 
00618         pushback = true;
00619         documents_considered++;
00620 
00621         // Perform collapsing on key if requested.
00622         if (collapse_key != Xapian::BAD_VALUENO) {
00623             new_item.collapse_key = get_collapse_key(pl, did, collapse_key,
00624                                                      doc);
00625 
00626             // Don't collapse on null key
00627             if (new_item.collapse_key.empty()) {
00628                 ++null_collapse_values;
00629             } else {
00630                 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator oldkey;
00631                 oldkey = collapse_tab.find(new_item.collapse_key);
00632                 if (oldkey == collapse_tab.end()) {
00633                     DEBUGLINE(MATCH, "collapsem: new key: " <<
00634                               new_item.collapse_key);
00635                     // Key not been seen before
00636                     collapse_tab.insert(make_pair(new_item.collapse_key,
00637                                         make_pair(new_item, Xapian::weight(0))));
00638                 } else {
00639                     ++duplicates_found;
00640                     Xapian::Internal::MSetItem &old_item = oldkey->second.first;
00641                     // FIXME: what about the (sort_by != REL) case here?
00642                     if (mcmp(old_item, new_item)) {
00643                         DEBUGLINE(MATCH, "collapsem: better exists: " <<
00644                                   new_item.collapse_key);
00645                         // There's already a better match with this key
00646                         ++old_item.collapse_count;
00647                         // But maybe the weight is worth noting
00648                         if (new_item.wt > oldkey->second.second) {
00649                             oldkey->second.second = new_item.wt;
00650                         }
00651                         // If we're sorting by relevance primarily, then we
00652                         // throw away the lower weighted document anyway.
00653                         if (sort_by != REL && sort_by != REL_VAL) {
00654                             if (wt > greatest_wt) goto new_greatest_weight;
00655                         }
00656                         continue;
00657                     }
00658                     // Make a note of the updated collapse count in the
00659                     // replacement item
00660                     new_item.collapse_count = old_item.collapse_count + 1;
00661 
00662                     // There was a previous item in the collapse tab so
00663                     // the MSet can't be empty.
00664                     Assert(!items.empty());
00665 
00666                     // This is best potential MSet entry with this key which
00667                     // we've seen so far.  Check if the previous best entry
00668                     // with this key might still be in the proto-MSet.  If it
00669                     // might be, we need to check through for it.
00670                     Xapian::weight old_wt = old_item.wt;
00671                     if (old_wt >= min_weight && mcmp(old_item, min_item)) {
00672                         // Scan through (unsorted) MSet looking for entry.
00673                         // FIXME: more efficient way than just scanning?
00674                         Xapian::docid olddid = old_item.did;
00675                         vector<Xapian::Internal::MSetItem>::iterator i;
00676                         for (i = items.begin(); i != items.end(); ++i) {
00677                             if (i->did == olddid) {
00678                                 DEBUGLINE(MATCH, "collapsem: removing " <<
00679                                           olddid << ": " <<
00680                                           new_item.collapse_key);
00681                                 // We can replace an arbitrary element in
00682                                 // O(log N) but have to do it by hand (in this
00683                                 // case the new elt is bigger, so we just swap
00684                                 // down the tree).
00685                                 // FIXME: implement this, and clean up is_heap
00686                                 // handling
00687                                 *i = new_item;
00688                                 pushback = false;
00689                                 is_heap = false;
00690                                 break;
00691                             }
00692                         }
00693                     }
00694 
00695                     // Keep the old weight as it is now second best so far
00696                     oldkey->second = make_pair(new_item, old_wt);
00697                 }
00698             }
00699         }
00700 
00701         // OK, actually add the item to the mset.
00702         if (pushback) {
00703             ++docs_matched;
00704             if (items.size() >= max_msize) {
00705                 items.push_back(new_item);
00706                 if (!is_heap) {
00707                     is_heap = true;
00708                     make_heap(items.begin(), items.end(), mcmp);
00709                 } else {
00710                     push_heap<vector<Xapian::Internal::MSetItem>::iterator,
00711                               MSetCmp>(items.begin(), items.end(), mcmp);
00712                 }
00713                 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00714                          MSetCmp>(items.begin(), items.end(), mcmp);
00715                 items.pop_back();
00716 
00717                 min_item = items.front();
00718                 if (sort_by == REL || sort_by == REL_VAL) {
00719                     if (docs_matched >= check_at_least) {
00720                         if (sort_by == REL) {
00721                             // We're done if this is a forward boolean match
00722                             // with only one database (bodgetastic, FIXME
00723                             // better if we can!)
00724                             if (rare(max_possible == 0 && sort_forward)) {
00725                                 // In the multi database case, MergePostList
00726                                 // currently processes each database
00727                                 // sequentially (which actually may well be
00728                                 // more efficient) so the docids in general
00729                                 // won't arrive in order.
00730                                 if (leaves.size() == 1) break;
00731                             }
00732                         }
00733                         if (min_item.wt > min_weight) {
00734                             DEBUGLINE(MATCH, "Setting min_weight to " <<
00735                                       min_item.wt << " from " << min_weight);
00736                             min_weight = min_item.wt;
00737                         }
00738                     }
00739                 }
00740                 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00741                     DEBUGLINE(MATCH, "*** TERMINATING EARLY (3)");
00742                     break;
00743                 }
00744             } else {
00745                 items.push_back(new_item);
00746                 is_heap = false;
00747                 if (sort_by == REL && items.size() == max_msize) {
00748                     if (docs_matched >= check_at_least) {
00749                         // We're done if this is a forward boolean match
00750                         // with only one database (bodgetastic, FIXME
00751                         // better if we can!)
00752                         if (rare(max_possible == 0 && sort_forward)) {
00753                             // In the multi database case, MergePostList
00754                             // currently processes each database
00755                             // sequentially (which actually may well be
00756                             // more efficient) so the docids in general
00757                             // won't arrive in order.
00758                             if (leaves.size() == 1) break;
00759                         }
00760                     }
00761                 }
00762             }
00763         }
00764 
00765         // Keep a track of the greatest weight we've seen.
00766         if (wt > greatest_wt) {
00767 new_greatest_weight:
00768             greatest_wt = wt;
00769 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00770             const unsigned int multiplier = db.internal.size();
00771             unsigned int db_num = (did - 1) % multiplier;
00772             if (is_remote[db_num]) {
00773                 // Note that the greatest weighted document came from a remote
00774                 // database, and which one.
00775                 greatest_wt_subqs_db_num = db_num;
00776             } else {
00777                 greatest_wt_subqs_db_num = UINT_MAX;
00778             }
00779 #endif
00780             if (percent_cutoff) {
00781                 Xapian::weight w = wt * percent_cutoff_factor;
00782                 if (w > min_weight) {
00783                     min_weight = w;
00784                     if (!is_heap) {
00785                         is_heap = true;
00786                         make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00787                                   MSetCmp>(items.begin(), items.end(), mcmp);
00788                     }
00789                     while (!items.empty() && items.front().wt < min_weight) {
00790                         pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00791                                  MSetCmp>(items.begin(), items.end(), mcmp);
00792                         Assert(items.back().wt < min_weight);
00793                         items.pop_back();
00794                     }
00795 #ifdef XAPIAN_DEBUG_PARANOID
00796                     vector<Xapian::Internal::MSetItem>::const_iterator i;
00797                     for (i = items.begin(); i != items.end(); ++i) {
00798                         Assert(i->wt >= min_weight);
00799                     }
00800 #endif
00801                 }
00802             }
00803         }
00804     }
00805 
00806     // done with posting list tree
00807     delete pl;
00808 
00809     double percent_scale = 0;
00810     if (!items.empty() && greatest_wt > 0) {
00811         // Find the document with the highest weight, then total up the
00812         // weights for the terms it contains
00813         vector<Xapian::Internal::MSetItem>::const_iterator best;
00814         best = min_element(items.begin(), items.end(), mcmp);
00815 
00816 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00817         if (greatest_wt_subqs_db_num != UINT_MAX) {
00818             const unsigned int n = greatest_wt_subqs_db_num;
00819             RemoteSubMatch * rem_match;
00820             rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
00821             percent_scale = rem_match->get_percent_factor() / 100.0;
00822         } else
00823 #endif
00824         if (termfreqandwts.size() > 1) {
00825             Xapian::termcount matching_terms = 0;
00826             map<string,
00827                 Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator i;
00828 
00829             // Special case for MatchAll queries.
00830             i = termfreqandwts.find(string());
00831             if (i != termfreqandwts.end()) {
00832                 percent_scale += i->second.termweight;
00833                 ++matching_terms;
00834             }
00835 
00836             Xapian::TermIterator docterms = db.termlist_begin(best->did);
00837             Xapian::TermIterator docterms_end = db.termlist_end(best->did);
00838             while (docterms != docterms_end) {
00839                 i = termfreqandwts.find(*docterms);
00840                 if (i != termfreqandwts.end()) {
00841                     percent_scale += i->second.termweight;
00842                     ++matching_terms;
00843                     if (matching_terms == termfreqandwts.size()) break;
00844                 }
00845                 ++docterms;
00846             }
00847 
00848             if (matching_terms < termfreqandwts.size()) {
00849                 // OK, work out weight corresponding to 100%
00850                 double denom = 0;
00851                 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
00852                     denom += i->second.termweight;
00853 
00854                 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
00855                 Assert(percent_scale <= denom);
00856                 denom *= greatest_wt;
00857                 Assert(denom > 0);
00858                 percent_scale /= denom;
00859             } else {
00860                 // If all the terms match, the 2 sums of weights cancel
00861                 percent_scale = 1.0 / greatest_wt;
00862             }
00863         } else {
00864             // If there's only a single term in the query, the top document
00865             // must score 100%.
00866             percent_scale = 1.0 / greatest_wt;
00867         }
00868         Assert(percent_scale > 0);
00869         if (percent_cutoff) {
00870             // FIXME: better to sort and binary chop maybe?
00871             // Or we could just use a linear scan here instead.
00872 
00873             // trim the mset to the correct answer...
00874             Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
00875             if (!is_heap) {
00876                 is_heap = true;
00877                 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00878                           MSetCmp>(items.begin(), items.end(), mcmp);
00879             }
00880             while (!items.empty() && items.front().wt < min_wt) {
00881                 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00882                          MSetCmp>(items.begin(), items.end(), mcmp);
00883                 Assert(items.back().wt < min_wt);
00884                 items.pop_back();
00885             }
00886 #ifdef XAPIAN_DEBUG_PARANOID
00887             vector<Xapian::Internal::MSetItem>::const_iterator j;
00888             for (j = items.begin(); j != items.end(); ++j) {
00889                 Assert(j->wt >= min_wt);
00890             }
00891 #endif
00892         }
00893         percent_scale *= 100.0;
00894     }
00895 
00896     DEBUGLINE(MATCH,
00897               "docs_matched = " << docs_matched <<
00898               ", definite_matches_not_seen = " << definite_matches_not_seen <<
00899               ", matches_lower_bound = " << matches_lower_bound <<
00900               ", matches_estimated = " << matches_estimated <<
00901               ", matches_upper_bound = " << matches_upper_bound);
00902 
00903     // Adjust docs_matched to take account of documents which matched remotely
00904     // but weren't sent across.
00905     docs_matched += definite_matches_not_seen;
00906 
00907     if (items.size() < max_msize) {
00908         // We have fewer items in the mset than we tried to get for it, so we
00909         // must have all the matches in it.
00910         DEBUGLINE(MATCH, "items.size() = " << items.size() <<
00911                   ", max_msize = " << max_msize << ", setting bounds equal");
00912         Assert(definite_matches_not_seen == 0);
00913         Assert(percent_cutoff || docs_matched == items.size());
00914         matches_lower_bound = matches_upper_bound = matches_estimated
00915             = items.size();
00916     } else if (collapse_key == Xapian::BAD_VALUENO && docs_matched < check_at_least) {
00917         // We have seen fewer matches than we checked for, so we must have seen
00918         // all the matches.
00919         DEBUGLINE(MATCH, "Setting bounds equal");
00920         matches_lower_bound = matches_upper_bound = matches_estimated
00921             = docs_matched;
00922     } else {
00923         AssertRel(matches_estimated,>=,matches_lower_bound);
00924         AssertRel(matches_estimated,<=,matches_upper_bound);
00925 
00926         // We can end up scaling the estimate more than once, so collect
00927         // the scale factors and apply them in one go to avoid rounding
00928         // more than once.
00929         double estimate_scale = 1.0;
00930 
00931         if (collapse_key != Xapian::BAD_VALUENO) {
00932             // Lower bound must be set to no more than the number of null
00933             // collapse values seen plus the number of unique non-null collapse
00934             // values seen, since it's possible that all further hits will be
00935             // collapsed to values we've already seen.
00936             DEBUGLINE(MATCH, "Adjusting bounds due to collapse_key");
00937             matches_lower_bound = null_collapse_values + collapse_tab.size();
00938 
00939             // The estimate for the number of hits can be modified by
00940             // multiplying it by the rate at which we've been finding
00941             // unique documents.
00942             if (documents_considered > 0) {
00943                 double unique = double(documents_considered - duplicates_found);
00944                 double unique_rate = unique / double(documents_considered);
00945                 estimate_scale *= unique_rate;
00946             }
00947 
00948             // We can safely reduce the upper bound by the number of
00949             // duplicates we've seen.
00950             matches_upper_bound -= duplicates_found;
00951 
00952             DEBUGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
00953                       ", matches_estimated=" << matches_estimated <<
00954                       "*" << estimate_scale <<
00955                       ", matches_upper_bound=" << matches_upper_bound);
00956         }
00957 
00958         if (matchspy || mdecider) {
00959             if (collapse_key == Xapian::BAD_VALUENO && !percent_cutoff) {
00960                 // We're not collapsing or doing a percentage cutoff, so
00961                 // docs_matched is a lower bound on the total number of matches.
00962                 matches_lower_bound = max(docs_matched, matches_lower_bound);
00963             }
00964 
00965             // The estimate for the number of hits can be modified by
00966             // multiplying it by the rate at which the match decider has
00967             // been accepting documents.
00968             if (decider_considered > 0) {
00969                 double accept = double(decider_considered - decider_denied);
00970                 double accept_rate = accept / double(decider_considered);
00971                 estimate_scale *= accept_rate;
00972             }
00973 
00974             // If a document is denied by a match decider, it is not possible
00975             // for it to found to be a duplicate, so it is safe to also reduce
00976             // the upper bound by the number of documents denied by a match
00977             // decider.
00978             matches_upper_bound -= decider_denied;
00979         }
00980 
00981         if (percent_cutoff) {
00982             estimate_scale *= (1.0 - percent_cutoff_factor);
00983             // another approach:
00984             // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
00985             // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
00986 
00987             // Very likely an underestimate, but we can't really do better
00988             // without checking further matches...  Only possibility would be
00989             // to track how many docs made the min weight test but didn't make
00990             // the candidate set since the last greatest_wt change, which we
00991             // could use if the top documents matched all the prob terms.
00992             matches_lower_bound = items.size();
00993             // matches_upper_bound could be reduced by the number of documents
00994             // which fail the min weight test
00995             DEBUGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
00996                       percent_cutoff << "): now have matches_estimated=" <<
00997                       matches_estimated << ", matches_lower_bound=" <<
00998                       matches_lower_bound);
00999         }
01000 
01001         if (estimate_scale != 1.0) {
01002             matches_estimated =
01003                 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
01004             if (matches_estimated < matches_lower_bound)
01005                 matches_estimated = matches_lower_bound;
01006         }
01007 
01008         if (collapse_key != Xapian::BAD_VALUENO || matchspy || mdecider) {
01009             DEBUGLINE(MATCH, "Clamping estimate between bounds: "
01010                       "matches_lower_bound = " << matches_lower_bound <<
01011                       ", matches_estimated = " << matches_estimated <<
01012                       ", matches_upper_bound = " << matches_upper_bound);
01013             AssertRel(matches_lower_bound,<=,matches_upper_bound);
01014             matches_estimated = max(matches_estimated, matches_lower_bound);
01015             matches_estimated = min(matches_estimated, matches_upper_bound);
01016         } else if (!percent_cutoff) {
01017             AssertRel(docs_matched,<=,matches_upper_bound);
01018             if (docs_matched > matches_lower_bound)
01019                 matches_lower_bound = docs_matched;
01020             if (docs_matched > matches_estimated)
01021                 matches_estimated = docs_matched;
01022         }
01023     }
01024 
01025     DEBUGLINE(MATCH, items.size() << " items in potential mset");
01026 
01027     if (first > 0) {
01028         // Remove unwanted leading entries
01029         if (items.size() <= first) {
01030             items.clear();
01031         } else {
01032             DEBUGLINE(MATCH, "finding " << first << "th");
01033             // We perform nth_element() on reverse iterators so that the
01034             // unwanted elements end up at the end of items, which means
01035             // that the call to erase() to remove them doesn't have to copy
01036             // any elements.
01037             vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
01038             nth = items.rbegin() + first;
01039             nth_element(items.rbegin(), nth, items.rend(), mcmp);
01040             // Erase the trailing ``first'' elements
01041             items.erase(items.begin() + items.size() - first, items.end());
01042         }
01043     }
01044 
01045     DEBUGLINE(MATCH, "sorting " << items.size() << " entries");
01046 
01047     // Need a stable sort, but this is provided by comparison operator
01048     sort(items.begin(), items.end(), mcmp);
01049 
01050     if (!items.empty()) {
01051         DEBUGLINE(MATCH, "min weight in mset = " << items.back().wt);
01052         DEBUGLINE(MATCH, "max weight in mset = " << items[0].wt);
01053     }
01054 
01055     AssertRel(matches_estimated,>=,matches_lower_bound);
01056     AssertRel(matches_estimated,<=,matches_upper_bound);
01057 
01058     // We may need to qualify any collapse_count to see if the highest weight
01059     // collapsed item would have qualified percent_cutoff
01060     // We WILL need to restore collapse_count to the mset by taking from
01061     // collapse_tab; this is what comes of copying around whole objects
01062     // instead of taking references, we find it hard to update collapse_count
01063     // of an item that has already been pushed-back as we don't know where it
01064     // is any more.  If we keep or find references we won't need to mess with
01065     // is_heap so much maybe?
01066     if (collapse_key != Xapian::BAD_VALUENO && /*percent_cutoff &&*/ !items.empty() &&
01067         !collapse_tab.empty()) {
01068         // Nicked this formula from above, but for some reason percent_scale
01069         // has since been multiplied by 100 so we take that into account
01070         Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
01071         vector<Xapian::Internal::MSetItem>::iterator i;
01072         for (i = items.begin(); i != items.end() && !collapse_tab.empty(); ++i) {
01073             // Is this a collapsed hit?
01074             if (/*i->collapse_count > 0 &&*/ !i->collapse_key.empty()) {
01075                 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator key;
01076                 key = collapse_tab.find(i->collapse_key);
01077                 // Because we collapse, each collapse key can only occur once
01078                 // in the items, we remove from collapse_tab here as processed
01079                 // so we can quit early.  Therefore each time we find an item
01080                 // with a collapse_key the collapse_key must be in collapse_tab
01081                 Assert(key != collapse_tab.end());
01082                 // If weight of top collapsed item is not relevant enough
01083                 // then collapse count is bogus in every way
01084                 // FIXME: Should this be <=?
01085                 if (key->second.second < min_wt)
01086                     i->collapse_count = 0;
01087                 else
01088                     i->collapse_count = key->second.first.collapse_count;
01089                 // When collapse_tab is finally empty we can finish this process
01090                 // without examining any further hits
01091                 collapse_tab.erase(key);
01092             }
01093         }
01094     }
01095 
01096     mset = Xapian::MSet(new Xapian::MSet::Internal(
01097                                        first,
01098                                        matches_upper_bound,
01099                                        matches_lower_bound,
01100                                        matches_estimated,
01101                                        max_possible, greatest_wt, items,
01102                                        termfreqandwts,
01103                                        percent_scale));
01104 }
01105 
01106 // This method is called by branch postlists when they rebalance
01107 // in order to recalculate the weights in the tree
01108 void
01109 MultiMatch::recalc_maxweight()
01110 {
01111     DEBUGCALL(MATCH, void, "MultiMatch::recalc_maxweight", "");
01112     recalculate_w_max = true;
01113 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.