matcher/queryoptimiser.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007,2008 Olly Betts
00005  * Copyright (C) 2008 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00020  */
00021 
00022 #include <config.h>
00023 
00024 #include "queryoptimiser.h"
00025 
00026 #include "andmaybepostlist.h"
00027 #include "andnotpostlist.h"
00028 #include "emptypostlist.h"
00029 #include "exactphrasepostlist.h"
00030 #include "multiandpostlist.h"
00031 #include "multimatch.h"
00032 #include "omassert.h"
00033 #include "omdebug.h"
00034 #include "omqueryinternal.h"
00035 #include "orpostlist.h"
00036 #include "phrasepostlist.h"
00037 #include "postlist.h"
00038 #include "valuegepostlist.h"
00039 #include "valuerangepostlist.h"
00040 #include "xorpostlist.h"
00041 
00042 #include <algorithm>
00043 #include <list>
00044 #include <string>
00045 #include <vector>
00046 
00047 using namespace std;
00048 
00049 PostList *
00050 QueryOptimiser::do_subquery(const Xapian::Query::Internal * query, double factor)
00051 {
00052     DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_subquery",
00053               query << ", " << factor);
00054 
00055     // Handle QueryMatchNothing.
00056     if (!query) RETURN(new EmptyPostList());
00057 
00058     switch (query->op) {
00059         case Xapian::Query::Internal::OP_LEAF:
00060             RETURN(do_leaf(query, factor));
00061 
00062         case Xapian::Query::OP_AND:
00063         case Xapian::Query::OP_FILTER:
00064         case Xapian::Query::OP_NEAR:
00065         case Xapian::Query::OP_PHRASE:
00066             RETURN(do_and_like(query, factor));
00067 
00068         case Xapian::Query::OP_OR:
00069         case Xapian::Query::OP_XOR:
00070         case Xapian::Query::OP_ELITE_SET:
00071             RETURN(do_or_like(query, factor));
00072 
00073         case Xapian::Query::OP_AND_NOT: {
00074             AssertEq(query->subqs.size(), 2);
00075             PostList * l = do_subquery(query->subqs[0], factor);
00076             PostList * r = do_subquery(query->subqs[1], 0.0);
00077             RETURN(new AndNotPostList(l, r, matcher, db_size));
00078         }
00079 
00080         case Xapian::Query::OP_AND_MAYBE: {
00081             AssertEq(query->subqs.size(), 2);
00082             PostList * l = do_subquery(query->subqs[0], factor);
00083             PostList * r = do_subquery(query->subqs[1], factor);
00084             RETURN(new AndMaybePostList(l, r, matcher, db_size));
00085         }
00086 
00087         case Xapian::Query::OP_VALUE_RANGE: {
00088             Xapian::valueno valno(query->parameter);
00089             const string & range_begin = query->tname;
00090             const string & range_end = query->str_parameter;
00091             RETURN(new ValueRangePostList(&db, valno, range_begin, range_end));
00092         }
00093 
00094         case Xapian::Query::OP_VALUE_GE: {
00095             Xapian::valueno valno(query->parameter);
00096             const string & range_begin = query->tname;
00097             RETURN(new ValueGePostList(&db, valno, range_begin));
00098         }
00099 
00100         case Xapian::Query::OP_VALUE_LE: {
00101             Xapian::valueno valno(query->parameter);
00102             const string & range_end = query->tname;
00103             RETURN(new ValueRangePostList(&db, valno, "", range_end));
00104         }
00105 
00106         case Xapian::Query::OP_SCALE_WEIGHT: {
00107             AssertEq(query->subqs.size(), 1);
00108             double sub_factor = factor;
00109             if (sub_factor != 0.0) sub_factor *= query->get_dbl_parameter();
00110             RETURN(do_subquery(query->subqs[0], sub_factor));
00111         }
00112 
00113         default:
00114             Assert(false);
00115             RETURN(NULL);
00116     }
00117 }
00118 
00119 struct PosFilter {
00120     PosFilter(Xapian::Query::Internal::op_t op_, size_t begin_, size_t end_,
00121               Xapian::termcount window_)
00122         : op(op_), begin(begin_), end(end_), window(window_) { }
00123 
00124     Xapian::Query::Internal::op_t op;
00125 
00127     size_t begin, end;
00128 
00129     Xapian::termcount window;
00130 };
00131 
00132 PostList *
00133 QueryOptimiser::do_and_like(const Xapian::Query::Internal *query, double factor)
00134 {
00135     DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_and_like",
00136               query << ", " << factor);
00137 
00138     list<PosFilter> pos_filters;
00139     vector<PostList *> plists;
00140     do_and_like(query, factor, plists, pos_filters);
00141     AssertRel(plists.size(), >=, 2);
00142 
00143     PostList * pl;
00144     pl = new MultiAndPostList(plists.begin(), plists.end(), matcher, db_size);
00145 
00146     // Sort the positional filters to try to apply them in an efficient order.
00147     // FIXME: We need to figure out what that is!  Try applying lowest cf/tf
00148     // first?
00149 
00150     // Apply any positional filters.
00151     list<PosFilter>::iterator i;
00152     for (i = pos_filters.begin(); i != pos_filters.end(); ++i) {
00153         const PosFilter & filter = *i;
00154 
00155         // FIXME: make NearPostList, etc ctors take a pair of itors so we don't
00156         // need to create this temporary vector.
00157         vector<PostList *> terms(plists.begin() + filter.begin,
00158                                  plists.begin() + filter.end);
00159 
00160         Xapian::termcount window = filter.window;
00161         if (filter.op == Xapian::Query::OP_NEAR) {
00162             pl = new NearPostList(pl, window, terms);
00163         } else if (window == filter.end - filter.begin) {
00164             AssertEq(filter.op, Xapian::Query::OP_PHRASE);
00165             pl = new ExactPhrasePostList(pl, terms);
00166         } else {
00167             AssertEq(filter.op, Xapian::Query::OP_PHRASE);
00168             pl = new PhrasePostList(pl, window, terms);
00169         }
00170     }
00171 
00172     RETURN(pl);
00173 }
00174 
00175 inline bool is_and_like(Xapian::Query::Internal::op_t op) {
00176     return op == Xapian::Query::OP_AND || op == Xapian::Query::OP_FILTER ||
00177            op == Xapian::Query::OP_NEAR || op == Xapian::Query::OP_PHRASE;
00178 }
00179 
00180 void
00181 QueryOptimiser::do_and_like(const Xapian::Query::Internal *query, double factor,
00182                             vector<PostList *> & and_plists,
00183                             list<PosFilter> & pos_filters)
00184 {
00185     DEBUGCALL(MATCH, void, "QueryOptimiser::do_and_like",
00186               query << ", " << factor << ", [and_plists], [pos_filters]");
00187 
00188     Xapian::Query::Internal::op_t op = query->op;
00189     Assert(is_and_like(op));
00190 
00191     bool positional = false;
00192     if (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR) {
00193         // If this sub-database has no positional information, change
00194         // OP_PHRASE/OP_NEAR into OP_AND so that we actually return some
00195         // matches.
00196         if (!db.has_positions()) {
00197             op = Xapian::Query::OP_AND;
00198         } else {
00199             positional = true;
00200         }
00201     }
00202 
00203     const Xapian::Query::Internal::subquery_list &queries = query->subqs;
00204     AssertRel(queries.size(), >=, 2);
00205 
00206     for (size_t i = 0; i != queries.size(); ++i) {
00207         // The second branch of OP_FILTER is always boolean.
00208         if (i == 1 && op == Xapian::Query::OP_FILTER) factor = 0.0;
00209 
00210         const Xapian::Query::Internal * subq = queries[i];
00211         if (is_and_like(subq->op)) {
00212             do_and_like(subq, factor, and_plists, pos_filters);
00213         } else {
00214             PostList * pl = do_subquery(subq, factor);
00215             and_plists.push_back(pl);
00216         }
00217     }
00218 
00219     if (positional) {
00220         // Record the positional filter to apply higher up the tree.
00221         size_t end = and_plists.size();
00222         size_t begin = end - queries.size();
00223         Xapian::termcount window = query->parameter;
00224 
00225         pos_filters.push_back(PosFilter(op, begin, end, window));
00226     }
00227 }
00228 
00234 struct CmpMaxOrTerms {
00243     bool operator()(const PostList *a, const PostList *b) {
00244         if (a->get_termfreq_max() == 0) return false;
00245         if (b->get_termfreq_max() == 0) return true;
00246 
00247 #if defined(__i386__) || defined(__mc68000__)
00248         // On some architectures, most common of which is x86, floating point
00249         // values are calculated and stored in registers with excess precision.
00250         // If the two get_maxweight() calls below return identical values in a
00251         // register, the excess precision may be dropped for one of them but
00252         // not the other (e.g. because the compiler saves the first calculated
00253         // weight to memory while calculating the second, then reloads it to
00254         // compare).  This leads to both a > b and b > a being true, which
00255         // violates the antisymmetry property of the strict weak ordering
00256         // required by nth_element().  This can have serious consequences (e.g.
00257         // segfaults).
00258         //
00259         // To avoid this, we store each result in a volatile double prior to
00260         // comparing them.  This means that the result of this test should
00261         // match that on other architectures with the same double format (which
00262         // is desirable), and actually has less overhead than rounding both
00263         // results to float (which is another approach which works).
00264         volatile double a_max_wt = a->get_maxweight();
00265         volatile double b_max_wt = b->get_maxweight();
00266         return a_max_wt > b_max_wt;
00267 #else
00268         return a->get_maxweight() > b->get_maxweight();
00269 #endif
00270     }
00271 };
00272 
00273 template<class CLASS> struct delete_ptr {
00274     void operator()(CLASS *p) { delete p; }
00275 };
00276 
00278 struct ComparePostListTermFreqAscending {
00280     bool operator()(const PostList *a, const PostList *b) {
00281         return a->get_termfreq_est() > b->get_termfreq_est();
00282     }
00283 };
00284 
00285 PostList *
00286 QueryOptimiser::do_or_like(const Xapian::Query::Internal *query, double factor)
00287 {
00288     DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_or_like",
00289               query << ", " << factor);
00290 
00291     // FIXME: we could optimise by merging OP_ELITE_SET and OP_OR like we do
00292     // for AND-like operations.
00293     Xapian::Query::Internal::op_t op = query->op;
00294     Assert(op == Xapian::Query::OP_ELITE_SET || op == Xapian::Query::OP_OR ||
00295            op == Xapian::Query::OP_XOR);
00296 
00297     const Xapian::Query::Internal::subquery_list &queries = query->subqs;
00298     AssertRel(queries.size(), >=, 2);
00299 
00300     vector<PostList *> postlists;
00301     postlists.reserve(queries.size());
00302 
00303     Xapian::Query::Internal::subquery_list::const_iterator q;
00304     for (q = queries.begin(); q != queries.end(); ++q) {
00305         postlists.push_back(do_subquery(*q, factor));
00306     }
00307 
00308     if (op == Xapian::Query::OP_ELITE_SET) {
00309         // Select the best elite_set_size terms.
00310         Xapian::termcount elite_set_size = query->parameter;
00311         Assert(elite_set_size > 0);
00312 
00313         if (postlists.size() > elite_set_size) {
00314             // Call recalc_maxweight() as otherwise get_maxweight()
00315             // may not be valid before next() or skip_to()
00316             for_each(postlists.begin(), postlists.end(),
00317                      mem_fun(&PostList::recalc_maxweight));
00318 
00319             nth_element(postlists.begin(),
00320                         postlists.begin() + elite_set_size - 1,
00321                         postlists.end(), CmpMaxOrTerms());
00322 
00323             for_each(postlists.begin() + elite_set_size, postlists.end(),
00324                      delete_ptr<PostList>());
00325 
00326             if (elite_set_size == 1) RETURN(postlists[0]);
00327 
00328             postlists.resize(elite_set_size);
00329         }
00330     }
00331 
00332     // Make postlists into a heap so that the postlist with the greatest term
00333     // frequency is at the top of the heap.
00334     make_heap(postlists.begin(), postlists.end(),
00335               ComparePostListTermFreqAscending());
00336 
00337     // Now build a tree of binary OrPostList or XorPostList objects.  The
00338     // algorithm used to build the tree is like that used to build an
00339     // optimal Huffman coding tree.  If we called next() repeatedly, this
00340     // arrangement would minimise the number of method calls.  Generally we
00341     // don't actually do that, but this arrangement is still likely to be a
00342     // good one, and it does minimise the work in the worst case.
00343     AssertRel(postlists.size(), >=, 2);
00344     while (true) {
00345         // We build the tree such that at each branch:
00346         //
00347         //   l.get_termfreq_est() >= r.get_termfreq_est()
00348         //
00349         // We do this so that the OrPostList and XorPostList classes can be
00350         // optimised assuming that this is the case.
00351         PostList * r = postlists.front();
00352         pop_heap(postlists.begin(), postlists.end(),
00353                  ComparePostListTermFreqAscending());
00354         postlists.pop_back();
00355         PostList * l = postlists.front();
00356 
00357         PostList * pl;
00358         if (op == Xapian::Query::OP_XOR) {
00359             pl = new XorPostList(l, r, matcher, db_size);
00360         } else {
00361             pl = new OrPostList(l, r, matcher, db_size);
00362         }
00363 
00364         if (postlists.size() == 1) RETURN(pl);
00365 
00366         pop_heap(postlists.begin(), postlists.end(),
00367                  ComparePostListTermFreqAscending());
00368         postlists.back() = pl;
00369         push_heap(postlists.begin(), postlists.end(),
00370                   ComparePostListTermFreqAscending());
00371     }
00372 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.