xapian-core  1.4.26
exactphrasepostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "exactphrasepostlist.h"
24 
25 #include "debuglog.h"
26 #include "backends/positionlist.h"
27 #include "omassert.h"
28 
29 #include <algorithm>
30 #include <vector>
31 
32 using namespace std;
33 
35  const vector<PostList*>::const_iterator &terms_begin,
36  const vector<PostList*>::const_iterator &terms_end)
37  : SelectPostList(source_), terms(terms_begin, terms_end)
38 {
39  size_t n = terms.size();
40  Assert(n > 1);
41  poslists = new PositionList*[n];
42  try {
43  order = new unsigned[n];
44  } catch (...) {
45  delete [] poslists;
46  throw;
47  }
48  for (size_t i = 0; i < n; ++i) order[i] = unsigned(i);
49 }
50 
52 {
53  delete [] poslists;
54  delete [] order;
55 }
56 
57 void
59 {
60  poslists[i] = terms[order[i]]->read_position_list();
61 }
62 
63 class TermCompare {
64  vector<PostList *> & terms;
65 
66  public:
67  explicit TermCompare(vector<PostList *> & terms_) : terms(terms_) { }
68 
69  bool operator()(unsigned a, unsigned b) const {
70  return terms[a]->get_wdf() < terms[b]->get_wdf();
71  }
72 };
73 
74 bool
76 {
77  LOGCALL(MATCH, bool, "ExactPhrasePostList::test_doc", NO_ARGS);
78 
79  // We often don't need to read all the position lists, so rather than using
80  // the shortest position lists first, we approximate by using the terms
81  // with the lowest wdf first. This will typically give the same or a very
82  // similar order.
83  sort(order, order + terms.size(), TermCompare(terms));
84 
85  // If the first term we check only occurs too close to the start of the
86  // document, we only need to read one term's positions. E.g. search for
87  // "ripe mango" when the only occurrence of 'mango' in the current document
88  // is at position 0.
90  if (!poslists[0]->skip_to(order[0]))
91  RETURN(false);
92 
93  // If we get here, we'll need to read the positionlists for at least two
94  // terms, so check the true positionlist length for the two terms with the
95  // lowest wdf and if necessary swap them so the true shorter one is first.
97  if (poslists[0]->get_approx_size() > poslists[1]->get_approx_size()) {
98  if (!poslists[1]->skip_to(order[1]))
99  RETURN(false);
100  swap(poslists[0], poslists[1]);
101  swap(order[0], order[1]);
102  }
103 
104  unsigned read_hwm = 1;
105  Xapian::termpos idx0 = order[0];
106  Xapian::termpos base = poslists[0]->get_position() - idx0;
107  unsigned i = 1;
108  while (true) {
109  if (i > read_hwm) {
110  read_hwm = i;
112  // FIXME: consider comparing with poslist[0] and swapping
113  // if less common. Should we allow for the number of positions
114  // we've read from poslist[0] already?
115  }
116  Xapian::termpos idx = order[i];
117  Xapian::termpos required = base + idx;
118  if (!poslists[i]->skip_to(required))
119  RETURN(false);
121  if (got == required) {
122  if (++i == terms.size()) RETURN(true);
123  continue;
124  }
125  if (!poslists[0]->skip_to(got - idx + idx0))
126  RETURN(false);
127  base = poslists[0]->get_position() - idx0;
128  i = 1;
129  }
130 }
131 
134 {
135  // Calculate an estimate for the wdf of an exact phrase postlist.
136  //
137  // We use the minimum wdf of a sub-postlist as our estimate. See the
138  // comment in NearPostList::get_wdf() for justification of this estimate.
139  vector<PostList *>::const_iterator i = terms.begin();
140  Xapian::termcount wdf = (*i)->get_wdf();
141  while (++i != terms.end()) {
142  wdf = min(wdf, (*i)->get_wdf());
143  }
144  return wdf;
145 }
146 
149 {
150  // It's hard to estimate how many times the exact phrase will occur as
151  // it depends a lot on the phrase, but usually the exact phrase will
152  // occur significantly less often than the individual terms.
153  //
154  // We divide by 4 here rather than by 2 as we do for NearPostList and
155  // PhrasePostList, as a very rough heuristic to represent the fact that the
156  // words must occur exactly in order, and phrases are therefore rarer than
157  // near matches and (non-exact) phrase matches.
158  return source->get_termfreq_est() / 4;
159 }
160 
161 TermFreqs
163  const Xapian::Weight::Internal & stats) const
164 {
165  LOGCALL(MATCH, TermFreqs, "ExactPhrasePostList::get_termfreq_est_using_stats", stats);
166  // No idea how to estimate this - do the same as get_termfreq_est() for
167  // now.
169  result.termfreq /= 4;
170  result.reltermfreq /= 4;
171  result.collfreq /= 4;
172  RETURN(result);
173 }
174 
175 string
177 {
178  return "(ExactPhrase " + source->get_description() + ")";
179 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
PostList * source
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Abstract base class for postlists.
Definition: postlist.h:37
Return docs containing terms forming a particular exact phrase.
TermCompare(vector< PostList *> &terms_)
void start_position_list(unsigned i)
Start reading from the i-th position list.
std::string get_description() const
Return a string description of this object.
Abstract base class for iterating term positions in a document.
STL namespace.
bool operator()(unsigned a, unsigned b) const
PositionList ** poslists
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Xapian::doccount termfreq
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
vector< PostList * > & terms
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
ExactPhrasePostList(PostList *source_, const std::vector< PostList *>::const_iterator &terms_begin, const std::vector< PostList *>::const_iterator &terms_end)
Class to hold statistics for a given collection.
bool test_doc()
Test if the current document contains the terms as an exact phrase.
virtual Xapian::termpos get_position() const =0
Return the current position.
A postlist parent class for classes which only return selected docs from a source postlist (e...
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
std::vector< PostList * > terms
The frequencies for a term.
virtual std::string get_description() const =0
Return a string description of this object.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Various assertion macros.
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:31
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487