xapian-core  2.0.0
exactphrasepostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "exactphrasepostlist.h"
24 
25 #include "debuglog.h"
26 #include "backends/positionlist.h"
27 #include "omassert.h"
28 
29 #include <algorithm>
30 #include <vector>
31 
32 using namespace std;
33 
35  EstimateOp* estimate_op_,
36  const vector<PostList*>::const_iterator &terms_begin,
37  const vector<PostList*>::const_iterator &terms_end,
38  PostListTree* pltree_)
39  : SelectPostList(source_, estimate_op_, pltree_),
40  terms(terms_begin, terms_end)
41 {
42  size_t n = terms.size();
43  Assert(n > 1);
44  poslists = new PositionList*[n];
45  try {
46  order = new unsigned[n];
47  } catch (...) {
48  delete [] poslists;
49  throw;
50  }
51  for (size_t i = 0; i < n; ++i) order[i] = unsigned(i);
52 
53  // It's hard to estimate how many times the exact phrase will occur as
54  // it depends a lot on the phrase, but usually the exact phrase will
55  // occur significantly less often than the individual terms.
56  //
57  // We divide by 4 here rather than by 2 as we do for NearPostList and
58  // PhrasePostList, as a very rough heuristic to represent the fact that the
59  // words must occur exactly in order, and phrases are therefore rarer than
60  // near matches and (non-exact) phrase matches.
61  termfreq = pl->get_termfreq() / 4;
62 }
63 
65 {
66  delete [] poslists;
67  delete [] order;
68 }
69 
70 void
72 {
73  poslists[i] = terms[order[i]]->read_position_list();
74 }
75 
76 class TermCompare {
77  vector<PostList *> & terms;
78 
79  public:
80  explicit TermCompare(vector<PostList *> & terms_) : terms(terms_) { }
81 
82  bool operator()(unsigned a, unsigned b) const {
83  return terms[a]->get_wdf() < terms[b]->get_wdf();
84  }
85 };
86 
87 bool
89 {
90  LOGCALL(MATCH, bool, "ExactPhrasePostList::test_doc", NO_ARGS);
91 
92  // We often don't need to read all the position lists, so rather than using
93  // the shortest position lists first, we approximate by using the terms
94  // with the lowest wdf first. This will typically give the same or a very
95  // similar order.
96  sort(order, order + terms.size(), TermCompare(terms));
97 
98  // If the first term we check only occurs too close to the start of the
99  // document, we only need to read one term's positions. E.g. search for
100  // "ripe mango" when the only occurrence of 'mango' in the current document
101  // is at position 0.
103  if (!poslists[0]->skip_to(order[0])) {
104  ++rejected;
105  RETURN(false);
106  }
107 
108  // If we get here, we'll need to read the positionlists for at least two
109  // terms, so check the true positionlist length for the two terms with the
110  // lowest wdf and if necessary swap them so the true shorter one is first.
112  if (poslists[0]->get_approx_size() > poslists[1]->get_approx_size()) {
113  if (!poslists[1]->skip_to(order[1])) {
114  ++rejected;
115  RETURN(false);
116  }
117  swap(poslists[0], poslists[1]);
118  swap(order[0], order[1]);
119  }
120 
121  unsigned read_hwm = 1;
122  Xapian::termpos idx0 = order[0];
123  Xapian::termpos base = poslists[0]->get_position() - idx0;
124  unsigned i = 1;
125  while (true) {
126  if (i > read_hwm) {
127  read_hwm = i;
129  // FIXME: consider comparing with poslist[0] and swapping
130  // if less common. Should we allow for the number of positions
131  // we've read from poslist[0] already?
132  }
133  Xapian::termpos idx = order[i];
134  Xapian::termpos required = base + idx;
135  if (!poslists[i]->skip_to(required))
136  break;
138  if (got == required) {
139  if (++i == terms.size()) {
140  ++accepted;
141  RETURN(true);
142  }
143  continue;
144  }
145  if (!poslists[0]->skip_to(got - idx + idx0))
146  break;
147  base = poslists[0]->get_position() - idx0;
148  i = 1;
149  }
150  ++rejected;
151  RETURN(false);
152 }
153 
156 {
157  // Calculate an estimate for the wdf of an exact phrase postlist.
158  //
159  // We use the minimum wdf of a sub-postlist as our estimate. See the
160  // comment in NearPostList::get_wdf() for justification of this estimate.
161  vector<PostList *>::const_iterator i = terms.begin();
162  Xapian::termcount wdf = (*i)->get_wdf();
163  while (++i != terms.end()) {
164  wdf = min(wdf, (*i)->get_wdf());
165  }
166  return wdf;
167 }
168 
169 string
171 {
172  return "(ExactPhrase " + pl->get_description() + ")";
173 }
Class for estimating the total number of matching documents.
Definition: estimateop.h:64
std::vector< PostList * > terms
PositionList ** poslists
bool test_doc()
Test if the current document contains the terms as an exact phrase.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
void start_position_list(unsigned i)
Start reading from the i-th position list.
std::string get_description() const
Return a string description of this object.
ExactPhrasePostList(PostList *source_, EstimateOp *estimate_op_, const std::vector< PostList * >::const_iterator &terms_begin, const std::vector< PostList * >::const_iterator &terms_end, PostListTree *pltree_)
Base class for classes which filter another PostList.
Xapian::doccount accepted
Number of times test_doc() returned true.
Xapian::doccount rejected
Number of times test_doc() returned false.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
TermCompare(vector< PostList * > &terms_)
bool operator()(unsigned a, unsigned b) const
vector< PostList * > & terms
Abstract base class for postlists.
Definition: postlist.h:40
Xapian::doccount get_termfreq() const
Get an estimate of the number of documents this PostList will return.
Definition: postlist.h:67
virtual std::string get_description() const =0
Return a string description of this object.
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
virtual Xapian::termpos get_position() const =0
Return the current position.
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
Return docs containing terms forming a particular exact phrase.
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for iterating term positions in a document.