xapian-core  2.0.0
nearpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017,2018 Olly Betts
5  * Copyright (C) 2007 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "nearpostlist.h"
25 
26 #include "debuglog.h"
27 #include "backends/positionlist.h"
28 #include "heap.h"
29 #include "omassert.h"
30 #include "str.h"
31 
32 #include <algorithm>
33 #include <vector>
34 
35 using namespace std;
36 
38  EstimateOp* estimate_op_,
39  Xapian::termpos window_,
40  const vector<PostList*>::const_iterator &terms_begin,
41  const vector<PostList*>::const_iterator &terms_end,
42  PostListTree* pltree_)
43  : SelectPostList(source_, estimate_op_, pltree_),
44  window(window_),
45  terms(terms_begin, terms_end)
46 {
47  size_t n = terms.size();
48  Assert(n > 1);
49  poslists = new PositionList*[n];
50 
51  // It's hard to estimate how many times the postlist will match as it
52  // depends a lot on the terms and window, but usually it will occur
53  // significantly less often than the individual terms.
54  termfreq = pl->get_termfreq() / 2;
55 }
56 
58 {
59  delete [] poslists;
60 }
61 
62 struct TermCmp {
63  bool operator()(const PostList * a, const PostList * b) const {
64  return a->get_wdf() < b->get_wdf();
65  }
66 };
67 
68 struct Cmp {
69  bool operator()(const PositionList * a, const PositionList * b) const {
70  return a->get_position() > b->get_position();
71  }
72 };
73 
74 bool
76 {
77  LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
78 
79  // Sort to put least frequent terms first, to try to minimise the number of
80  // position lists we need to read if there are no matches.
81  //
82  // We use wdf as a proxy for the length of the position lists, since we'd
83  // need to read each position list to find its length and we're trying to
84  // avoid having to read them all if we can.
85  sort(terms.begin(), terms.end(), TermCmp());
86 
87  poslists[0] = terms[0]->read_position_list();
88  if (!poslists[0]->next()) {
89  ++rejected;
90  RETURN(false);
91  }
92 
94  PositionList ** end = poslists + 1;
95 
96  while (true) {
97  if (last - poslists[0]->get_position() < window) {
98  if (size_t(end - poslists) != terms.size()) {
99  // We haven't started all the position lists yet, so start the
100  // next one.
101  PositionList * posl =
102  terms[end - poslists]->read_position_list();
103  if (last < window) {
104  if (!posl->next())
105  goto reject;
106  } else {
107  if (!posl->skip_to(last - window + 1))
108  goto reject;
109  }
110  Xapian::termpos pos = posl->get_position();
111  if (pos > last) last = pos;
112  *end++ = posl;
113  Heap::push(poslists, end, Cmp());
114  continue;
115  }
116 
117  // We have all the terms within the specified window, but some may
118  // be at the same position (in particular if the same term is
119  // listed twice). So we work through the terms in ascending
120  // position order, and each time we find one with a duplicate
121  // position, we advance it - if that pushes us out of the window,
122  // we return to the outer loop, otherwise we reinsert it into the
123  // heap at its new position and continue to look for duplicates
124  // we need to adjust.
126  Heap::pop(poslists, end, Cmp());
127  PositionList ** i = end - 1;
128  while (true) {
129  if (poslists[0]->get_position() == pos) {
130  if (!poslists[0]->next())
131  goto reject;
132  Xapian::termpos newpos = poslists[0]->get_position();
133  if (newpos - end[-1]->get_position() >= window) {
134  // No longer fits in the window.
135  last = newpos;
136  break;
137  }
138  Heap::replace(poslists, i, Cmp());
139  continue;
140  }
141  pos = poslists[0]->get_position();
142  Heap::pop(poslists, i, Cmp());
143  if (--i == poslists) {
144  Assert(pos - end[-1]->get_position() < window);
145  ++accepted;
146  RETURN(true);
147  }
148  }
149 
150  Heap::make(poslists, end, Cmp());
151  continue;
152  }
153  if (!poslists[0]->skip_to(last - window + 1))
154  break;
155  last = max(last, poslists[0]->get_position());
156  Heap::replace(poslists, end, Cmp());
157  }
158 reject:
159  ++rejected;
160  RETURN(false);
161 }
162 
165 {
166  // Calculate an estimate for the wdf of a near postlist.
167  //
168  // The natural definition of the wdf for a near postlist is the number of
169  // times the terms occur in a group within the windowsize in the document -
170  // in other words, the number of times a routine like do_test() could find
171  // a match, if it kept going after finding the first one. However,
172  // calculating this would be expensive (in CPU time at least - the position
173  // lists are probably already read from disk), and the benefit is dubious
174  // (given that the estimate here is probably only going to be used for
175  // calculating the weight for synonym postlists, and it's not clear that
176  // the above definition is exactly appropriate in this situation, anyway).
177  //
178  // Instead, we could do an estimate of this value, based on the lengths
179  // of the position lists. Rather than having to open the position lists,
180  // we could use the wdfs, which will be the same value unless the wdfs have
181  // been artificially inflated - in which case we probably want to use the
182  // inflated value anyway (it will have been inflated to represent the fact
183  // that this term is more important than others, so we should make use of
184  // this hint).
185  //
186  // A reasonable way to calculate an estimate would be to consider the
187  // document to be split into W (overlapping) windows, each 1 term apart,
188  // and of the same length as the windowsize used for the phrase. Then,
189  // calculate the probability that each term is found in this window (as
190  // Prob = wdf / doclen), multiply these probabilities to get the
191  // probability that they're all found in the window, and then multiply by
192  // the windowsize again to get an estimate.
193  //
194  // However, this requires the doclen, which won't always be available (ie,
195  // if the weighting scheme doesn't use it, we won't have read it from
196  // disk).
197  //
198  // Rather than force an access to disk to get the doclen, we use an even
199  // rougher estimate: the minimum wdf in the sub-lists. This is actually
200  // the upper bound for the wdf (since the phrase can only occur the same
201  // number of times as the least frequent of its constituent terms).
202  //
203  // If this estimate proves to give bad results, we can revisit this and try
204  // a better approximation.
205  vector<PostList *>::const_iterator i = terms.begin();
206  Xapian::termcount wdf = (*i)->get_wdf();
207  while (++i != terms.end()) {
208  wdf = min(wdf, (*i)->get_wdf());
209  }
210  return wdf;
211 }
212 
213 string
215 {
216  string m = "(Near ";
217  m += str(window);
218  m += ' ';
219  m += pl->get_description();
220  m += ")";
221  return m;
222 }
Class for estimating the total number of matching documents.
Definition: estimateop.h:64
NearPostList(PostList *source_, EstimateOp *estimate_op_, Xapian::termpos window_, const std::vector< PostList * >::const_iterator &terms_begin, const std::vector< PostList * >::const_iterator &terms_end, PostListTree *pltree_)
Definition: nearpostlist.cc:37
Xapian::termpos window
Definition: nearpostlist.h:42
std::vector< PostList * > terms
Definition: nearpostlist.h:44
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
std::string get_description() const
Return a string description of this object.
PositionList ** poslists
Definition: nearpostlist.h:46
bool test_doc()
Test if the current document contains the terms within the window.
Definition: nearpostlist.cc:75
Base class for classes which filter another PostList.
Xapian::doccount accepted
Number of times test_doc() returned true.
Xapian::doccount rejected
Number of times test_doc() returned false.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Abstract base class for postlists.
Definition: postlist.h:40
Xapian::doccount get_termfreq() const
Get an estimate of the number of documents this PostList will return.
Definition: postlist.h:67
virtual Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Definition: postlist.cc:34
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
virtual std::string get_description() const =0
Return a string description of this object.
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
virtual bool next()=0
Advance to the next entry in the positionlist.
virtual bool skip_to(Xapian::termpos termpos)=0
Skip forward to the specified position.
virtual Xapian::termpos get_position() const =0
Return the current position.
Xapian::termpos pos
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
C++ STL heap implementation with extensions.
void pop(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:213
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void push(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:135
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Return docs containing terms within a specified window.
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for iterating term positions in a document.
Convert types to std::string.
bool operator()(const PositionList *a, const PositionList *b) const
Definition: nearpostlist.cc:69
bool operator()(const PostList *a, const PostList *b) const
Definition: nearpostlist.cc:63