xapian-core  1.4.25
nearpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
5  * Copyright (C) 2007 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "nearpostlist.h"
25 
26 #include "debuglog.h"
27 #include "backends/positionlist.h"
28 #include "omassert.h"
29 #include "str.h"
30 
31 #include <algorithm>
32 #include <vector>
33 
34 using namespace std;
35 
37  Xapian::termpos window_,
38  const vector<PostList*>::const_iterator &terms_begin,
39  const vector<PostList*>::const_iterator &terms_end)
40  : SelectPostList(source_), window(window_), terms(terms_begin, terms_end)
41 {
42  size_t n = terms.size();
43  Assert(n > 1);
44  poslists = new PositionList*[n];
45 }
46 
48 {
49  delete [] poslists;
50 }
51 
52 struct TermCmp {
53  bool operator()(const PostList * a, const PostList * b) const {
54  return a->get_wdf() < b->get_wdf();
55  }
56 };
57 
58 struct Cmp {
59  bool operator()(const PositionList * a, const PositionList * b) const {
60  return a->get_position() > b->get_position();
61  }
62 };
63 
64 bool
66 {
67  LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
68 
69  // Sort to put least frequent terms first, to try to minimise the number of
70  // position lists we need to read if there are no matches.
71  //
72  // We use wdf as a proxy for the length of the position lists, since we'd
73  // need to read each position list to find its length and we're trying to
74  // avoid having to read them all if we can.
75  sort(terms.begin(), terms.end(), TermCmp());
76 
77  poslists[0] = terms[0]->read_position_list();
78  if (!poslists[0]->next())
79  RETURN(false);
80 
81  Xapian::termpos last = poslists[0]->get_position();
82  PositionList ** end = poslists + 1;
83 
84  while (true) {
85  if (last - poslists[0]->get_position() < window) {
86  if (size_t(end - poslists) != terms.size()) {
87  // We haven't started all the position lists yet, so start the
88  // next one.
89  PositionList * posl =
90  terms[end - poslists]->read_position_list();
91  if (last < window) {
92  if (!posl->next())
93  RETURN(false);
94  } else {
95  if (!posl->skip_to(last - window + 1))
96  RETURN(false);
97  }
98  Xapian::termpos pos = posl->get_position();
99  if (pos > last) last = pos;
100  *end++ = posl;
101  push_heap<PositionList **, Cmp>(poslists, end, Cmp());
102  continue;
103  }
104 
105  // We have all the terms within the specified window, but some may
106  // be at the same position (in particular if the same term is
107  // listed twice). So we work through the terms in ascending
108  // position order, and each time we find one with a duplicate
109  // position, we advance it - if that pushes us out of the window,
110  // we return to the outer loop, otherwise we reinsert it into the
111  // heap at its new position and continue to look for duplicates
112  // we need to adjust.
113  PositionList ** i = end;
114  pop_heap<PositionList **, Cmp>(poslists, i, Cmp());
115  Xapian::termpos pos = (*--i)->get_position();
116  while (true) {
117  pop_heap<PositionList **, Cmp>(poslists, i, Cmp());
118  if ((*--i)->get_position() == pos) {
119  if (!(*i)->next())
120  RETURN(false);
121  Xapian::termpos newpos = (*i)->get_position();
122  if (newpos - end[-1]->get_position() >= window) {
123  // No longer fits in the window.
124  last = newpos;
125  break;
126  }
127  push_heap<PositionList **, Cmp>(poslists, ++i, Cmp());
128  continue;
129  }
130  pos = (*i)->get_position();
131  if (i == poslists) {
132  Assert(pos - end[-1]->get_position() < window);
133  RETURN(true);
134  }
135  }
136 
137  make_heap<PositionList **, Cmp>(poslists, end, Cmp());
138  continue;
139  }
140  pop_heap<PositionList **, Cmp>(poslists, end, Cmp());
141  if (!end[-1]->skip_to(last - window + 1))
142  break;
143  last = max(last, end[-1]->get_position());
144  push_heap<PositionList **, Cmp>(poslists, end, Cmp());
145  }
146 
147  RETURN(false);
148 }
149 
152 {
153  // Calculate an estimate for the wdf of a near postlist.
154  //
155  // The natural definition of the wdf for a near postlist is the number of
156  // times the terms occur in a group within the windowsize in the document -
157  // in other words, the number of times a routine like do_test() could find
158  // a match, if it kept going after finding the first one. However,
159  // calculating this would be expensive (in CPU time at least - the position
160  // lists are probably already read from disk), and the benefit is dubious
161  // (given that the estimate here is probably only going to be used for
162  // calculating the weight for synonym postlists, and it's not clear that
163  // the above definition is exactly appropriate in this situation, anyway).
164  //
165  // Instead, we could do an estimate of this value, based on the lengths
166  // of the position lists. Rather than having to open the position lists,
167  // we could use the wdfs, which will be the same value unless the wdfs have
168  // been artificially inflated - in which case we probably want to use the
169  // inflated value anyway (it will have been inflated to represent the fact
170  // that this term is more important than others, so we should make use of
171  // this hint).
172  //
173  // A reasonable way to calculate an estimate would be to consider the
174  // document to be split into W (overlapping) windows, each 1 term apart,
175  // and of the same length as the windowsize used for the phrase. Then,
176  // calculate the probability that each term is found in this window (as
177  // Prob = wdf / doclen), multiply these probabilities to get the
178  // probability that they're all found in the window, and then multiply by
179  // the windowsize again to get an estimate.
180  //
181  // However, this requires the doclen, which won't always be available (ie,
182  // if the weighting scheme doesn't use it, we won't have read it from
183  // disk).
184  //
185  // Rather than force an access to disk to get the doclen, we use an even
186  // rougher estimate: the minimum wdf in the sub-lists. This is actually
187  // the upper bound for the wdf (since the phrase can only occur the same
188  // number of times as the least frequent of its constituent terms).
189  //
190  // If this estimate proves to give bad results, we can revisit this and try
191  // a better approximation.
192  vector<PostList *>::const_iterator i = terms.begin();
193  Xapian::termcount wdf = (*i)->get_wdf();
194  while (++i != terms.end()) {
195  wdf = min(wdf, (*i)->get_wdf());
196  }
197  return wdf;
198 }
199 
202 {
203  // It's hard to estimate how many times the postlist will match as it
204  // depends a lot on the terms and window, but usually it will occur
205  // significantly less often than the individual terms.
206  return source->get_termfreq_est() / 2;
207 }
208 
209 TermFreqs
211  const Xapian::Weight::Internal & stats) const
212 {
213  LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
214  // No idea how to estimate this - do the same as get_termfreq_est() for
215  // now.
217  result.termfreq /= 2;
218  result.reltermfreq /= 2;
219  result.collfreq /= 2;
220  RETURN(result);
221 }
222 
223 string
225 {
226  string m = "(Near ";
227  m += str(window);
228  m += ' ';
229  m += source->get_description();
230  m += ")";
231  return m;
232 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
PostList * source
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Abstract base class for postlists.
Definition: postlist.h:37
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
virtual bool next()=0
Advance to the next entry in the positionlist.
Xapian::termpos window
Definition: nearpostlist.h:39
Abstract base class for iterating term positions in a document.
STL namespace.
Convert types to std::string.
std::vector< PostList * > terms
Definition: nearpostlist.h:41
Xapian::doccount termfreq
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
virtual bool skip_to(Xapian::termpos termpos)=0
Skip forward to the specified position.
bool test_doc()
Test if the current document contains the terms within the window.
Definition: nearpostlist.cc:65
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
bool operator()(const PostList *a, const PostList *b) const
Definition: nearpostlist.cc:53
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
string str(int value)
Convert int to std::string.
Definition: str.cc:90
NearPostList(PostList *source_, Xapian::termpos window_, const std::vector< PostList *>::const_iterator &terms_begin, const std::vector< PostList *>::const_iterator &terms_end)
Definition: nearpostlist.cc:36
Return docs containing terms within a specified window.
virtual Xapian::termpos get_position() const =0
Return the current position.
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
A postlist parent class for classes which only return selected docs from a source postlist (e...
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
The frequencies for a term.
virtual std::string get_description() const =0
Return a string description of this object.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
bool operator()(const PositionList *a, const PositionList *b) const
Definition: nearpostlist.cc:59
std::string get_description() const
Return a string description of this object.
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Various assertion macros.
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:31
virtual Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Definition: postlist.cc:44
PositionList ** poslists
Definition: nearpostlist.h:43
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487