xapian-core  1.4.26
orpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2001,2002 Ananova Ltd
6  * Copyright 2003,2004,2007,2008,2009,2010,2011,2012,2016,2017 Olly Betts
7  * Copyright 2009 Lemur Consulting Ltd
8  * Copyright 2010 Richard Boulton
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License as
12  * published by the Free Software Foundation; either version 2 of the
13  * License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23  * USA
24  */
25 
26 #include <config.h>
27 #include "orpostlist.h"
28 
29 #include "debuglog.h"
30 #include "multiandpostlist.h"
31 #include "andmaybepostlist.h"
32 #include "omassert.h"
33 
34 #include "orpositionlist.h"
35 
36 #include <algorithm>
37 
39  PostList *right_,
40  MultiMatch *matcher_,
41  Xapian::doccount dbsize_)
42  : BranchPostList(left_, right_, matcher_),
43  lhead(0), rhead(0), lvalid(false), rvalid(false),
44  lmax(0), rmax(0), minmax(0), dbsize(dbsize_)
45 {
46  LOGCALL_CTOR(MATCH, "OrPostList", left_ | right_ | matcher_ | dbsize_);
47  AssertRel(left_->get_termfreq_est(),>=,right_->get_termfreq_est());
48 }
49 
50 PostList *
51 OrPostList::next(double w_min)
52 {
53  LOGCALL(MATCH, PostList *, "OrPostList::next", w_min);
54  if (w_min > minmax) {
55  // we can replace the OR with another operator
56  PostList *ret;
57  if (w_min > lmax) {
58  if (w_min > rmax) {
59  LOGLINE(MATCH, "OR -> AND");
60  ret = new MultiAndPostList(l, r, lmax, rmax, matcher, dbsize);
61  Xapian::docid newdocid = std::max(lhead, rhead);
62  if (newdocid == 0 || (lvalid && rvalid && lhead == rhead)) {
63  ++newdocid;
64  }
65  skip_to_handling_prune(ret, newdocid, w_min, matcher);
66  } else {
67  LOGLINE(MATCH, "OR -> AND MAYBE (1)");
68  AndMaybePostList * ret2 =
70  ret = ret2;
71  // Advance the AndMaybePostList unless the old RHS postlist was
72  // already ahead of the current docid.
73  if (rhead <= lhead) {
74  next_handling_prune(ret, w_min, matcher);
75  } else {
76  handle_prune(ret, ret2->sync_rhs(w_min));
77  }
78  }
79  } else {
80  // w_min > rmax since w_min > minmax but not (w_min > lmax)
81  Assert(w_min > rmax);
82  LOGLINE(MATCH, "OR -> AND MAYBE (2)");
83  AndMaybePostList * ret2 =
85  ret = ret2;
86  if (lhead <= rhead) {
87  next_handling_prune(ret, w_min, matcher);
88  } else {
89  handle_prune(ret, ret2->sync_rhs(w_min));
90  }
91  }
92 
93  l = r = NULL;
94  RETURN(ret);
95  }
96 
97  bool ldry = false;
98  bool rnext = !rvalid;
99 
100  if (!lvalid || lhead <= rhead) {
101  if (lhead == rhead) rnext = true;
102  next_handling_prune(l, w_min - rmax, matcher);
103  lvalid = true;
104  if (l->at_end()) ldry = true;
105  } else {
106  rnext = true;
107  }
108 
109  if (rnext) {
110  next_handling_prune(r, w_min - lmax, matcher);
111  rvalid = true;
112  if (r->at_end()) {
113  PostList *ret = l;
114  l = NULL;
115  RETURN(ret);
116  }
117  rhead = r->get_docid();
118  }
119 
120  if (!ldry) {
121  lhead = l->get_docid();
122  RETURN(NULL);
123  }
124 
125  PostList *ret = r;
126  r = NULL;
127  RETURN(ret);
128 }
129 
130 PostList *
132 {
133  LOGCALL(MATCH, PostList *, "OrPostList::skip_to", did | w_min);
134  if (w_min > minmax) {
135  // we can replace the OR with another operator
136  PostList *ret;
137  if (w_min > lmax) {
138  if (w_min > rmax) {
139  LOGLINE(MATCH, "OR -> AND (in skip_to)");
140  ret = new MultiAndPostList(l, r, lmax, rmax, matcher, dbsize);
141  did = std::max(did, std::max(lhead, rhead));
142  } else {
143  LOGLINE(MATCH, "OR -> AND MAYBE (in skip_to) (1)");
144  AndMaybePostList * ret2 =
146  ret = ret2;
147  handle_prune(ret, ret2->sync_rhs(w_min));
148  did = std::max(did, rhead);
149  }
150  } else {
151  // w_min > rmax since w_min > minmax but not (w_min > lmax)
152  Assert(w_min > rmax);
153  LOGLINE(MATCH, "OR -> AND MAYBE (in skip_to) (2)");
154  AndMaybePostList * ret2 =
156  ret = ret2;
157  handle_prune(ret, ret2->sync_rhs(w_min));
158  did = std::max(did, lhead);
159  }
160 
161  l = r = NULL;
162  skip_to_handling_prune(ret, did, w_min, matcher);
163  RETURN(ret);
164  }
165 
166  bool ldry = false;
167  if (lhead < did) {
168  skip_to_handling_prune(l, did, w_min - rmax, matcher);
169  lvalid = true;
170  ldry = l->at_end();
171  }
172 
173  if (rhead < did) {
174  skip_to_handling_prune(r, did, w_min - lmax, matcher);
175  rvalid = true;
176 
177  if (r->at_end()) {
178  PostList *ret = l;
179  l = NULL;
180  RETURN(ret);
181  }
182  rhead = r->get_docid();
183  }
184 
185  if (!ldry) {
186  lhead = l->get_docid();
187  RETURN(NULL);
188  }
189 
190  PostList *ret = r;
191  r = NULL;
192  RETURN(ret);
193 }
194 
195 PostList *
196 OrPostList::check(Xapian::docid did, double w_min, bool &valid)
197 {
198  LOGCALL(MATCH, PostList *, "OrPostList::check", did | w_min);
199  if (w_min > minmax) {
200  // we can replace the OR with another operator
201  PostList *ret;
202  if (w_min > lmax) {
203  if (w_min > rmax) {
204  LOGLINE(MATCH, "OR -> AND (in check)");
205  ret = new MultiAndPostList(l, r, lmax, rmax, matcher, dbsize);
206  did = std::max(did, std::max(lhead, rhead));
207  } else {
208  LOGLINE(MATCH, "OR -> AND MAYBE (in check) (1)");
210  ret = ret2;
211  handle_prune(ret, ret2->sync_rhs(w_min));
212  did = std::max(did, rhead);
213  }
214  } else {
215  // w_min > rmax since w_min > minmax but not (w_min > lmax)
216  Assert(w_min > rmax);
217  LOGLINE(MATCH, "OR -> AND MAYBE (in check) (2)");
219  ret = ret2;
220  handle_prune(ret, ret2->sync_rhs(w_min));
221  did = std::max(did, lhead);
222  }
223 
224  l = r = NULL;
225  check_handling_prune(ret, did, w_min, matcher, valid);
226  RETURN(ret);
227  }
228 
229  bool ldry = false;
230  if (!lvalid || lhead < did) {
231  lvalid = false;
232  check_handling_prune(l, did, w_min - rmax, matcher, lvalid);
233  ldry = l->at_end();
234  }
235 
236  if (!rvalid || rhead <= did) {
237  rvalid = false;
238  check_handling_prune(r, did, w_min - lmax, matcher, rvalid);
239 
240  if (r->at_end()) {
241  PostList *ret = l;
242  l = NULL;
243  valid = lvalid;
244  RETURN(ret);
245  }
246  if (rvalid) {
247  rhead = r->get_docid();
248  } else {
249  rhead = did + 1;
250  }
251  }
252 
253  if (!ldry) {
254  if (lvalid) {
255  lhead = l->get_docid();
256  } else {
257  lhead = did + 1;
258  }
259 
260  if (lhead < rhead) {
261  valid = lvalid;
262  } else if (rhead < lhead) {
263  valid = rvalid;
264  } else {
265  valid = lvalid || rvalid;
266  }
267  RETURN(NULL);
268  }
269 
270  PostList *ret = r;
271  r = NULL;
272  valid = rvalid;
273  RETURN(ret);
274 }
275 
278 {
279  LOGCALL(MATCH, Xapian::doccount, "OrPostList::get_termfreq_max", NO_ARGS);
280  RETURN(std::min(l->get_termfreq_max() + r->get_termfreq_max(), dbsize));
281 }
282 
285 {
286  LOGCALL(MATCH, Xapian::doccount, "OrPostList::get_termfreq_min", NO_ARGS);
287  RETURN(std::max(l->get_termfreq_min(), r->get_termfreq_min()));
288 }
289 
292 {
293  LOGCALL(MATCH, Xapian::doccount, "OrPostList::get_termfreq_est", NO_ARGS);
294  if (rare(dbsize == 0))
295  RETURN(0);
296  // Estimate assuming independence:
297  // P(l or r) = P(l) + P(r) - P(l) . P(r)
298  double lest = static_cast<double>(l->get_termfreq_est());
299  double rest = static_cast<double>(r->get_termfreq_est());
300  double est = lest + rest - (lest * rest / dbsize);
301  RETURN(static_cast<Xapian::doccount>(est + 0.5));
302 }
303 
304 // Estimate frequency of OR assuming independence.
305 static double
306 est(double l, double r, double n)
307 {
308  return l + r - (l * r / n);
309 }
310 
311 TermFreqs
313  const Xapian::Weight::Internal & stats) const
314 {
315  LOGCALL(MATCH, TermFreqs, "OrPostList::get_termfreq_est_using_stats", stats);
316  // Estimate assuming independence:
317  // P(l or r) = P(l) + P(r) - P(l) . P(r)
318  TermFreqs lfreqs(l->get_termfreq_est_using_stats(stats));
319  TermFreqs rfreqs(r->get_termfreq_est_using_stats(stats));
320 
321  double freqest, relfreqest, collfreqest;
322 
323  // Our caller should have ensured this.
324  Assert(stats.collection_size);
325 
326  freqest = est(lfreqs.termfreq, rfreqs.termfreq, stats.collection_size);
327  if (stats.total_length == 0) {
328  collfreqest = 0;
329  } else {
330  collfreqest = est(lfreqs.collfreq, rfreqs.collfreq, stats.total_length);
331  }
332  if (stats.rset_size == 0) {
333  relfreqest = 0;
334  } else {
335  relfreqest = est(lfreqs.reltermfreq, rfreqs.reltermfreq,
336  stats.rset_size);
337  }
338 
339  RETURN(TermFreqs(static_cast<Xapian::doccount>(freqest + 0.5),
340  static_cast<Xapian::doccount>(relfreqest + 0.5),
341  static_cast<Xapian::termcount>(collfreqest + 0.5)));
342 }
343 
346 {
347  LOGCALL(MATCH, Xapian::docid, "OrPostList::get_docid", NO_ARGS);
348  Assert(lhead != 0 && rhead != 0); // check we've started
349  RETURN(std::min(lhead, rhead));
350 }
351 
352 // only called if we are doing a probabilistic OR
353 double
355 {
356  LOGCALL(MATCH, double, "OrPostList::get_weight", NO_ARGS);
357  Assert(lhead != 0 && rhead != 0); // check we've started
358  if (lhead < rhead) RETURN(l->get_weight());
359  if (lhead > rhead) RETURN(r->get_weight());
360  RETURN(l->get_weight() + r->get_weight());
361 }
362 
363 // only called if we are doing a probabilistic operation
364 double
366 {
367  LOGCALL(MATCH, double, "OrPostList::get_maxweight", NO_ARGS);
368  RETURN(lmax + rmax);
369 }
370 
371 double
373 {
374  LOGCALL(MATCH, double, "OrPostList::recalc_maxweight", NO_ARGS);
375  // l and r cannot be NULL here, because the only place where they get set
376  // to NULL is when the tree is decaying, and the OrPostList is then
377  // immediately replaced.
378  lmax = l->recalc_maxweight();
379  rmax = r->recalc_maxweight();
380  minmax = std::min(lmax, rmax);
382 }
383 
384 bool
386 {
387  LOGCALL(MATCH, bool, "OrPostList::at_end", NO_ARGS);
388  // Can never really happen - OrPostList next/skip_to autoprune
389  AssertParanoid(!(l->at_end()) && !(r->at_end()));
390  RETURN(false);
391 }
392 
393 std::string
395 {
396  return "(" + l->get_description() + " Or " + r->get_description() + ")";
397 }
398 
401 {
402  LOGCALL(MATCH, Xapian::termcount, "OrPostList::get_doclength", NO_ARGS);
404 
405  Assert(lhead != 0 && rhead != 0); // check we've started
406  if (lhead > rhead) {
407  doclength = r->get_doclength();
408  LOGLINE(MATCH, "OrPostList::get_doclength() [right docid=" << rhead <<
409  "] = " << doclength);
410  } else {
411  doclength = l->get_doclength();
412  LOGLINE(MATCH, "OrPostList::get_doclength() [left docid=" << lhead <<
413  "] = " << doclength);
414  }
415 
416  RETURN(doclength);
417 }
418 
421 {
422  LOGCALL(MATCH, Xapian::termcount, "OrPostList::get_unique_terms", NO_ARGS);
423  Xapian::termcount unique_terms;
424 
425  Assert(lhead != 0 && rhead != 0); // check we've started
426  if (lhead > rhead) {
427  unique_terms = r->get_unique_terms();
428  LOGLINE(MATCH, "OrPostList::get_unique_terms() [right docid=" << rhead <<
429  "] = " << unique_terms);
430  } else {
431  unique_terms = l->get_unique_terms();
432  LOGLINE(MATCH, "OrPostList::get_unique_terms() [left docid=" << lhead <<
433  "] = " << unique_terms);
434  }
435 
436  RETURN(unique_terms);
437 }
438 
441 {
442  LOGCALL(MATCH, Xapian::termcount, "OrPostList::get_wdf", NO_ARGS);
443  if (lhead < rhead) RETURN(l->get_wdf());
444  if (lhead > rhead) RETURN(r->get_wdf());
445  RETURN(l->get_wdf() + r->get_wdf());
446 }
447 
450 {
451  LOGCALL(MATCH, Xapian::termcount, "OrPostList::count_matching_subqs", NO_ARGS);
455 }
456 
457 void
459 {
460  if (lhead <= rhead) l->gather_position_lists(orposlist);
461  if (lhead >= rhead) r->gather_position_lists(orposlist);
462 }
Xapian::docid get_docid() const
Return the current docid.
Definition: orpostlist.cc:345
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
MultiMatch * matcher
The object which is using this postlist to perform a match.
Abstract base class for postlists.
Definition: postlist.h:37
Merged postlist: items from one list, weights from both.
A postlist with weights modified by another postlist.
virtual Xapian::docid get_docid() const =0
Return the current docid.
PostList * l
Left sub-postlist.
virtual void gather_position_lists(OrPositionList *orposlist)
Gather PositionList* objects for a subtree.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
double get_weight() const
Return the weight contribution for the current position.
Definition: orpostlist.cc:354
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
Definition: orpostlist.cc:291
Xapian::docid lhead
Definition: orpostlist.h:40
Xapian::termcount get_wdf() const
get_wdf() for OR postlists returns the sum of the wdfs of the sub postlists which are at the current ...
Definition: orpostlist.cc:440
virtual double recalc_maxweight()=0
Recalculate the upper bound on what get_weight() can return.
double get_maxweight() const
Return an upper bound on what get_weight() can return.
Definition: orpostlist.cc:365
Base class for postlists which are generated by merging two sub-postlists.
static double est(double l, double r, double n)
Definition: orpostlist.cc:306
PostList * sync_rhs(double w_min)
Synchronise the RHS to the LHS after construction.
#define false
Definition: header.h:9
virtual Xapian::termcount get_unique_terms() const =0
Return the number of unique terms in the current document.
double rmax
Definition: orpostlist.h:42
#define rare(COND)
Definition: config.h:575
Xapian::doccount collection_size
Number of documents in the collection.
bool skip_to_handling_prune(PostList *&pl, Xapian::docid did, double w_min, MultiMatch *matcher)
OR of two posting lists.
Xapian::doccount rset_size
Number of relevant documents in the collection.
void handle_prune(PostList *&kid, PostList *ret)
Utility method, to call recalc_maxweight() and do the pruning if a next() or skip_to() returns non-NU...
double minmax
Definition: orpostlist.h:42
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Definition: orpostlist.cc:449
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
PostList * check(Xapian::docid did, double w_min, bool &valid)
Check if the specified docid occurs in this postlist.
Definition: orpostlist.cc:196
std::string get_description() const
Return a string description of this object.
Definition: orpostlist.cc:394
double lmax
Definition: orpostlist.h:42
virtual Xapian::termcount get_doclength() const
Return the document length of the document the current term comes from.
Definition: orpostlist.cc:400
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
virtual Xapian::doccount get_termfreq_max() const =0
Get an upper bound on the number of documents indexed by this term.
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
double doclength
A normalised document length.
Definition: types.h:59
virtual Xapian::doccount get_termfreq_min() const =0
Get a lower bound on the number of documents indexed by this term.
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
N-way AND postlist.
bool check_handling_prune(PostList *&pl, Xapian::docid did, double w_min, MultiMatch *matcher, bool &valid)
bool rvalid
Definition: orpostlist.h:41
Xapian::docid rhead
Definition: orpostlist.h:40
virtual Xapian::termcount get_unique_terms() const
Return the number of unique terms in the document.
Definition: orpostlist.cc:420
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Definition: orpostlist.cc:131
bool at_end() const
Return true if the current position is past the last entry in this list.
Definition: orpostlist.cc:385
#define AssertParanoid(COND)
Definition: omassert.h:129
Xapian::doccount dbsize
Definition: orpostlist.h:43
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:489
The frequencies for a term.
bool next_handling_prune(PostList *&pl, double w_min, MultiMatch *matcher)
virtual Xapian::termcount get_doclength() const =0
Return the length of current document.
virtual std::string get_description() const =0
Return a string description of this object.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
PostList * r
Right sub-postlist.
OrPostList(PostList *left_, PostList *right_, MultiMatch *matcher_, Xapian::doccount dbsize_)
Definition: orpostlist.cc:38
virtual Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
N-way AND postlist.
void gather_position_lists(OrPositionList *orposlist)
Gather PositionList* objects for a subtree.
Definition: orpostlist.cc:458
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
Definition: orpostlist.cc:284
virtual bool at_end() const =0
Return true if the current position is past the last entry in this list.
Various assertion macros.
#define LOGLINE(a, b)
Definition: debuglog.h:494
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
Definition: orpostlist.cc:277
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Xapian::totallength total_length
Total length of all documents in the collection.
bool lvalid
Definition: orpostlist.h:41
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Definition: orpostlist.cc:372
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: orpostlist.cc:312
virtual Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Definition: postlist.cc:44
virtual double get_weight() const =0
Return the weight contribution for the current position.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487
Merge two PositionList objects using an OR operation.