xapian-core  1.4.26
andnotpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2003,2004,2007,2009,2011,2012 Olly Betts
7  * Copyright 2009 Lemur Consulting Ltd
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22  * USA
23  */
24 
25 #include <config.h>
26 #include "andnotpostlist.h"
27 
28 #include "debuglog.h"
29 #include "omassert.h"
30 
31 PostList *
33 {
34  LOGCALL(MATCH, PostList *, "AndNotPostList::advance_to_next_match", w_min | ret);
35  handle_prune(l, ret);
36  if (l->at_end()) {
37  lhead = 0;
38  RETURN(NULL);
39  }
40  lhead = l->get_docid();
41 
42  while (rhead <= lhead) {
43  if (lhead == rhead) {
44  next_handling_prune(l, w_min, matcher);
45  if (l->at_end()) {
46  lhead = 0;
47  RETURN(NULL);
48  }
49  lhead = l->get_docid();
50  }
52  if (r->at_end()) {
53  ret = l;
54  l = NULL;
55  RETURN(ret);
56  }
57  rhead = r->get_docid();
58  }
59  RETURN(NULL);
60 }
61 
63  PostList *right_,
64  MultiMatch *matcher_,
65  Xapian::doccount dbsize_)
66  : BranchPostList(left_, right_, matcher_),
67  lhead(0), rhead(0), dbsize(dbsize_)
68 {
69  LOGCALL_CTOR(MATCH, "AndNotPostList", left_ | right_ | matcher_ | dbsize_);
70 }
71 
72 PostList *
73 AndNotPostList::next(double w_min)
74 {
75  LOGCALL(MATCH, PostList *, "AndNotPostList::next", w_min);
76  RETURN(advance_to_next_match(w_min, l->next(w_min)));
77 }
78 
79 PostList *
81  double w_min,
82  Xapian::docid lh,
83  Xapian::docid rh)
84 {
85  LOGCALL(MATCH, PostList *, "AndNotPostList::sync_and_skip_to", id | w_min | lh | rh);
86  lhead = lh;
87  rhead = rh;
88  RETURN(skip_to(id, w_min));
89 }
90 
91 PostList *
93 {
94  LOGCALL(MATCH, PostList *, "AndNotPostList::skip_to", did | w_min);
95  if (did <= lhead) RETURN(NULL);
96  RETURN(advance_to_next_match(w_min, l->skip_to(did, w_min)));
97 }
98 
101 {
102  LOGCALL(MATCH, Xapian::doccount, "AndNotPostList::get_termfreq_max", NO_ARGS);
103  // Max is when as many docs as possible on left, and none excluded.
105 }
106 
109 {
110  LOGCALL(MATCH, Xapian::doccount, "AndNotPostList::get_termfreq_min", NO_ARGS);
111  // Min is when as few docs as possible on left, and maximum are excluded.
114  if (l_min > r_max) RETURN(l_min - r_max);
115  RETURN(0u);
116 }
117 
120 {
121  LOGCALL(MATCH, Xapian::doccount, "AndNotPostList::get_termfreq_est", NO_ARGS);
122  // We shortcut an empty shard and avoid creating a postlist tree for it.
123  Assert(dbsize);
124  // Estimate assuming independence:
125  // P(l and r) = P(l) . P(r)
126  // P(l not r) = P(l) - P(l and r) = P(l) . ( 1 - P(r))
127  double est = l->get_termfreq_est() *
128  (1.0 - double(r->get_termfreq_est()) / dbsize);
129  RETURN(static_cast<Xapian::doccount>(est + 0.5));
130 }
131 
132 TermFreqs
134  const Xapian::Weight::Internal & stats) const
135 {
136  LOGCALL(MATCH, TermFreqs, "AndNotPostList::get_termfreq_est_using_stats", stats);
137  // Estimate assuming independence:
138  // P(l and r) = P(l) . P(r)
139  // P(l not r) = P(l) - P(l and r) = P(l) . ( 1 - P(r))
140  TermFreqs lfreqs(l->get_termfreq_est_using_stats(stats));
141  TermFreqs rfreqs(r->get_termfreq_est_using_stats(stats));
142 
143  double freqest, relfreqest, collfreqest;
144 
145  // Our caller should have ensured this.
146  Assert(stats.collection_size);
147 
148  freqest = lfreqs.termfreq *
149  (1.0 - (double(rfreqs.termfreq) / stats.collection_size));
150 
151  collfreqest = lfreqs.collfreq;
152  if (stats.total_length != 0) {
153  collfreqest *= (1.0 - (double(rfreqs.collfreq) / stats.total_length));
154  }
155 
156  if (stats.rset_size == 0) {
157  relfreqest = 0;
158  } else {
159  relfreqest = lfreqs.reltermfreq *
160  (1.0 - (double(rfreqs.reltermfreq) / stats.rset_size));
161  }
162 
163  RETURN(TermFreqs(static_cast<Xapian::doccount>(freqest + 0.5),
164  static_cast<Xapian::doccount>(relfreqest + 0.5),
165  static_cast<Xapian::termcount>(collfreqest + 0.5)));
166 }
167 
170 {
171  LOGCALL(MATCH, Xapian::docid, "AndNotPostList::get_docid", NO_ARGS);
172  RETURN(lhead);
173 }
174 
175 // only called if we are doing a probabilistic AND NOT
176 double
178 {
179  LOGCALL(MATCH, double, "AndNotPostList::get_weight", NO_ARGS);
180  RETURN(l->get_weight());
181 }
182 
183 // only called if we are doing a probabilistic AND NOT
184 double
186 {
187  LOGCALL(MATCH, double, "AndNotPostList::get_maxweight", NO_ARGS);
188  RETURN(l->get_maxweight());
189 }
190 
191 double
193 {
194  LOGCALL(MATCH, double, "AndNotPostList::recalc_maxweight", NO_ARGS);
195  // l cannot be NULL here because it is only set to NULL when the tree
196  // decays, so this can never be called at that point.
198 }
199 
200 bool
202 {
203  LOGCALL(MATCH, bool, "AndNotPostList::at_end", NO_ARGS);
204  RETURN(lhead == 0);
205 }
206 
207 std::string
209 {
210  return "(" + l->get_description() + " AndNot " + r->get_description() + ")";
211 }
212 
215 {
216  LOGCALL(MATCH, Xapian::termcount, "AndNotPostList::get_doclength", NO_ARGS);
217  RETURN(l->get_doclength());
218 }
219 
222 {
223  LOGCALL(MATCH, Xapian::termcount, "AndNotPostList::get_unique_terms", NO_ARGS);
225 }
226 
229 {
230  LOGCALL(MATCH, Xapian::termcount, "AndNotPostList::get_wdf", NO_ARGS);
231  RETURN(l->get_wdf());
232 }
233 
236 {
237  LOGCALL(MATCH, Xapian::termcount, "AndNotPostList::count_matching_subqs", NO_ARGS);
239 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
MultiMatch * matcher
The object which is using this postlist to perform a match.
Abstract base class for postlists.
Definition: postlist.h:37
double get_maxweight() const
Return an upper bound on what get_weight() can return.
Xapian::termcount get_wdf() const
get_wdf() for ANDNOT postlists returns the wdf of the left hand side.
virtual Xapian::docid get_docid() const =0
Return the current docid.
PostList * l
Left sub-postlist.
AndNotPostList(PostList *left, PostList *right, MultiMatch *matcher_, Xapian::doccount dbsize_)
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
virtual Internal * skip_to(Xapian::docid did, double w_min)=0
Skip forward to the specified docid.
virtual double recalc_maxweight()=0
Recalculate the upper bound on what get_weight() can return.
Base class for postlists which are generated by merging two sub-postlists.
static double est(double l, double r, double n)
Definition: orpostlist.cc:306
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
virtual Xapian::termcount get_unique_terms() const =0
Return the number of unique terms in the current document.
Xapian::doccount collection_size
Number of documents in the collection.
bool skip_to_handling_prune(PostList *&pl, Xapian::docid did, double w_min, MultiMatch *matcher)
Xapian::doccount rset_size
Number of relevant documents in the collection.
void handle_prune(PostList *&kid, PostList *ret)
Utility method, to call recalc_maxweight() and do the pruning if a next() or skip_to() returns non-NU...
virtual Xapian::termcount get_unique_terms() const
Return the number of unique terms in the current document.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
double get_weight() const
Return the weight contribution for the current position.
Return items which are in A, unless they&#39;re in B.
Xapian::docid rhead
virtual Xapian::termcount get_doclength() const
Return the document length of the document the current term comes from.
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
bool at_end() const
Return true if the current position is past the last entry in this list.
virtual Xapian::doccount get_termfreq_max() const =0
Get an upper bound on the number of documents indexed by this term.
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
virtual Xapian::doccount get_termfreq_min() const =0
Get a lower bound on the number of documents indexed by this term.
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
Xapian::docid lhead
PostList * advance_to_next_match(double w_min, PostList *ret)
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
#define LOGCALL_CTOR(CATEGORY, CLASS, PARAMS)
Definition: debuglog.h:489
virtual double get_maxweight() const =0
Return an upper bound on what get_weight() can return.
The frequencies for a term.
bool next_handling_prune(PostList *&pl, double w_min, MultiMatch *matcher)
virtual Internal * next(double w_min)=0
Advance the current position to the next document in the postlist.
virtual Xapian::termcount get_doclength() const =0
Return the length of current document.
virtual std::string get_description() const =0
Return a string description of this object.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
PostList * r
Right sub-postlist.
PostList * sync_and_skip_to(Xapian::docid id, double w_min, Xapian::docid lh, Xapian::docid rh)
virtual Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
std::string get_description() const
Return a string description of this object.
virtual bool at_end() const =0
Return true if the current position is past the last entry in this list.
Xapian::doccount dbsize
Number of documents in the database this postlist is across.
Xapian::docid get_docid() const
Return the current docid.
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Xapian::totallength total_length
Total length of all documents in the collection.
virtual Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Definition: postlist.cc:44
virtual double get_weight() const =0
Return the weight contribution for the current position.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487