xapian-core  1.4.19
maxpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2009,2010,2011,2012,2013,2014 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "maxpostlist.h"
25 
26 #include "debuglog.h"
27 #include "omassert.h"
28 
29 using namespace std;
30 
32 {
33  if (plist) {
34  for (size_t i = 0; i < n_kids; ++i) {
35  delete plist[i];
36  }
37  delete [] plist;
38  }
39 }
40 
43 {
44  Xapian::doccount res = plist[0]->get_termfreq_min();
45  for (size_t i = 1; i < n_kids; ++i) {
46  res = max(res, plist[i]->get_termfreq_min());
47  }
48  return res;
49 }
50 
53 {
54  Xapian::doccount res = plist[0]->get_termfreq_max();
55  for (size_t i = 1; i < n_kids; ++i) {
56  Xapian::doccount c = plist[i]->get_termfreq_max();
57  if (db_size - res <= c)
58  return db_size;
59  res += c;
60  }
61  return res;
62 }
63 
66 {
67  if (rare(db_size == 0))
68  return 0;
69 
70  // We calculate the estimate assuming independence. The simplest
71  // way to calculate this seems to be a series of (n_kids - 1) pairwise
72  // calculations, which gives the same answer regardless of the order.
73  double scale = 1.0 / db_size;
74  double P_est = plist[0]->get_termfreq_est() * scale;
75  for (size_t i = 1; i < n_kids; ++i) {
76  double P_i = plist[i]->get_termfreq_est() * scale;
77  P_est += P_i - P_est * P_i;
78  }
79  return static_cast<Xapian::doccount>(P_est * db_size + 0.5);
80 }
81 
84  const Xapian::Weight::Internal & stats) const
85 {
86  // We calculate the estimate assuming independence. The simplest
87  // way to calculate this seems to be a series of (n_kids - 1) pairwise
88  // calculations, which gives the same answer regardless of the order.
89  TermFreqs freqs(plist[0]->get_termfreq_est_using_stats(stats));
90 
91  // Our caller should have ensured this.
92  Assert(stats.collection_size);
93  double scale = 1.0 / stats.collection_size;
94  double P_est = freqs.termfreq * scale;
95  double rtf_scale = 0.0;
96  if (stats.rset_size != 0) {
97  rtf_scale = 1.0 / stats.rset_size;
98  }
99  double Pr_est = freqs.reltermfreq * rtf_scale;
100  // If total_length is 0, cf must always be 0 so cf_scale is irrelevant.
101  double cf_scale = 0.0;
102  if (usual(stats.total_length != 0)) {
103  cf_scale = 1.0 / stats.total_length;
104  }
105  double Pc_est = freqs.collfreq * cf_scale;
106 
107  for (size_t i = 1; i < n_kids; ++i) {
108  freqs = plist[i]->get_termfreq_est_using_stats(stats);
109  double P_i = freqs.termfreq * scale;
110  P_est += P_i - P_est * P_i;
111  double Pc_i = freqs.collfreq * cf_scale;
112  Pc_est += Pc_i - Pc_est * Pc_i;
113  // If the rset is empty, Pr_est should be 0 already, so leave
114  // it alone.
115  if (stats.rset_size != 0) {
116  double Pr_i = freqs.reltermfreq * rtf_scale;
117  Pr_est += Pr_i - Pr_est * Pr_i;
118  }
119  }
120  return TermFreqs(Xapian::doccount(P_est * stats.collection_size + 0.5),
121  Xapian::doccount(Pr_est * stats.rset_size + 0.5),
122  Xapian::termcount(Pc_est * stats.total_length + 0.5));
123 }
124 
125 double
127 {
128  return max_cached;
129 }
130 
133 {
134  return did;
135 }
136 
139 {
140  Assert(did);
142  bool doclength_set = false;
143  for (size_t i = 0; i < n_kids; ++i) {
144  if (plist[i]->get_docid() == did) {
145  if (doclength_set) {
146  AssertEq(doclength, plist[i]->get_doclength());
147  } else {
148  doclength = plist[i]->get_doclength();
149  doclength_set = true;
150  }
151  }
152  }
153  Assert(doclength_set);
154  return doclength;
155 }
156 
159 {
160  Assert(did);
161  Xapian::termcount unique_terms = 0;
162  bool unique_terms_set = false;
163  for (size_t i = 0; i < n_kids; ++i) {
164  if (plist[i]->get_docid() == did) {
165  if (unique_terms_set) {
166  AssertEq(unique_terms, plist[i]->get_unique_terms());
167  } else {
168  unique_terms = plist[i]->get_unique_terms();
169  unique_terms_set = true;
170  }
171  }
172  }
173  Assert(unique_terms_set);
174  return unique_terms;
175 }
176 
177 double
179 {
180  Assert(did);
181  double res = 0.0;
182  for (size_t i = 0; i < n_kids; ++i) {
183  if (plist[i]->get_docid() == did)
184  res = max(res, plist[i]->get_weight());
185  }
186  return res;
187 }
188 
189 bool
191 {
192  return (did == 0);
193 }
194 
195 double
197 {
198  max_cached = plist[0]->recalc_maxweight();
199  for (size_t i = 1; i < n_kids; ++i) {
200  max_cached = max(max_cached, plist[i]->recalc_maxweight());
201  }
202  return max_cached;
203 }
204 
205 PostList *
206 MaxPostList::next(double w_min)
207 {
208  Xapian::docid old_did = did;
209  did = 0;
210  for (size_t i = 0; i < n_kids; ++i) {
211  Xapian::docid cur_did = 0;
212  if (old_did != 0)
213  cur_did = plist[i]->get_docid();
214  if (cur_did <= old_did) {
215  PostList * res;
216  if (old_did == 0 || cur_did == old_did) {
217  res = plist[i]->next(w_min);
218  } else {
219  res = plist[i]->skip_to(old_did + 1, w_min);
220  }
221  if (res) {
222  delete plist[i];
223  plist[i] = res;
224  }
225 
226  if (plist[i]->at_end()) {
227  erase_sublist(i--);
228  continue;
229  }
230 
231  if (res)
232  matcher->recalc_maxweight();
233 
234  cur_did = plist[i]->get_docid();
235  }
236 
237  if (did == 0 || cur_did < did) {
238  did = cur_did;
239  }
240  }
241 
242  if (n_kids == 1) {
243  n_kids = 0;
244  return plist[0];
245  }
246 
247  return NULL;
248 }
249 
250 PostList *
251 MaxPostList::skip_to(Xapian::docid did_min, double w_min)
252 {
253  Xapian::docid old_did = did;
254  did = 0;
255  for (size_t i = 0; i < n_kids; ++i) {
256  Xapian::docid cur_did = 0;
257  if (old_did != 0)
258  cur_did = plist[i]->get_docid();
259  if (cur_did < did_min) {
260  PostList * res = plist[i]->skip_to(did_min, w_min);
261  if (res) {
262  delete plist[i];
263  plist[i] = res;
264  }
265 
266  if (plist[i]->at_end()) {
267  erase_sublist(i--);
268  continue;
269  }
270 
271  if (res)
272  matcher->recalc_maxweight();
273 
274  cur_did = plist[i]->get_docid();
275  }
276 
277  if (did == 0 || cur_did < did) {
278  did = cur_did;
279  }
280  }
281 
282  if (n_kids == 1) {
283  n_kids = 0;
284  return plist[0];
285  }
286 
287  return NULL;
288 }
289 
290 string
292 {
293  string desc("(");
294  desc += plist[0]->get_description();
295  for (size_t i = 1; i < n_kids; ++i) {
296  desc += " MAX ";
297  desc += plist[i]->get_description();
298  }
299  desc += ')';
300  return desc;
301 }
302 
305 {
306  Xapian::termcount totwdf = 0;
307  for (size_t i = 0; i < n_kids; ++i) {
308  if (plist[i]->get_docid() == did)
309  totwdf += plist[i]->get_wdf();
310  }
311  return totwdf;
312 }
313 
316 {
317  return 1;
318 }
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for postlists.
Definition: postlist.h:37
Xapian::docid get_docid() const
Return the current docid.
Definition: maxpostlist.cc:132
#define AssertEq(A, B)
Definition: omassert.h:124
virtual Xapian::docid get_docid() const =0
Return the current docid.
Xapian::termcount get_unique_terms() const
Return the number of unique terms in the current document.
Definition: maxpostlist.cc:158
#define usual(COND)
Definition: config.h:544
virtual Internal * skip_to(Xapian::docid did, double w_min)=0
Skip forward to the specified docid.
STL namespace.
#define rare(COND)
Definition: config.h:543
Xapian::doccount termfreq
Xapian::doccount collection_size
Number of documents in the collection.
Xapian::doccount rset_size
Number of relevant documents in the collection.
double get_weight() const
Return the weight contribution for the current position.
Definition: maxpostlist.cc:178
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
double doclength
A normalised document length.
Definition: types.h:59
Xapian::termcount get_doclength() const
Return the length of current document.
Definition: maxpostlist.cc:138
Class to hold statistics for a given collection.
PostList * skip_to(Xapian::docid, double w_min)
Skip forward to the specified docid.
Definition: maxpostlist.cc:251
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
Definition: maxpostlist.cc:42
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: maxpostlist.cc:83
Xapian::termcount collfreq
The frequencies for a term.
virtual Internal * next(double w_min)=0
Advance the current position to the next document in the postlist.
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
Definition: maxpostlist.cc:65
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
std::string get_description() const
Return a string description of this object.
Definition: maxpostlist.cc:291
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
Definition: maxpostlist.cc:52
double get_maxweight() const
Return an upper bound on what get_weight() can return.
Definition: maxpostlist.cc:126
Various assertion macros.
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Definition: maxpostlist.cc:196
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
N-way OR postlist with wt=max(wt_i)
Xapian::totallength total_length
Total length of all documents in the collection.
Xapian::doccount reltermfreq
bool at_end() const
Return true if the current position is past the last entry in this list.
Definition: maxpostlist.cc:190
Debug logging macros.
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Definition: maxpostlist.cc:315
Xapian::termcount get_wdf() const
get_wdf() for MaxPostlist returns the sum of the wdfs of the sub postlists which match the current do...
Definition: maxpostlist.cc:304