xapian-core  1.4.25
maxpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2009,2010,2011,2012,2013,2014 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "maxpostlist.h"
25 
26 #include "debuglog.h"
27 #include "omassert.h"
28 
29 using namespace std;
30 
32 {
33  if (plist) {
34  for (size_t i = 0; i < n_kids; ++i) {
35  delete plist[i];
36  }
37  delete [] plist;
38  }
39 }
40 
43 {
44  Xapian::doccount res = plist[0]->get_termfreq_min();
45  for (size_t i = 1; i < n_kids; ++i) {
46  res = max(res, plist[i]->get_termfreq_min());
47  }
48  return res;
49 }
50 
53 {
54  Xapian::doccount res = plist[0]->get_termfreq_max();
55  for (size_t i = 1; i < n_kids; ++i) {
56  Xapian::doccount c = plist[i]->get_termfreq_max();
57  if (db_size - res <= c)
58  return db_size;
59  res += c;
60  }
61  return res;
62 }
63 
66 {
67  // We shortcut an empty shard and avoid creating a postlist tree for it.
68  Assert(db_size);
69 
70  // We calculate the estimate assuming independence. The simplest
71  // way to calculate this seems to be a series of (n_kids - 1) pairwise
72  // calculations, which gives the same answer regardless of the order.
73  double scale = 1.0 / db_size;
74  double P_est = plist[0]->get_termfreq_est() * scale;
75  for (size_t i = 1; i < n_kids; ++i) {
76  double P_i = plist[i]->get_termfreq_est() * scale;
77  P_est += P_i - P_est * P_i;
78  }
79  return static_cast<Xapian::doccount>(P_est * db_size + 0.5);
80 }
81 
82 double
84 {
85  return max_cached;
86 }
87 
90 {
91  return did;
92 }
93 
96 {
97  Assert(did);
99  bool doclength_set = false;
100  for (size_t i = 0; i < n_kids; ++i) {
101  if (plist[i]->get_docid() == did) {
102  if (doclength_set) {
103  AssertEq(doclength, plist[i]->get_doclength());
104  } else {
105  doclength = plist[i]->get_doclength();
106  doclength_set = true;
107  }
108  }
109  }
110  Assert(doclength_set);
111  return doclength;
112 }
113 
116 {
117  Assert(did);
118  Xapian::termcount unique_terms = 0;
119  bool unique_terms_set = false;
120  for (size_t i = 0; i < n_kids; ++i) {
121  if (plist[i]->get_docid() == did) {
122  if (unique_terms_set) {
123  AssertEq(unique_terms, plist[i]->get_unique_terms());
124  } else {
125  unique_terms = plist[i]->get_unique_terms();
126  unique_terms_set = true;
127  }
128  }
129  }
130  Assert(unique_terms_set);
131  return unique_terms;
132 }
133 
134 double
136 {
137  Assert(did);
138  double res = 0.0;
139  for (size_t i = 0; i < n_kids; ++i) {
140  if (plist[i]->get_docid() == did)
141  res = max(res, plist[i]->get_weight());
142  }
143  return res;
144 }
145 
146 bool
148 {
149  return (did == 0);
150 }
151 
152 double
154 {
155  max_cached = plist[0]->recalc_maxweight();
156  for (size_t i = 1; i < n_kids; ++i) {
157  max_cached = max(max_cached, plist[i]->recalc_maxweight());
158  }
159  return max_cached;
160 }
161 
162 PostList *
163 MaxPostList::next(double w_min)
164 {
165  Xapian::docid old_did = did;
166  did = 0;
167  for (size_t i = 0; i < n_kids; ++i) {
168  Xapian::docid cur_did = 0;
169  if (old_did != 0)
170  cur_did = plist[i]->get_docid();
171  if (cur_did <= old_did) {
172  PostList * res;
173  if (old_did == 0 || cur_did == old_did) {
174  res = plist[i]->next(w_min);
175  } else {
176  res = plist[i]->skip_to(old_did + 1, w_min);
177  }
178  if (res) {
179  delete plist[i];
180  plist[i] = res;
181  }
182 
183  if (plist[i]->at_end()) {
184  erase_sublist(i--);
185  continue;
186  }
187 
188  if (res)
189  matcher->recalc_maxweight();
190 
191  cur_did = plist[i]->get_docid();
192  }
193 
194  if (did == 0 || cur_did < did) {
195  did = cur_did;
196  }
197  }
198 
199  if (n_kids == 1) {
200  n_kids = 0;
201  return plist[0];
202  }
203 
204  return NULL;
205 }
206 
207 PostList *
208 MaxPostList::skip_to(Xapian::docid did_min, double w_min)
209 {
210  Xapian::docid old_did = did;
211  did = 0;
212  for (size_t i = 0; i < n_kids; ++i) {
213  Xapian::docid cur_did = 0;
214  if (old_did != 0)
215  cur_did = plist[i]->get_docid();
216  if (cur_did < did_min) {
217  PostList * res = plist[i]->skip_to(did_min, w_min);
218  if (res) {
219  delete plist[i];
220  plist[i] = res;
221  }
222 
223  if (plist[i]->at_end()) {
224  erase_sublist(i--);
225  continue;
226  }
227 
228  if (res)
229  matcher->recalc_maxweight();
230 
231  cur_did = plist[i]->get_docid();
232  }
233 
234  if (did == 0 || cur_did < did) {
235  did = cur_did;
236  }
237  }
238 
239  if (n_kids == 1) {
240  n_kids = 0;
241  return plist[0];
242  }
243 
244  return NULL;
245 }
246 
247 string
249 {
250  string desc("(");
251  desc += plist[0]->get_description();
252  for (size_t i = 1; i < n_kids; ++i) {
253  desc += " MAX ";
254  desc += plist[i]->get_description();
255  }
256  desc += ')';
257  return desc;
258 }
259 
262 {
263  Xapian::termcount totwdf = 0;
264  for (size_t i = 0; i < n_kids; ++i) {
265  if (plist[i]->get_docid() == did)
266  totwdf += plist[i]->get_wdf();
267  }
268  return totwdf;
269 }
270 
273 {
274  return 1;
275 }
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for postlists.
Definition: postlist.h:37
Xapian::docid get_docid() const
Return the current docid.
Definition: maxpostlist.cc:89
#define AssertEq(A, B)
Definition: omassert.h:124
virtual Xapian::docid get_docid() const =0
Return the current docid.
Xapian::termcount get_unique_terms() const
Return the number of unique terms in the current document.
Definition: maxpostlist.cc:115
virtual Internal * skip_to(Xapian::docid did, double w_min)=0
Skip forward to the specified docid.
STL namespace.
double get_weight() const
Return the weight contribution for the current position.
Definition: maxpostlist.cc:135
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
double doclength
A normalised document length.
Definition: types.h:59
Xapian::termcount get_doclength() const
Return the length of current document.
Definition: maxpostlist.cc:95
PostList * skip_to(Xapian::docid, double w_min)
Skip forward to the specified docid.
Definition: maxpostlist.cc:208
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
Definition: maxpostlist.cc:42
virtual Internal * next(double w_min)=0
Advance the current position to the next document in the postlist.
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
Definition: maxpostlist.cc:65
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
std::string get_description() const
Return a string description of this object.
Definition: maxpostlist.cc:248
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
Definition: maxpostlist.cc:52
double get_maxweight() const
Return an upper bound on what get_weight() can return.
Definition: maxpostlist.cc:83
Various assertion macros.
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Definition: maxpostlist.cc:153
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
N-way OR postlist with wt=max(wt_i)
bool at_end() const
Return true if the current position is past the last entry in this list.
Definition: maxpostlist.cc:147
Debug logging macros.
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Definition: maxpostlist.cc:272
Xapian::termcount get_wdf() const
get_wdf() for MaxPostlist returns the sum of the wdfs of the sub postlists which match the current do...
Definition: maxpostlist.cc:261