xapian-core  1.4.27
multixorpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2009,2010,2011,2012,2016 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "multixorpostlist.h"
25 
26 #include "debuglog.h"
27 #include "omassert.h"
28 
29 #include <algorithm>
30 
31 using namespace std;
32 
34 {
35  if (plist) {
36  for (size_t i = 0; i < n_kids; ++i) {
37  delete plist[i];
38  }
39  delete [] plist;
40  }
41 }
42 
45 {
46  Xapian::doccount result = 0;
47  Xapian::doccount max = plist[0]->get_termfreq_max();
48  Xapian::doccount sum = max;
49  bool all_exact = (max == plist[0]->get_termfreq_min());
50  unsigned overflow = 0;
51  for (size_t i = 1; i < n_kids; ++i) {
52  Xapian::doccount tf_max = plist[i]->get_termfreq_max();
53  if (tf_max > max) max = tf_max;
54 
55  Xapian::doccount old_sum = sum;
56  sum += tf_max;
57  // Track how many times we overflow the type.
58  if (sum < old_sum)
59  ++overflow;
60  if (all_exact)
61  all_exact = (tf_max == plist[i]->get_termfreq_min());
62  }
63 
64  // If tf_min(i) > sum(j!=i)(tf_max(j)) then all the other subqueries
65  // can't cancel out subquery i. If we overflowed more than once,
66  // then the sum on the right is greater than the maximum possible
67  // tf, so there's no point checking.
68  if (overflow <= 1) {
69  for (size_t i = 0; i < n_kids; ++i) {
70  Xapian::doccount tf_min = plist[i]->get_termfreq_min();
71  Xapian::doccount tf_max = plist[i]->get_termfreq_max();
72 
73  Xapian::doccount all_the_rest = sum - tf_max;
74  // If no overflow, or we un-overflowed again...
75  if (overflow == 0 || all_the_rest > sum) {
76  if (tf_min > all_the_rest) {
77  result = std::max(result, tf_min - all_the_rest);
78  }
79  }
80  }
81  }
82 
83  if (all_exact && result == 0) {
84  // If SUM odd, then the XOR can't be 0, so min XOR is 1 if we didn't
85  // already calculate a minimum.
86  result = sum & 1;
87  }
88 
89  return result;
90 }
91 
94 {
95  // Maximum is if all sub-postlists are disjoint.
96  Xapian::doccount result = plist[0]->get_termfreq_max();
97  bool all_exact = (result == plist[0]->get_termfreq_min());
98  bool overflow = false;
99  for (size_t i = 1; i < n_kids; ++i) {
100  Xapian::doccount tf_max = plist[i]->get_termfreq_max();
101  Xapian::doccount old_result = result;
102  result += tf_max;
103  // Catch overflowing the type too.
104  if (result < old_result)
105  overflow = true;
106  if (all_exact)
107  all_exact = (tf_max == plist[i]->get_termfreq_min());
108  if (!all_exact && (overflow || result >= db_size))
109  return db_size;
110  }
111  if (all_exact && (overflow || result > db_size)) {
112  // If the sub-postlist tfs are all exact, then if the sum of them has
113  // a different odd/even-ness to db_size then max tf of the XOR can't
114  // achieve db_size.
115  return db_size - ((result & 1) != (db_size & 1));
116  }
117  return result;
118 }
119 
122 {
123  LOGCALL(MATCH, Xapian::doccount, "MultiXorPostList::get_termfreq_est", NO_ARGS);
124  // We shortcut an empty shard and avoid creating a postlist tree for it.
125  Assert(db_size);
126  // We calculate the estimate assuming independence. The simplest
127  // way to calculate this seems to be a series of (n_kids - 1) pairwise
128  // calculations, which gives the same answer regardless of the order.
129  double scale = 1.0 / db_size;
130  double P_est = plist[0]->get_termfreq_est() * scale;
131  for (size_t i = 1; i < n_kids; ++i) {
132  double P_i = plist[i]->get_termfreq_est() * scale;
133  P_est += P_i - 2.0 * P_est * P_i;
134  }
135  return static_cast<Xapian::doccount>(P_est * db_size + 0.5);
136 }
137 
138 TermFreqs
140  const Xapian::Weight::Internal & stats) const
141 {
142  LOGCALL(MATCH, TermFreqs, "MultiXorPostList::get_termfreq_est_using_stats", stats);
143  // We calculate the estimate assuming independence. The simplest
144  // way to calculate this seems to be a series of (n_kids - 1) pairwise
145  // calculations, which gives the same answer regardless of the order.
146  TermFreqs freqs(plist[0]->get_termfreq_est_using_stats(stats));
147 
148  // Our caller should have ensured this.
149  Assert(stats.collection_size);
150  double scale = 1.0 / stats.collection_size;
151  double P_est = freqs.termfreq * scale;
152  double rtf_scale = 0.0;
153  if (stats.rset_size != 0) {
154  rtf_scale = 1.0 / stats.rset_size;
155  }
156  double Pr_est = freqs.reltermfreq * rtf_scale;
157  // If total_length is 0, cf must always be 0 so cf_scale is irrelevant.
158  double cf_scale = 0.0;
159  if (usual(stats.total_length != 0)) {
160  cf_scale = 1.0 / stats.total_length;
161  }
162  double Pc_est = freqs.collfreq * cf_scale;
163 
164  for (size_t i = 1; i < n_kids; ++i) {
165  freqs = plist[i]->get_termfreq_est_using_stats(stats);
166  double P_i = freqs.termfreq * scale;
167  P_est += P_i - 2.0 * P_est * P_i;
168  double Pc_i = freqs.collfreq * cf_scale;
169  Pc_est += Pc_i - 2.0 * Pc_est * Pc_i;
170  // If the rset is empty, Pr_est should be 0 already, so leave
171  // it alone.
172  if (stats.rset_size != 0) {
173  double Pr_i = freqs.reltermfreq * rtf_scale;
174  Pr_est += Pr_i - 2.0 * Pr_est * Pr_i;
175  }
176  }
177  RETURN(TermFreqs(Xapian::doccount(P_est * stats.collection_size + 0.5),
178  Xapian::doccount(Pr_est * stats.rset_size + 0.5),
179  Xapian::termcount(Pc_est * stats.total_length + 0.5)));
180 }
181 
182 double
184 {
185  LOGCALL(MATCH, double, "MultiXorPostList::get_maxweight", NO_ARGS);
186  RETURN(max_total);
187 }
188 
191 {
192  return did;
193 }
194 
197 {
198  Assert(did);
200  bool doclength_set = false;
201  for (size_t i = 0; i < n_kids; ++i) {
202  if (plist[i]->get_docid() == did) {
203  if (doclength_set) {
204  AssertEq(doclength, plist[i]->get_doclength());
205  } else {
206  doclength = plist[i]->get_doclength();
207  doclength_set = true;
208  }
209  }
210  }
211  Assert(doclength_set);
212  return doclength;
213 }
214 
217 {
218  Assert(did);
219  Xapian::termcount unique_terms = 0;
220  bool unique_terms_set = false;
221  for (size_t i = 0; i < n_kids; ++i) {
222  if (plist[i]->get_docid() == did) {
223  if (unique_terms_set) {
224  AssertEq(unique_terms, plist[i]->get_unique_terms());
225  } else {
226  unique_terms = plist[i]->get_unique_terms();
227  unique_terms_set = true;
228  }
229  }
230  }
231  Assert(unique_terms_set);
232  return unique_terms;
233 }
234 
235 double
237 {
238  Assert(did);
239  double result = 0;
240  for (size_t i = 0; i < n_kids; ++i) {
241  if (plist[i]->get_docid() == did)
242  result += plist[i]->get_weight();
243  }
244  return result;
245 }
246 
247 bool
249 {
250  return (did == 0);
251 }
252 
253 double
255 {
256  LOGCALL(MATCH, double, "MultiXorPostList::recalc_maxweight", NO_ARGS);
257  max_total = plist[0]->recalc_maxweight();
258  double min_max = max_total;
259  for (size_t i = 1; i < n_kids; ++i) {
260  double new_max = plist[i]->recalc_maxweight();
261  if (new_max < min_max)
262  min_max = new_max;
263  max_total += new_max;
264  }
265  if ((n_kids & 1) == 0) {
266  // If n_kids is even, we omit the child with the smallest maxweight.
267  max_total -= min_max;
268  }
269  RETURN(max_total);
270 }
271 
272 PostList *
274 {
275  LOGCALL(MATCH, PostList *, "MultiXorPostList::next", w_min);
276  Xapian::docid old_did = did;
277  did = 0;
278  size_t matching_count = 0;
279  for (size_t i = 0; i < n_kids; ++i) {
280  if (old_did == 0 || plist[i]->get_docid() <= old_did) {
281  // FIXME: calculate the min weight required here...
282  PostList * res = plist[i]->next(0);
283  if (res) {
284  delete plist[i];
285  plist[i] = res;
286  matcher->recalc_maxweight();
287  }
288 
289  if (plist[i]->at_end()) {
290  erase_sublist(i--);
291  continue;
292  }
293  }
294 
295  Xapian::docid new_did = plist[i]->get_docid();
296  if (did == 0 || new_did < did) {
297  did = new_did;
298  matching_count = 1;
299  } else if (new_did == did) {
300  ++matching_count;
301  }
302  }
303 
304  if (n_kids == 1) {
305  n_kids = 0;
306  RETURN(plist[0]);
307  }
308 
309  // We've reached the end of all posting lists.
310  if (did == 0)
311  RETURN(NULL);
312 
313  // An odd number of sub-postlists match this docid, so the XOR matches.
314  if (matching_count & 1)
315  RETURN(NULL);
316 
317  // An even number of sub-postlists match this docid, so advance again.
318  RETURN(next(w_min));
319 }
320 
321 PostList *
323 {
324  LOGCALL(MATCH, PostList *, "MultiXorPostList::skip_to", did_min | w_min);
325  Xapian::docid old_did = did;
326  did = 0;
327  size_t matching_count = 0;
328  for (size_t i = 0; i < n_kids; ++i) {
329  if (old_did == 0 || plist[i]->get_docid() < did_min) {
330  // FIXME: calculate the min weight required here...
331  PostList * res = plist[i]->skip_to(did_min, 0);
332  if (res) {
333  delete plist[i];
334  plist[i] = res;
335  matcher->recalc_maxweight();
336  }
337 
338  if (plist[i]->at_end()) {
339  erase_sublist(i--);
340  continue;
341  }
342  }
343 
344  Xapian::docid new_did = plist[i]->get_docid();
345  if (did == 0 || new_did < did) {
346  did = new_did;
347  matching_count = 1;
348  } else if (new_did == did) {
349  ++matching_count;
350  }
351  }
352 
353  if (n_kids == 1) {
354  AssertEq(matching_count, 1);
355  n_kids = 0;
356  RETURN(plist[0]);
357  }
358 
359  // We've reached the end of all posting lists.
360  if (did == 0)
361  RETURN(NULL);
362 
363  // An odd number of sub-postlists match this docid, so the XOR matches.
364  if (matching_count & 1)
365  RETURN(NULL);
366 
367  // An even number of sub-postlists match this docid, so call next.
368  RETURN(next(w_min));
369 }
370 
371 string
373 {
374  string desc("(");
375  desc += plist[0]->get_description();
376  for (size_t i = 1; i < n_kids; ++i) {
377  desc += " XOR ";
378  desc += plist[i]->get_description();
379  }
380  desc += ')';
381  return desc;
382 }
383 
386 {
387  Xapian::termcount totwdf = 0;
388  for (size_t i = 0; i < n_kids; ++i) {
389  if (plist[i]->get_docid() == did)
390  totwdf += plist[i]->get_wdf();
391  }
392  return totwdf;
393 }
394 
397 {
398  Xapian::termcount total = 0;
399  for (size_t i = 0; i < n_kids; ++i) {
400  if (plist[i]->get_docid() == did)
401  total += plist[i]->count_matching_subqs();
402  }
403  return total;
404 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Abstract base class for postlists.
Definition: postlist.h:37
bool at_end() const
Return true if the current position is past the last entry in this list.
N-way XOR postlist.
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
#define AssertEq(A, B)
Definition: omassert.h:124
#define usual(COND)
Definition: config.h:576
Xapian::docid get_docid() const
Return the current docid.
virtual Internal * skip_to(Xapian::docid did, double w_min)=0
Skip forward to the specified docid.
virtual double recalc_maxweight()=0
Recalculate the upper bound on what get_weight() can return.
STL namespace.
Xapian::doccount termfreq
Xapian::doccount collection_size
Number of documents in the collection.
Xapian::doccount rset_size
Number of relevant documents in the collection.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
double doclength
A normalised document length.
Definition: types.h:59
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
Xapian::termcount collfreq
Xapian::termcount get_unique_terms() const
Return the number of unique terms in the current document.
double get_weight() const
Return the weight contribution for the current position.
PostList * skip_to(Xapian::docid, double w_min)
Skip forward to the specified docid.
The frequencies for a term.
virtual Internal * next(double w_min)=0
Advance the current position to the next document in the postlist.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount get_doclength() const
Return the length of current document.
std::string get_description() const
Return a string description of this object.
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Xapian::totallength total_length
Total length of all documents in the collection.
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Xapian::doccount reltermfreq
Xapian::termcount get_wdf() const
get_wdf() for MultiXorPostlists returns the sum of the wdfs of the sub postlists which match the curr...
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487
double get_maxweight() const
Return an upper bound on what get_weight() can return.