xapian-core  2.0.0
estimateop.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2022,2026 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "estimateop.h"
24 
25 #include "omassert.h"
26 #include "overflow.h"
27 
28 #include <algorithm>
29 #include <memory>
30 
31 using namespace std;
32 
35  Xapian::docid db_first,
36  Xapian::docid db_last)
37 {
38  // Clamp first and last to those from the DB - we lazily set first=1 and/or
39  // last=Xapian::docid(-1) when we don't have better bounds.
40  estimates.first = max(estimates.first, db_first);
41  estimates.last = min(estimates.last, db_last);
42 
43  // If there are gaps in the docid numbering then max_range could be greater
44  // than db_size. It might seem tempting to clamping max_range to db_size
45  // but the gaps will tend to inflate all the range lengths proportionally
46  // so the effect should tend to cancel out if we leave max_range alone.
47  Xapian::doccount max_range = estimates.last - estimates.first + 1;
48 
49  Estimates result;
50  // LocalSubMatch::resolve() checks db_size == 0 so we should be able to
51  // assume it is non-zero in all the cases below.
52  Assert(db_size);
53  switch (type) {
54  case KNOWN:
55  case POSTING_SOURCE:
56  result = estimates;
57  break;
58  case AND: {
59  result.min = max_range;
60  result.max = max_range;
61  double est = max_range;
62  auto n_subqueries = sub_estimates.size();
63  for (unsigned n = 0; n < n_subqueries; ++n) {
64  Estimates r = sub_estimates[n]->resolve(db_size, db_first, db_last);
65 
66  // The number of matching documents is minimised when we have the
67  // minimum number of matching documents from each sub-postlist, and
68  // these are maximally disjoint.
69  auto range_i = r.last - r.first + 1;
70  // min_overlap is the minimum number of documents from this subquery
71  // which have to be in the range where all the subqueries overlap.
72  Xapian::doccount min_overlap;
73  if (!sub_overflows(r.min, range_i - max_range, min_overlap)) {
74  if (!add_overflows(result.min, min_overlap, result.min) &&
75  result.min <= max_range) {
76  // It's possible there's no overlap.
77  result.min = 0;
78  } else {
79  // The pair-wise overlap is (a + b - max_range) - this works
80  // even if (a + b) overflows (a and b are each <= max_range
81  // so subtracting max_range must un-overflow us).
82  result.min -= max_range;
83  }
84  } else {
85  // This child could have nothing in the overlap.
86  result.min = 0;
87  }
88 
89  result.max = std::min(result.max, r.max);
90 
91  // We calculate the estimate assuming independence - it's the
92  // product of (est/range) for the children, multiplied by the
93  // range for the AND.
94  if (usual(range_i > 0)) {
95  est = est * r.est / range_i;
96  }
97  }
98 
99  result.est = static_cast<Xapian::doccount>(est + 0.5);
100  result.est = std::clamp(result.est, result.min, result.max);
101  break;
102  }
103  case AND_NOT: {
104  AssertEq(sub_estimates.size(), 2);
105  result = sub_estimates[0]->resolve(db_size, db_first, db_last);
106  Estimates r = sub_estimates[1]->resolve(db_size, db_first, db_last);
107 
108  // An AND_NOT with no overlap gets optimised away before now, so we
109  // know the ranges overlap.
110  Xapian::doccount overlap =
111  min(result.last, r.last) - max(result.first, r.first) + 1;
112 
113  // The maximum number of documents that could be in common.
114  auto max_actual_overlap = min(overlap, r.max);
115 
116  if (sub_overflows(result.min, max_actual_overlap, result.min)) {
117  // This AND_NOT could match no documents.
118  result.min = 0;
119  }
120 
121  Xapian::doccount range_l = result.last - result.first + 1;
122  Xapian::doccount range_r = r.last - r.first + 1;
123  // We can't match more documents than our left-side does.
124  // We also can't more documents than when the non-overlaps are
125  // full and the overlap as segregated as possible with the LHS
126  // achieving its maximum and the RHS its minimum.
127  //
128  // Note that the second expression can't overflow the type.
129  result.max = std::min(result.max, range_l - overlap + range_r - r.min);
130 
131  // Max is when:
132  // <---- LHS ------->
133  // [LLLLLLL|llllbbrr|RRRRRR]
134  // <-------RHS----->
135  //
136  // #L = range_l - overlap
137  // #R = range_r - overlap
138  // #r+#b = r_min - #R
139  // #l+#b = l_max - #L
140  // #b = rmin - #R + l_max - #L - overlap
141  // result.max = #L + #l = #L + l_max - #L -
142  // - (rmin - #R + l_max - #L - overlap)
143  // = - rmin + #R + #L + overlap)
144  // = #L + #R + overlap - rmin
145  // = range_l + range_r - overlap - rmin
146 
147  // We calculate the estimate assuming independence, but taking into
148  // account the range information.
149  double est = result.est * (1.0 - double(overlap) * r.est /
150  (double(range_l) * range_r));
151 
152  result.est = static_cast<Xapian::doccount>(est + 0.5);
153  break;
154  }
155  case OR: {
156  // FIXME: We don't currently make full use of the range information
157  // here. Trivial testing suggested it doesn't make much difference
158  // for the estimate but we should investigate more deeply as it may
159  // be useful for min and max, and perhaps for the estimate in cases
160  // we didn't try.
161  //
162  // We also only tried applying the range version of
163  // estimate_or_assuming_indep() from orpostlist.cc which works on
164  // pairswise OR multiple times. A fuller algorithm would work
165  // through the range starts and ends something like:
166  //
167  // * Sort start and ends into order and work through (at most 2*N-1
168  // regions)
169  // * For min allocate to where there's most overlap, e.g.
170  //
171  // AAAAAAAAAAAAAAAAAAAAAAA
172  // | | | aaaaaaa|aaaaa| | | |
173  // | | BBBBBBBBBBBBBBBBBBBBBBBBB | |
174  // | | | bbbbbbb|bbbbb|bbbbbb | | |
175  // | | | CCCCCCCCCCCCCCCCCCCCCCCC |
176  // | | | |ccccc|ccc | | |
177  //
178  // * For max try to allocate where there's least overlap
179  // * For est assume each range splits proportionally over the regions
180  // it participates (potentially O(n*n) in number of subqueries)
181  result = sub_estimates[0]->resolve(db_size, db_first, db_last);
182  double scale = max_range == 0.0 ? 1.0 : 1.0 / max_range;
183  double P_est = result.est * scale;
184  auto n_subqueries = sub_estimates.size();
185  for (unsigned i = 1; i != n_subqueries; ++i) {
186  Estimates r = sub_estimates[i]->resolve(db_size, db_first, db_last);
187 
188  result.min = std::max(result.min, r.min);
189 
190  // Sum(max) over subqueries but saturating at max_range.
191  if (max_range - result.max <= r.max) {
192  result.max = max_range;
193  } else {
194  result.max += r.max;
195  }
196 
197  // We calculate the estimate assuming independence. The simplest
198  // way to calculate this seems to be a series of (n_subqueries - 1)
199  // pairwise calculations, which gives the same answer regardless of
200  // the order.
201  double P_i = r.est * scale;
202  P_est += P_i - P_est * P_i;
203  }
204 
205  result.est = static_cast<Xapian::doccount>(P_est * max_range + 0.5);
206  break;
207  }
208  case XOR: {
209  // FIXME: We don't currently make full use of the range information
210  // here.
211  double scale = max_range == 0.0 ? 1.0 : 1.0 / max_range;
212 
213  bool all_exact = true;
214  bool invert = false;
215 
216  unsigned max_overflow = 0;
217  Xapian::doccount max_sum = 0;
218 
219  // We calculate the estimate assuming independence. The simplest
220  // way to calculate this seems to be a series of (n_subqueries - 1)
221  // pairwise calculations, which gives the same answer regardless of the
222  // order.
223  double P_est = 0.0;
224 
225  unsigned j = 0;
226 
227  // Need to buffer min and max values from subqueries so we can compute
228  // our min as a second pass.
229  auto n_subqueries = sub_estimates.size();
230  unique_ptr<Estimates[]> min_and_max{new Estimates[n_subqueries]};
231  for (unsigned i = 0; i < n_subqueries; ++i) {
232  min_and_max[j] =
233  sub_estimates[i]->resolve(db_size, db_first, db_last);
234  const Estimates& r = min_and_max[j];
235 
236  if (r.min == db_size) {
237  // A subquery matches all documents which just inverts which
238  // documents match - note that but otherwise ignore this
239  // subquery.
240  invert = !invert;
241  } else {
242  // Maximum is if all sub-postlists are disjoint.
243  Xapian::doccount max_i = r.max;
244  if (add_overflows(max_sum, max_i, max_sum)) {
245  // Track how many times we overflow the type.
246  ++max_overflow;
247  }
248  all_exact = all_exact && (max_i == r.min);
249 
250  double P_i = r.est * scale;
251  P_est += P_i - 2.0 * P_est * P_i;
252 
253  ++j;
254  }
255  }
256 
257  if (max_overflow || max_sum > db_size) {
258  if (all_exact) {
259  // If the sub-postlist tfs are all exact, then if the sum of
260  // them has a different odd/even-ness to db_size then max tf of
261  // the XOR can't achieve db_size.
262  result.max = db_size - ((max_sum & 1) != (db_size & 1));
263  } else {
264  result.max = db_size;
265  }
266  } else {
267  result.max = max_sum;
268  }
269 
270  result.est = static_cast<Xapian::doccount>(P_est * max_range + 0.5);
271 
272  // Calculate min.
273  Xapian::doccount min = 0;
274  if (max_overflow <= 1) {
275  // If min_i(i) > sum(j!=i)(max_i(j)) then all the other subqueries
276  // can't cancel out subquery i. If we overflowed more than once,
277  // then the true value of max_sum is greater than the maximum
278  // possible tf, so there's no point checking.
279  for (unsigned i = 0; i < j; ++i) {
280  const Estimates& r = min_and_max[i];
281  Xapian::doccount all_the_rest = max_sum - r.max;
282  // If no overflow, or we un-overflowed again...
283  if (max_overflow == 0 || all_the_rest > max_sum) {
284  if (r.min > all_the_rest) {
285  min = std::max(min, r.min - all_the_rest);
286  }
287  }
288  }
289  }
290  if (all_exact && min == 0) {
291  // If all the subqueries termfreqs are exact then the parity of
292  // result.min must match the parity of max_sum. That's already
293  // ensured unless min was clamped to zero in which case result.min
294  // should be 1 if max_sum is odd.
295  result.min = max_sum & 1;
296  } else {
297  result.min = min;
298  }
299 
300  if (invert) {
301  auto old_min = result.min;
302  result.min = db_size - result.max;
303  result.est = db_size - result.est;
304  result.max = db_size - old_min;
305  }
306  break;
307  }
308  case DECIDER:
309  case NEAR:
310  case PHRASE:
311  case EXACT_PHRASE: {
312  result = sub_estimates[0]->resolve(db_size, db_first, db_last);
313  if (estimates.min == 0 && estimates.max == 0) {
314  result.min = 0;
315  // We've arranged for type's value to be the factor we want to
316  // scale by in the absence of accept/reject counts.
317  result.est /= unsigned(type);
318  break;
319  }
320  result.min = estimates.min;
321  result.max -= estimates.max;
322  double scale = double(estimates.min) / (estimates.min + estimates.max);
323  result.est = Xapian::doccount(result.est * scale + 0.5);
324  result.est = std::clamp(result.est, result.min, result.max);
325  break;
326  }
327 #if defined __GNUC__ && !defined __clang__
328  default:
329  // Without this, GCC (noted with 13.3.0 and 15.2.0) incorrectly warns
330  // that result.max may be used uninitialised.
331  //
332  // It seems unhelpful to add a default initialisation as that would
333  // mean compilers wouldn't warn if we genuinely failed to initialise.
334  //
335  // Adding this default case unconditionally seems similarly unhelpful
336  // as it would prevent compilers warning if an enum value isn't handled
337  // by this switch.
338  //
339  // (GCC now won't warn about the latter, but at least other compilers
340  // still can.)
341  __builtin_unreachable();
342 #endif
343  }
344  AssertRel(result.min, <=, result.est);
345  AssertRel(result.est, <=, result.max);
346 
347  result.first = estimates.first;
348  result.last = estimates.last;
349 
350  // The range size is an upper bound for max and est. This is redundant in
351  // some cases (for example it isn't needed for AND) but it's very cheap and
352  // simpler to always ensure this.
353  if (max_range < result.max) {
354  result.max = max_range;
355  if (max_range < result.est) {
356  result.est = max_range;
357  }
358  }
359  return result;
360 }
Estimates resolve(Xapian::doccount db_size, Xapian::docid db_first, Xapian::docid db_last)
Definition: estimateop.cc:34
#define usual(COND)
Definition: config.h:608
Calculated bounds on and estimate of number of matches.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Arithmetic operations with overflow checks.
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > sub_overflows(T1 a, T2 b, R &res)
Subtraction with overflow checking.
Definition: overflow.h:125
std::enable_if_t< std::is_unsigned_v< T1 > &&std::is_unsigned_v< T2 > &&std::is_unsigned_v< R >, bool > add_overflows(T1 a, T2 b, R &res)
Addition with overflow checking.
Definition: overflow.h:58
#define OR
#define XOR
#define NEAR
#define AND
Xapian::doccount est
Definition: estimateop.h:33
Xapian::docid last
Upper bound on docids matched.
Definition: estimateop.h:39
Xapian::docid first
Lower bound on docids matched.
Definition: estimateop.h:36
Xapian::doccount min
Definition: estimateop.h:33
Xapian::doccount max
Definition: estimateop.h:33