xapian-core  2.0.0
estimateop.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2022,2025,2026 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_ESTIMATEOP_H
22 #define XAPIAN_INCLUDED_ESTIMATEOP_H
23 
24 #include <memory>
25 
26 #include "xapian/types.h"
27 
28 #include "api/smallvector.h"
29 #include "omassert.h"
30 #include "backends/postlist.h"
31 
32 struct Estimates {
34 
37 
40 
41  Estimates() { }
42 
44  Xapian::doccount est_,
45  Xapian::doccount max_,
46  Xapian::doccount first_ = 1,
47  Xapian::doccount last_ = Xapian::docid(-1))
48  : min(min_), est(est_), max(max_), first(first_), last(last_) { }
49 };
50 
51 // Clean up Microsoft namespace pollution.
52 #ifdef NEAR
53 # undef NEAR
54 #endif
55 
64 class EstimateOp {
65  public:
66  enum op_type {
68  // In the absence of accept/reject counts we just scale the AND by
69  // dividing by these values:
70  DECIDER = 1,
71  NEAR = 2, PHRASE = 3, EXACT_PHRASE = 4,
74  };
75 
76  private:
78 
90 
92 
93  public:
96  Xapian::docid first, Xapian::docid last)
97  : type(KNOWN), estimates(tf_, tf_, tf_, first, last) { }
98 
101  : type(POSTING_SOURCE) { }
102 
104  EstimateOp(Estimates estimates_)
105  : type(KNOWN), estimates(estimates_) { }
106 
109  : type(KNOWN), estimates(tf_, tf_, tf_) { }
110 
113  Xapian::VecUniquePtr<EstimateOp>&& sub_estimates_)
114  : type(type_), sub_estimates(std::move(sub_estimates_)) {
115  estimates.first = first;
116  estimates.last = last;
117  }
118 
121  std::unique_ptr<EstimateOp>&& est1,
122  std::unique_ptr<EstimateOp>&& est2)
123  : type(type_) {
124  sub_estimates.push_back(est1.release());
125  sub_estimates.push_back(est2.release());
126  estimates.first = first;
127  estimates.last = last;
128  }
129 
134  EstimateOp(op_type type_, EstimateOp* sub_estimate)
135  : type(type_), estimates(0, 0, 0) {
136  sub_estimates.push_back(sub_estimate);
137  }
138 
144  estimates.first = first;
145  }
146 
147  void report_ratio(Xapian::doccount accepted, Xapian::doccount rejected) {
148  Assert(type == DECIDER ||
149  type == NEAR ||
150  type == PHRASE ||
151  type == EXACT_PHRASE);
152  // Store ratio to use later.
153  estimates.min = accepted;
154  estimates.max = rejected;
155  }
156 
159  Xapian::doccount rejected) {
160  AssertEq(type, KNOWN);
161 
162  // Degenerate range case.
163  if (estimates.min == estimates.max) return;
164 
165  // The static min is 0.
166  AssertEq(estimates.min, 0);
167  estimates.min = accepted;
168 
169  // Combine the static estimate already in estimate.est with a dynamic
170  // estimate based on accepted/rejected ratio using a weighted average
171  // based on the proportion of value_freq actually looked at.
172  auto est = double(estimates.max - accepted - rejected);
173  est = est / estimates.max * estimates.est;
174  estimates.est = accepted + Xapian::doccount(est + 0.5);
175 
176  // The static max is the value slot frequency, so every reject can
177  // be removed from that.
178  estimates.max -= rejected;
179 
180  // We shouldn't need to clamp here to ensure the invariant.
183  }
184 
187  Xapian::doccount est,
188  Xapian::doccount max_) {
190  estimates.min = min_;
191  estimates.est = est;
192  estimates.max = max_;
193  estimates.first = 1;
195  }
196 
198  Xapian::docid db_first,
199  Xapian::docid db_last);
200 
201  unsigned get_subquery_count() const { return sub_estimates.size(); }
202 };
203 
204 namespace Xapian::Internal {
205 
207  // Note that pl may hold a pointer to est and pl's destructor may try to
208  // report stats via that pointer, so the order of destruction of pl and
209  // est matters. We don't currently enforce that here, but instead the
210  // stats reporting is skipped if no matching has yet occurred, which is
211  // enough to avoid calls to a deleted est (after matching we carefully
212  // arrange to destroy the postlist tree before the estimates).
213  //
214  // FIXME: It would be cleaner to enforce that pl gets destroyed first here,
215  // but that seems to require that this struct owns the pl which requires a
216  // significant refactor.
217  PostList* pl = nullptr;
218 
219  std::unique_ptr<EstimateOp> est;
220 
222 
224  : pl(pl_), est(est_) { }
225 
226  PostListAndEstimate(PostList* pl_, std::unique_ptr<EstimateOp>&& est_)
227  : pl(pl_), est(std::move(est_)) { }
228 };
229 
230 }
231 
233 
234 #endif // XAPIAN_INCLUDED_ESTIMATEOP_H
Class for estimating the total number of matching documents.
Definition: estimateop.h:64
EstimateOp()
PostingSource.
Definition: estimateop.h:100
EstimateOp(op_type type_, EstimateOp *sub_estimate)
DECIDER, NEAR, PHRASE or EXACT_PHRASE.
Definition: estimateop.h:134
EstimateOp(Xapian::doccount tf_)
Value range degenerate case.
Definition: estimateop.h:108
EstimateOp(op_type type_, Xapian::docid first, Xapian::docid last, Xapian::VecUniquePtr< EstimateOp > &&sub_estimates_)
AND, AND_NOT, OR or XOR.
Definition: estimateop.h:112
unsigned get_subquery_count() const
Definition: estimateop.h:201
void report_ratio(Xapian::doccount accepted, Xapian::doccount rejected)
Definition: estimateop.h:147
void report_termfreqs(Xapian::doccount min_, Xapian::doccount est, Xapian::doccount max_)
Fill in estimates for POSTING_SOURCE.
Definition: estimateop.h:186
EstimateOp(Estimates estimates_)
Value range.
Definition: estimateop.h:104
Estimates estimates
Estimates.
Definition: estimateop.h:89
void report_first(Xapian::docid first)
Report the first docid indexed.
Definition: estimateop.h:143
@ POSTING_SOURCE
Definition: estimateop.h:72
@ EXACT_PHRASE
Definition: estimateop.h:71
EstimateOp(op_type type_, Xapian::docid first, Xapian::docid last, std::unique_ptr< EstimateOp > &&est1, std::unique_ptr< EstimateOp > &&est2)
AND, AND_NOT, OR or XOR (pair-wise).
Definition: estimateop.h:120
op_type type
Definition: estimateop.h:77
EstimateOp(Xapian::doccount tf_, Xapian::docid first, Xapian::docid last)
Leaf term.
Definition: estimateop.h:95
Xapian::VecUniquePtr< EstimateOp > sub_estimates
Definition: estimateop.h:91
Estimates resolve(Xapian::doccount db_size, Xapian::docid db_first, Xapian::docid db_last)
Definition: estimateop.cc:34
void report_range_ratio(Xapian::doccount accepted, Xapian::doccount rejected)
Adjust static estimates for value range.
Definition: estimateop.h:158
Abstract base class for postlists.
Definition: postlist.h:40
Suitable for "simple" type T.
Definition: smallvector.h:62
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for postlists.
Custom vector implementations using small vector optimisation.
Xapian::doccount est
Definition: estimateop.h:33
Xapian::docid last
Upper bound on docids matched.
Definition: estimateop.h:39
Xapian::docid first
Lower bound on docids matched.
Definition: estimateop.h:36
Xapian::doccount min
Definition: estimateop.h:33
Estimates(Xapian::doccount min_, Xapian::doccount est_, Xapian::doccount max_, Xapian::doccount first_=1, Xapian::doccount last_=Xapian::docid(-1))
Definition: estimateop.h:43
Xapian::doccount max
Definition: estimateop.h:33
PostListAndEstimate(PostList *pl_, std::unique_ptr< EstimateOp > &&est_)
Definition: estimateop.h:226
PostListAndEstimate(PostList *pl_, EstimateOp *est_)
Definition: estimateop.h:223
std::unique_ptr< EstimateOp > est
Definition: estimateop.h:219
typedefs for Xapian