xapian-core  2.0.0
orpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright 2017,2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "orpostlist.h"
24 
25 #include "andmaybepostlist.h"
26 #include "andpostlist.h"
27 #include "min_non_zero.h"
28 #include "postlisttree.h"
29 
30 #include <algorithm>
31 
32 using namespace std;
33 
34 template<typename T>
35 static void
36 estimate_or_assuming_indep(double a, double b, double n, T& res)
37 {
38  Assert(n != 0.0);
39  res = static_cast<T>(a + b - (a * b / n) + 0.5);
40 }
41 
42 template<typename T>
43 static void
44 estimate_or_assuming_indep(double a, double af, double al,
45  double b, double bf, double bl,
46  T& res)
47 {
48  // Clamp estimates to range lengths.
49  a = min(a, al - af + 1.0);
50  b = min(b, bl - bf + 1.0);
51  AssertRel(a,>=,0);
52  AssertRel(b,>=,0);
53 
54  if (al < bf || bl < af) {
55  // Disjoint ranges.
56  res = static_cast<T>(a + b + 0.5);
57  return;
58  }
59 
60  // Arrange for af <= bf.
61  if (af > bf) {
62  swap(a, b);
63  swap(af, bf);
64  swap(al, bl);
65  }
66 
67  // Arrange for al <= bl.
68  if (al > bl) {
69  bf += (al - bl);
70  bl = al;
71  }
72 
73  double arate = a / (al - af + 1);
74  double brate = b / (bl - bf + 1);
75 
76  double r = arate * (bf - af) +
77  brate * (bl - al) +
78  (arate + brate - arate * brate) * (al - bf + 1);
79  res = static_cast<T>(r + 0.5);
80 }
81 
83  PostListTree* pltree_)
84  : l(left), r(right), pltree(pltree_)
85 {
86  auto l_tf_est = l->get_termfreq();
87  auto r_tf_est = r->get_termfreq();
88  Xapian::docid l_first = 1, l_last = Xapian::docid(-1);
89  Xapian::docid r_first = 1, r_last = Xapian::docid(-1);
90  l->get_docid_range(l_first, l_last);
91  r->get_docid_range(r_first, r_last);
92  if (l_last < l_first) {
93  l_last = 0;
94  l_first = 1;
95  }
96  if (r_last < r_first) {
97  r_last = 0;
98  r_first = 1;
99  }
100  estimate_or_assuming_indep(l_tf_est, l_first, l_last,
101  r_tf_est, r_first, r_last,
102  termfreq);
103 }
104 
105 PostList*
107  double w_min,
108  bool* valid_ptr)
109 {
110  l = new AndPostList(l, r, l_max, r_max, pltree, termfreq);
111  r = NULL;
112  PostList* result;
113  if (valid_ptr) {
114  result = l->check(did, w_min, *valid_ptr);
115  } else {
116  result = l->skip_to(did, w_min);
117  }
118  if (!result) {
119  result = l;
120  l = NULL;
121  }
122  pltree->force_recalc();
123  return result;
124 }
125 
126 PostList*
128  PostList* right,
129  Xapian::docid did,
130  double w_min,
131  bool* valid_ptr)
132 {
133  if (l != left) swap(l_max, r_max);
134  l = new AndMaybePostList(left, right, l_max, r_max, pltree);
135  r = NULL;
136  PostList* result;
137  if (valid_ptr) {
138  result = l->check(did, w_min, *valid_ptr);
139  } else {
140  result = l->skip_to(did, w_min);
141  }
142  if (!result) {
143  result = l;
144  l = NULL;
145  }
146  pltree->force_recalc();
147  return result;
148 }
149 
152 {
153  // Handle l_did or r_did being zero correctly (which means the last call on
154  // that side was a check() which came back !valid).
155  return min_non_zero(l_did, r_did);
156 }
157 
158 double
160  Xapian::termcount unique_terms,
161  Xapian::termcount wdfdocmax) const
162 {
163  if (r_did == 0 || l_did < r_did)
164  return l->get_weight(doclen, unique_terms, wdfdocmax);
165  if (l_did == 0 || l_did > r_did)
166  return r->get_weight(doclen, unique_terms, wdfdocmax);
167  return l->get_weight(doclen, unique_terms, wdfdocmax) +
168  r->get_weight(doclen, unique_terms, wdfdocmax);
169 }
170 
171 double
173 {
174  l_max = l->recalc_maxweight();
175  r_max = r->recalc_maxweight();
176  return l_max + r_max;
177 }
178 
179 PostList*
180 OrPostList::next(double w_min)
181 {
182  if (w_min > l_max) {
183  if (w_min > r_max) {
184  // Work out the smallest docid which the AND could match at.
185  Xapian::docid did;
186  if (l_did == r_did || r_did == 0) {
187  did = l_did + 1;
188  } else if (l_did == 0) {
189  did = r_did + 1;
190  } else {
191  // The OR last matched at min(l_did, r_did), so the AND could
192  // match at the max().
193  did = max(l_did, r_did);
194  }
195  return decay_to_and(did, w_min);
196  }
197  // Work out the smallest docid which r AND_MAYBE l could match at.
198  Xapian::docid did;
199  if (r_did == 0) {
200  did = l_did + 1;
201  } else if (l_did - 1 >= r_did - 1) {
202  did = r_did + 1;
203  } else {
204  // l_did and r_did both non zero and l_did < r_did.
205  did = r_did;
206  }
207  return decay_to_andmaybe(r, l, did, w_min);
208  }
209  if (w_min > r_max) {
210  // Work out the smallest docid which l AND_MAYBE r could match at.
211  Xapian::docid did;
212  if (l_did == 0) {
213  did = r_did + 1;
214  } else if (r_did - 1 >= l_did - 1) {
215  did = l_did + 1;
216  } else {
217  // l_did and r_did both non zero and r_did < l_did.
218  did = l_did;
219  }
220  return decay_to_andmaybe(l, r, did, w_min);
221  }
222 
223  // We always advance_l if l_did is 0, and similarly for advance_r.
224  bool advance_l = (l_did <= r_did);
225  bool advance_r = (l_did >= r_did);
226 
227  if (advance_l) {
228  PostList* result = l->next(w_min - r_max);
229  if (result) {
230  delete l;
231  l = result;
232  }
233  }
234 
235  if (advance_r) {
236  PostList* result = r->next(w_min - l_max);
237  if (result) {
238  delete r;
239  r = result;
240  }
241  }
242 
243  if (advance_l) {
244  if (l->at_end()) {
245  PostList* result = r;
246  r = NULL;
247  pltree->force_recalc();
248  return result;
249  }
250  }
251 
252  if (advance_r) {
253  if (r->at_end()) {
254  PostList* result = l;
255  l = NULL;
256  pltree->force_recalc();
257  return result;
258  }
259  }
260 
261  if (advance_l) {
262  l_did = l->get_docid();
263  }
264 
265  if (advance_r) {
266  r_did = r->get_docid();
267  }
268 
269  return NULL;
270 }
271 
272 PostList*
274 {
275  // We always advance_l if l_did is 0, and similarly for advance_r.
276  bool advance_l = (did > l_did);
277  bool advance_r = (did > r_did);
278  if (!advance_l && !advance_r)
279  return NULL;
280 
281  if (w_min > l_max) {
282  if (w_min > r_max)
283  return decay_to_and(did, w_min);
284  return decay_to_andmaybe(r, l, did, w_min);
285  }
286  if (w_min > r_max) {
287  return decay_to_andmaybe(l, r, did, w_min);
288  }
289 
290  if (advance_l) {
291  PostList* result = l->skip_to(did, w_min - r_max);
292  if (result) {
293  delete l;
294  l = result;
295  }
296  }
297 
298  if (advance_r) {
299  PostList* result = r->skip_to(did, w_min - l_max);
300  if (result) {
301  delete r;
302  r = result;
303  }
304  }
305 
306  if (advance_l) {
307  if (l->at_end()) {
308  PostList* result = r;
309  r = NULL;
310  pltree->force_recalc();
311  return result;
312  }
313  }
314 
315  if (advance_r) {
316  if (r->at_end()) {
317  PostList* result = l;
318  l = NULL;
319  pltree->force_recalc();
320  return result;
321  }
322  }
323 
324  if (advance_l) {
325  l_did = l->get_docid();
326  }
327 
328  if (advance_r) {
329  r_did = r->get_docid();
330  }
331 
332  return NULL;
333 }
334 
335 PostList*
336 OrPostList::check(Xapian::docid did, double w_min, bool& valid)
337 {
338  bool advance_l = (did > l_did);
339  bool advance_r = (did > r_did);
340  if (!advance_l && !advance_r) {
341  // A call to check() which steps back isn't valid, so if we get here
342  // then did should be equal to at least one of l_did or r_did.
343  Assert(did == l_did || did == r_did);
344  valid = true;
345  return NULL;
346  }
347 
348  if (w_min > l_max) {
349  valid = true;
350  if (w_min > r_max)
351  return decay_to_and(did, w_min, &valid);
352  return decay_to_andmaybe(r, l, did, w_min, &valid);
353  }
354  if (w_min > r_max) {
355  valid = true;
356  return decay_to_andmaybe(l, r, did, w_min, &valid);
357  }
358 
359  if (advance_l) {
360  bool l_valid;
361  PostList* result = l->check(did, w_min - r_max, l_valid);
362  if (result) {
363  Assert(l_valid);
364  delete l;
365  l = result;
366  } else if (!l_valid) {
367  l_did = 0;
368  advance_l = false;
369  }
370  }
371 
372  if (advance_r) {
373  bool r_valid;
374  PostList* result = r->check(did, w_min - l_max, r_valid);
375  if (result) {
376  Assert(r_valid);
377  delete r;
378  r = result;
379  } else if (!r_valid) {
380  r_did = 0;
381  advance_r = false;
382  }
383  }
384 
385  if (advance_l) {
386  if (l->at_end()) {
387  PostList* result = r;
388  r = NULL;
389  pltree->force_recalc();
390  valid = true;
391  return result;
392  }
393  }
394 
395  if (advance_r) {
396  if (r->at_end()) {
397  PostList* result = l;
398  l = NULL;
399  pltree->force_recalc();
400  valid = true;
401  return result;
402  }
403  }
404 
405  if (advance_l) {
406  l_did = l->get_docid();
407  }
408 
409  if (advance_r) {
410  r_did = r->get_docid();
411  }
412 
413  valid = (l_did == did || r_did == did) || (l_did != 0 && r_did != 0);
414 
415  return NULL;
416 }
417 
418 bool
420 {
421  // We never need to return true here - if one child reaches at_end(), we
422  // prune to leave the other, and if both children reach at_end() together,
423  // we prune to leave one of them which will then indicate at_end() for us.
424  return false;
425 }
426 
427 void
429 {
430  l->get_docid_range(first, last);
431  Xapian::docid first2 = 1, last2 = Xapian::docid(-1);
432  r->get_docid_range(first2, last2);
433  first = min(first, first2);
434  last = max(last, last2);
435 }
436 
437 std::string
439 {
440  string desc = "OrPostList(";
441  desc += l->get_description();
442  desc += ", ";
443  desc += r->get_description();
444  desc += ')';
445  return desc;
446 }
447 
450 {
451  if (r_did == 0 || l_did < r_did)
452  return l->get_wdf();
453  if (l_did == 0 || l_did > r_did)
454  return r->get_wdf();
455  return l->get_wdf() + r->get_wdf();
456 }
457 
460 {
461  if (r_did == 0 || l_did < r_did)
462  return l->count_matching_subqs();
463  if (l_did == 0 || l_did > r_did)
464  return r->count_matching_subqs();
466 }
467 
468 void
470 {
471  if (l_did - 1 <= r_did - 1)
472  l->gather_position_lists(orposlist);
473  if (l_did - 1 >= r_did - 1)
474  r->gather_position_lists(orposlist);
475 }
PostList class implementing Query::OP_AND_MAYBE.
N-way AND postlist.
PostList class implementing Query::OP_AND_MAYBE.
N-way AND postlist.
Definition: andpostlist.h:32
Xapian::docid r_did
Definition: orpostlist.h:49
PostList * check(Xapian::docid did, double w_min, bool &valid)
Check if the specified docid occurs in this postlist.
Definition: orpostlist.cc:336
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Definition: orpostlist.cc:449
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Definition: orpostlist.cc:172
OrPostList(const OrPostList &)=delete
Don't allow copying.
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Definition: orpostlist.cc:459
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Definition: orpostlist.cc:273
PostList * decay_to_andmaybe(PostList *left, PostList *right, Xapian::docid did, double w_min, bool *valid_ptr=NULL)
Definition: orpostlist.cc:127
PostList * decay_to_and(Xapian::docid did, double w_min, bool *valid_ptr=NULL)
Definition: orpostlist.cc:106
void get_docid_range(Xapian::docid &first, Xapian::docid &last) const
Get the bounds on the range of docids this PostList can return.
Definition: orpostlist.cc:428
PostList * l
Left side.
Definition: orpostlist.h:42
Xapian::docid get_docid() const
Return the current docid.
Definition: orpostlist.cc:151
std::string get_description() const
Return a string description of this object.
Definition: orpostlist.cc:438
double get_weight(Xapian::termcount doclen, Xapian::termcount unique_terms, Xapian::termcount wdfdocmax) const
Return the weight contribution for the current position.
Definition: orpostlist.cc:159
Xapian::docid l_did
Definition: orpostlist.h:47
double l_max
Definition: orpostlist.h:51
double r_max
Definition: orpostlist.h:53
void gather_position_lists(OrPositionList *orposlist)
Gather PositionList* objects for a subtree.
Definition: orpostlist.cc:469
PostListTree * pltree
Definition: orpostlist.h:55
PostList * r
Right side.
Definition: orpostlist.h:45
bool at_end() const
Return true if the current position is past the last entry in this list.
Definition: orpostlist.cc:419
void force_recalc()
Definition: postlisttree.h:125
Abstract base class for postlists.
Definition: postlist.h:40
virtual PostList * skip_to(Xapian::docid did, double w_min)=0
Skip forward to the specified docid.
virtual PostList * next(double w_min)=0
Advance the current position to the next document in the postlist.
Xapian::doccount get_termfreq() const
Get an estimate of the number of documents this PostList will return.
Definition: postlist.h:67
virtual Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Definition: postlist.cc:34
virtual Xapian::docid get_docid() const =0
Return the current docid.
virtual double recalc_maxweight()=0
Recalculate the upper bound on what get_weight() can return.
virtual bool at_end() const =0
Return true if the current position is past the last entry in this list.
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
virtual void gather_position_lists(OrPositionList *orposlist)
Gather PositionList* objects for a subtree.
Definition: postlist.cc:66
virtual PostList * check(Xapian::docid did, double w_min, bool &valid)
Check if the specified docid occurs in this postlist.
Definition: postlist.cc:52
virtual void get_docid_range(docid &first, docid &last) const
Get the bounds on the range of docids this PostList can return.
Definition: postlist.cc:72
virtual std::string get_description() const =0
Return a string description of this object.
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
virtual double get_weight(Xapian::termcount doclen, Xapian::termcount unique_terms, Xapian::termcount wdfdocmax) const =0
Return the weight contribution for the current position.
virtual Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Definition: postlist.cc:59
Return the smaller of two numbers which isn't zero.
constexpr std::enable_if_t< std::is_unsigned_v< T >, T > min_non_zero(const T &a, const T &b)
Return the smaller of two unsigned integers which isn't zero.
Definition: min_non_zero.h:39
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
static void estimate_or_assuming_indep(double a, double b, double n, T &res)
Definition: orpostlist.cc:36
PostList class implementing Query::OP_OR.
Class for managing a tree of PostList objects.