xapian-core  1.4.20
multiandpostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2009,2011,2012,2015,2017 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "multiandpostlist.h"
25 #include "omassert.h"
26 #include "debuglog.h"
27 
28 using namespace std;
29 
30 void
32 {
33  plist = new PostList * [n_kids];
34  try {
35  max_wt = new double [n_kids]();
36  } catch (...) {
37  delete [] plist;
38  plist = NULL;
39  throw;
40  }
41 }
42 
44 {
45  if (plist) {
46  for (size_t i = 0; i < n_kids; ++i) {
47  delete plist[i];
48  }
49  delete [] plist;
50  }
51  delete [] max_wt;
52 }
53 
56 {
57  // The number of matching documents is minimised when we have the minimum
58  // number of matching documents from each sub-postlist, and these are
59  // maximally disjoint.
60  Xapian::doccount sum = plist[0]->get_termfreq_min();
61  if (sum) {
62  for (size_t i = 1; i < n_kids; ++i) {
63  Xapian::doccount sum_old = sum;
64  sum += plist[i]->get_termfreq_min();
65  // If sum < sum_old, the calculation overflowed and the true sum
66  // must be > db_size. Since we added a value <= db_size,
67  // subtracting db_size must un-overflow us.
68  if (sum >= sum_old && sum <= db_size) {
69  // It's possible there's no overlap.
70  return 0;
71  }
72  sum -= db_size;
73  }
75  }
76  return sum;
77 }
78 
81 {
82  // We can't match more documents than the least of our sub-postlists.
83  Xapian::doccount result = plist[0]->get_termfreq_max();
84  for (size_t i = 1; i < n_kids; ++i) {
85  Xapian::doccount tf = plist[i]->get_termfreq_max();
86  if (tf < result) result = tf;
87  }
88  return result;
89 }
90 
93 {
94  LOGCALL(MATCH, Xapian::doccount, "MultiAndPostList::get_termfreq_est", NO_ARGS);
95  // We shortcut an empty shard and avoid creating a postlist tree for it.
96  Assert(db_size);
97  // We calculate the estimate assuming independence. With this assumption,
98  // the estimate is the product of the estimates for the sub-postlists
99  // divided by db_size (n_kids - 1) times.
100  double result = plist[0]->get_termfreq_est();
101  for (size_t i = 1; i < n_kids; ++i) {
102  result = (result * plist[i]->get_termfreq_est()) / db_size;
103  }
104  return static_cast<Xapian::doccount>(result + 0.5);
105 }
106 
107 TermFreqs
109  const Xapian::Weight::Internal & stats) const
110 {
111  LOGCALL(MATCH, TermFreqs, "MultiAndPostList::get_termfreq_est_using_stats", stats);
112  // We calculate the estimate assuming independence. With this assumption,
113  // the estimate is the product of the estimates for the sub-postlists
114  // divided by db_size (n_kids - 1) times.
115  TermFreqs freqs(plist[0]->get_termfreq_est_using_stats(stats));
116 
117  double freqest = double(freqs.termfreq);
118  double relfreqest = double(freqs.reltermfreq);
119  double collfreqest = double(freqs.collfreq);
120 
121  // Our caller should have ensured this.
122  Assert(stats.collection_size);
123 
124  for (size_t i = 1; i < n_kids; ++i) {
125  freqs = plist[i]->get_termfreq_est_using_stats(stats);
126 
127  // If the collection is empty, freqest should be 0 already, so leave
128  // it alone.
129  freqest = (freqest * freqs.termfreq) / stats.collection_size;
130  if (usual(stats.total_length != 0)) {
131  collfreqest = (collfreqest * freqs.collfreq) / stats.total_length;
132  }
133 
134  // If the rset is empty, relfreqest should be 0 already, so leave
135  // it alone.
136  if (stats.rset_size != 0)
137  relfreqest = (relfreqest * freqs.reltermfreq) / stats.rset_size;
138  }
139 
140  RETURN(TermFreqs(static_cast<Xapian::doccount>(freqest + 0.5),
141  static_cast<Xapian::doccount>(relfreqest + 0.5),
142  static_cast<Xapian::termcount>(collfreqest + 0.5)));
143 }
144 
145 double
147 {
148  return max_total;
149 }
150 
153 {
154  return did;
155 }
156 
159 {
160  Assert(did);
161  Xapian::termcount doclength = plist[0]->get_doclength();
162  for (size_t i = 1; i < n_kids; ++i) {
163  AssertEq(doclength, plist[i]->get_doclength());
164  }
165  return doclength;
166 }
167 
170 {
171  Assert(did);
172  Xapian::termcount unique_terms = plist[0]->get_unique_terms();
173  for (size_t i = 1; i < n_kids; ++i) {
174  AssertEq(unique_terms, plist[i]->get_unique_terms());
175  }
176  return unique_terms;
177 }
178 
179 double
181 {
182  Assert(did);
183  double result = 0;
184  for (size_t i = 0; i < n_kids; ++i) {
185  result += plist[i]->get_weight();
186  }
187  return result;
188 }
189 
190 bool
192 {
193  return (did == 0);
194 }
195 
196 double
198 {
199  max_total = 0.0;
200  for (size_t i = 0; i < n_kids; ++i) {
201  double new_max = plist[i]->recalc_maxweight();
202  max_wt[i] = new_max;
203  max_total += new_max;
204  }
205  return max_total;
206 }
207 
208 PostList *
210 {
211 advanced_plist0:
212  if (plist[0]->at_end()) {
213  did = 0;
214  return NULL;
215  }
216  did = plist[0]->get_docid();
217  for (size_t i = 1; i < n_kids; ++i) {
218  bool valid;
219  check_helper(i, did, w_min, valid);
220  if (!valid) {
221  next_helper(0, w_min);
222  goto advanced_plist0;
223  }
224  if (plist[i]->at_end()) {
225  did = 0;
226  return NULL;
227  }
228  Xapian::docid new_did = plist[i]->get_docid();
229  if (new_did != did) {
230  skip_to_helper(0, new_did, w_min);
231  goto advanced_plist0;
232  }
233  }
234  return NULL;
235 }
236 
237 PostList *
239 {
240  next_helper(0, w_min);
241  return find_next_match(w_min);
242 }
243 
244 PostList *
246 {
247  skip_to_helper(0, did_min, w_min);
248  return find_next_match(w_min);
249 }
250 
251 std::string
253 {
254  string desc("(");
255  desc += plist[0]->get_description();
256  for (size_t i = 1; i < n_kids; ++i) {
257  desc += " AND ";
258  desc += plist[i]->get_description();
259  }
260  desc += ')';
261  return desc;
262 }
263 
266 {
267  Xapian::termcount totwdf = 0;
268  for (size_t i = 0; i < n_kids; ++i) {
269  totwdf += plist[i]->get_wdf();
270  }
271  return totwdf;
272 }
273 
276 {
277  Xapian::termcount total = 0;
278  for (size_t i = 0; i < n_kids; ++i) {
279  total += plist[i]->count_matching_subqs();
280  }
281  return total;
282 }
283 
284 void
286 {
287  for (size_t i = 0; i < n_kids; ++i) {
288  plist[i]->gather_position_lists(orposlist);
289  }
290 }
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
#define RETURN(A)
Definition: debuglog.h:482
#define Assert(COND)
Definition: omassert.h:122
void allocate_plist_and_max_wt()
Allocate plist and max_wt arrays of n_kids each.
Abstract base class for postlists.
Definition: postlist.h:37
#define AssertEq(A, B)
Definition: omassert.h:124
virtual Xapian::docid get_docid() const =0
Return the current docid.
void gather_position_lists(OrPositionList *orposlist)
Gather PositionList* objects for a subtree.
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
PostList * skip_to(Xapian::docid, double w_min)
Skip forward to the specified docid.
#define usual(COND)
Definition: config.h:563
Xapian::termcount get_doclength() const
Return the length of current document.
std::string get_description() const
Return a string description of this object.
STL namespace.
bool at_end() const
Return true if the current position is past the last entry in this list.
Xapian::docid get_docid() const
Return the current docid.
Xapian::doccount termfreq
Xapian::doccount collection_size
Number of documents in the collection.
#define AssertRelParanoid(A, REL, B)
Definition: omassert.h:130
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
PostList * find_next_match(double w_min)
Advance the sublists to the next match.
Xapian::doccount rset_size
Number of relevant documents in the collection.
double get_maxweight() const
Return an upper bound on what get_weight() can return.
Xapian::termcount get_wdf() const
get_wdf() for MultiAndPostlists returns the sum of the wdfs of the sub postlists. ...
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
double doclength
A normalised document length.
Definition: types.h:59
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
N-way AND postlist.
Xapian::termcount collfreq
The frequencies for a term.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
Various assertion macros.
double get_weight() const
Return the weight contribution for the current position.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
if(!(properties &BACKEND))
Definition: api_collated.h:3
Xapian::totallength total_length
Total length of all documents in the collection.
Xapian::doccount reltermfreq
Xapian::termcount get_unique_terms() const
Return the number of unique terms in the current document.
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:476