xapian-core  1.4.26
valuerangepostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright 2007,2008,2009,2010,2011,2013,2016,2017 Olly Betts
5  * Copyright 2009 Lemur Consulting Ltd
6  * Copyright 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "valuerangepostlist.h"
26 
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "str.h"
31 
32 using namespace std;
33 
35 {
36  delete valuelist;
37 }
38 
41 {
42  const string& lo = db->get_value_lower_bound(slot);
43  const string& hi = db->get_value_upper_bound(slot);
44  if (begin <= lo && (end.empty() || hi <= end)) {
45  // All set values lie within the range (this case is optimised at a
46  // higher level when the value frequency is the doc count, but not
47  // otherwise).
48  return db->get_value_freq(slot);
49  }
50 
51  // The bounds may not be tight, so one bound being within the range does
52  // not mean we can assume that at least one document matches.
53  return 0;
54 }
55 
56 static double
57 string_frac(const string &s, size_t prefix)
58 {
59  double r = 0;
60  double f = 1.0;
61  for (size_t i = prefix; i != s.size(); ++i) {
62  f /= 256.0;
63  r += static_cast<unsigned char>(s[i]) * f;
64  }
65 
66  return r;
67 }
68 
71 {
72  // Assume the values are evenly spread out between the min and max.
73  // FIXME: Perhaps we should store some sort of binned distribution?
74  const string& lo = db->get_value_lower_bound(slot);
75  const string& hi = db->get_value_upper_bound(slot);
76  AssertRel(lo, <=, hi);
77 
78  size_t common_prefix_len = size_t(-1);
79  do {
80  ++common_prefix_len;
81  // lo <= hi so while we're in the common prefix hi can't run out
82  // before lo.
83  if (common_prefix_len == lo.size()) {
84  if (common_prefix_len != hi.size())
85  break;
86  // All values in the slot are the same. We should have optimised
87  // to EmptyPostList if that singular value is outside the range,
88  // and if it's inside the range then we know that the frequency is
89  // exactly the value frequency.
90  Assert(begin <= lo && (end.empty() || hi <= end));
91  return db->get_value_freq(slot);
92  }
93  AssertRel(common_prefix_len, !=, hi.size());
94  } while (lo[common_prefix_len] == hi[common_prefix_len]);
95 
96  double l = string_frac(lo, common_prefix_len);
97  double h = string_frac(hi, common_prefix_len);
98  double denom = h - l;
99  if (rare(denom == 0.0)) {
100  // Weird corner case - hi != lo (because that's handled inside the loop
101  // above) but they give the same string_frac value. Because we only
102  // calculate the fraction starting from the first difference, this
103  // should only happen if hi is lo + one or more trailing zero bytes.
104 
105  if (begin <= lo && (end.empty() || hi <= end)) {
106  // All set values lie within the range (this case is optimised at a
107  // higher level when the value frequency is the doc count, but not
108  // otherwise).
109  return db->get_value_freq(slot);
110  }
111 
112  // The must be partial overlap - we just checked if the range dominates
113  // the bounds, and a range which is entirely outside the bounds is
114  // optimised to EmptyPostList at a higher level.
115  return db->get_value_freq(slot) / 2;
116  }
117 
118  double b = l;
119  if (begin > lo) {
120  b = string_frac(begin, common_prefix_len);
121  }
122  double e = h;
123  if (!end.empty() && end < hi) {
124  // end is empty for a ValueGePostList
125  e = string_frac(end, common_prefix_len);
126  }
127 
128  double est = (e - b) / denom * db->get_value_freq(slot);
129  return Xapian::doccount(est + 0.5);
130 }
131 
132 TermFreqs
134  const Xapian::Weight::Internal & stats) const
135 {
136  LOGCALL(MATCH, TermFreqs, "ValueRangePostList::get_termfreq_est_using_stats", stats);
137  // FIXME: It's hard to estimate well - perhaps consider the values of
138  // begin and end like above?
139  RETURN(TermFreqs(stats.collection_size / 2,
140  stats.rset_size / 2,
141  stats.total_length / 2));
142 }
143 
146 {
147  return db->get_value_freq(slot);
148 }
149 
150 double
152 {
153  return 0;
154 }
155 
158 {
159  Assert(valuelist);
160  Assert(db);
161  return valuelist->get_docid();
162 }
163 
164 double
166 {
167  Assert(db);
168  return 0;
169 }
170 
173 {
174  Assert(db);
175  return 0;
176 }
177 
180 {
181  Assert(db);
182  return 0;
183 }
184 
185 double
187 {
188  Assert(db);
189  return 0;
190 }
191 
192 PositionList *
194 {
195  Assert(db);
196  return NULL;
197 }
198 
199 PostList *
201 {
202  Assert(db);
203  if (!valuelist) valuelist = db->open_value_list(slot);
204  valuelist->next();
205  while (!valuelist->at_end()) {
206  const string & v = valuelist->get_value();
207  if (v >= begin && v <= end) {
208  return NULL;
209  }
210  valuelist->next();
211  }
212  db = NULL;
213  return NULL;
214 }
215 
216 PostList *
218 {
219  Assert(db);
220  if (!valuelist) valuelist = db->open_value_list(slot);
221  valuelist->skip_to(did);
222  while (!valuelist->at_end()) {
223  const string & v = valuelist->get_value();
224  if (v >= begin && v <= end) {
225  return NULL;
226  }
227  valuelist->next();
228  }
229  db = NULL;
230  return NULL;
231 }
232 
233 PostList *
234 ValueRangePostList::check(Xapian::docid did, double, bool &valid)
235 {
236  Assert(db);
237  AssertRelParanoid(did, <=, db->get_lastdocid());
238  if (!valuelist) valuelist = db->open_value_list(slot);
239  valid = valuelist->check(did);
240  if (!valid) {
241  return NULL;
242  }
243  const string & v = valuelist->get_value();
244  valid = (v >= begin && v <= end);
245  return NULL;
246 }
247 
248 bool
250 {
251  return (db == NULL);
252 }
253 
256 {
257  return 1;
258 }
259 
260 string
262 {
263  string desc = "ValueRangePostList(";
264  desc += str(slot);
265  desc += ", ";
266  description_append(desc, begin);
267  desc += ", ";
268  description_append(desc, end);
269  desc += ")";
270  return desc;
271 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for postlists.
Definition: postlist.h:37
PostList * skip_to(Xapian::docid, double w_min)
Skip forward to the specified docid.
double get_maxweight() const
Return an upper bound on what get_weight() can return.
PostList * check(Xapian::docid did, double w_min, bool &valid)
Check if the specified docid occurs in this postlist.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
std::string get_description() const
Return a string description of this object.
Xapian::termcount get_unique_terms() const
Return the number of unique terms in the current document.
Xapian::docid get_docid() const
Return the current docid.
STL namespace.
Convert types to std::string.
static double est(double l, double r, double n)
Definition: orpostlist.cc:306
bool at_end() const
Return true if the current position is past the last entry in this list.
Xapian::doccount get_termfreq_min() const
Get a lower bound on the number of documents indexed by this term.
#define rare(COND)
Definition: config.h:575
Xapian::doccount collection_size
Number of documents in the collection.
#define AssertRelParanoid(A, REL, B)
Definition: omassert.h:130
Xapian::doccount rset_size
Number of relevant documents in the collection.
static double string_frac(const string &s, size_t prefix)
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
PositionList * read_position_list()
Read the position list for the term in the current document and return a pointer to it (owned by the ...
void description_append(std::string &desc, const std::string &s)
Definition: unittest.cc:102
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
string str(int value)
Convert int to std::string.
Definition: str.cc:90
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Xapian::termcount get_doclength() const
Return the length of current document.
Append a string to an object description, escaping invalid UTF-8.
The frequencies for a term.
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Return document ids matching a range test on a specified doc value.
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:31
Xapian::totallength total_length
Total length of all documents in the collection.
double get_weight() const
Return the weight contribution for the current position.
Xapian::doccount get_termfreq_max() const
Get an upper bound on the number of documents indexed by this term.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487