xapian-core  1.4.26
bb2weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013,2014 Aarsh Shah
5  * Copyright (C) 2014,2015,2016,2017 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 #include "common/log2.h"
26 
27 #include "serialise-double.h"
28 
29 #include "xapian/error.h"
30 
31 using namespace std;
32 
33 namespace Xapian {
34 
/** Evaluate one Stirling-approximation term used by the BB2 formula.
 *
 *  The result is
 *  (y + 0.5) * (stirling_constant - log2(y)) + difference * stirling_constant.
 *
 *  @param difference         Factor multiplied directly by stirling_constant.
 *  @param y                  Value whose base-2 logarithm is taken; callers
 *                            clamp their inputs so the logarithm is
 *                            well-defined.
 *  @param stirling_constant  Precomputed logarithmic constant.
 *
 *  @return The value of the expression above.
 */
static double stirling_value(double difference, double y, double stirling_constant)
{
    const double log_gap = stirling_constant - log2(y);
    const double scaled_gap = (y + 0.5) * log_gap;
    const double linear_part = difference * stirling_constant;
    return scaled_gap + linear_part;
}
39 
40 BB2Weight::BB2Weight(double c) : param_c(c)
41 {
42  if (param_c <= 0)
43  throw Xapian::InvalidArgumentError("Parameter c is invalid");
50  need_stat(WDF);
52  need_stat(WQF);
54 }
55 
56 BB2Weight *
58 {
59  return new BB2Weight(param_c);
60 }
61 
62 void
63 BB2Weight::init(double factor)
64 {
65  if (factor == 0.0) {
66  // This object is for the term-independent contribution, and that's
67  // always zero for this scheme.
68  return;
69  }
70 
71  double wdfn_upper = get_wdf_upper_bound();
72 
73  if (wdfn_upper == 0) {
74  upper_bound = 0.0;
75  return;
76  }
77 
79  double wdfn_lower(1.0);
80  wdfn_lower *= log2(1 + c_product_avlen / get_doclength_upper_bound());
81  wdfn_upper *= log2(1 + c_product_avlen / get_doclength_lower_bound());
82 
83  double F = get_collection_freq();
84 
85  // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
86  // stirling_value().
87  if (rare(wdfn_lower >= F - 1))
88  wdfn_upper = F - 1;
89  if (rare(wdfn_upper >= F - 1))
90  wdfn_upper = F - 1;
91 
92  B_constant = get_wqf() * factor * (F + 1.0) / get_termfreq();
93 
94  // Clamp N to at least 2 to avoid ill-defined log calculations in
95  // stirling_value().
96  double N = rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
97 
98  wt = -1.0 / log(2.0) - log2(N - 1.0);
99  stirling_constant_1 = log2(N + F - 1.0);
101 
102  // Maximize the Stirling value to be used in the upper bound.
103  // Calculate the individual terms keeping the maximization of Stirling value
104  // in mind.
105  double y_min = F - wdfn_upper;
106  double y_max = N + F - wdfn_lower - 2.0;
107 
108  double stirling_max = stirling_value(wdfn_upper + 1.0, y_max,
110  stirling_value(wdfn_lower, y_min,
112 
113  double B_max = B_constant / (wdfn_lower + 1.0);
114  upper_bound = B_max * (wt + stirling_max);
115  if (rare(upper_bound < 0.0))
116  upper_bound = 0.0;
117 }
118 
119 string
121 {
122  return "Xapian::BB2Weight";
123 }
124 
125 string
127 {
128  return serialise_double(param_c);
129 }
130 
131 BB2Weight *
132 BB2Weight::unserialise(const string & s) const
133 {
134  const char *ptr = s.data();
135  const char *end = ptr + s.size();
136  double c = unserialise_double(&ptr, end);
137  if (rare(ptr != end))
138  throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
139  return new BB2Weight(c);
140 }
141 
142 double
144  Xapian::termcount) const
145 {
146  if (wdf == 0) return 0.0;
147 
148  double wdfn = wdf * log2(1 + c_product_avlen / len);
149 
150  double F = get_collection_freq();
151 
152  // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
153  // stirling_value().
154  if (rare(wdfn >= F - 1))
155  wdfn = F - 1;
156 
157  // Clamp N to at least 2 to avoid ill-defined log calculations in
158  // stirling_value().
160  Xapian::doccount N_less_2 = rare(N <= 2) ? 0 : N - 2;
161 
162  double y2 = F - wdfn;
163  double y1 = N_less_2 + y2;
164  double stirling = stirling_value(wdfn + 1.0, y1, stirling_constant_1) -
166 
167  double B = B_constant / (wdfn + 1.0);
168  double final_weight = B * (wt + stirling);
169  if (rare(final_weight < 0.0))
170  final_weight = 0.0;
171  return final_weight;
172 }
173 
174 double
176 {
177  return upper_bound;
178 }
179 
180 double
182 {
183  return 0;
184 }
185 
186 double
188 {
189  return 0;
190 }
191 
192 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Definition: bb2weight.cc:175
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
Definition: bb2weight.cc:63
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the weight contribution for this object's term to a document.
Definition: bb2weight.cc:143
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:374
Definition: unittest.cc:678
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:389
double B_constant
Definition: weight.h:1069
Upper bound on document lengths.
Definition: weight.h:68
static double stirling_value(double difference, double y, double stirling_constant)
Definition: bb2weight.cc:35
double stirling_constant_1
Definition: weight.h:1071
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1062
STL namespace.
Lower bound on (non-zero) document lengths.
Definition: weight.h:62
std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: bb2weight.cc:126
#define rare(COND)
Definition: config.h:575
Hierarchy of classes which Xapian can throw as exceptions.
This class implements the BB2 weighting scheme.
Definition: weight.h:1060
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
functions to serialise and unserialise a double
Length of the current document (sum wdf).
Definition: weight.h:56
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:411
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
Definition: bb2weight.cc:187
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
Within-query-frequency of the current term.
Definition: weight.h:52
double stirling_constant_2
Definition: weight.h:1072
Average length of documents in the collection.
Definition: weight.h:44
BB2Weight * clone() const
Clone this object.
Definition: bb2weight.cc:57
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:395
double upper_bound
The upper bound on the weight.
Definition: weight.h:1065
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:401
Sum of wdf over the whole collection for the current term.
Definition: weight.h:76
Weighting scheme API.
Within-document-frequency of the current term in the current document.
Definition: weight.h:54
Upper bound on wdf.
Definition: weight.h:74
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:383
How many documents the current term is in.
Definition: weight.h:46
double log2(double x)
Definition: log2.h:31
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:380
std::string serialise_double(double v)
Serialise a double to a string.
std::string name() const
Return the name of this weighting scheme.
Definition: bb2weight.cc:120
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Number of documents in the collection.
Definition: weight.h:40
Defines a log2() function to find the logarithm to base 2 if not already defined in the library...
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:94
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the term-independent weight component for a document.
Definition: bb2weight.cc:181
double c_product_avlen
The constant values to be used in get_sumpart().
Definition: weight.h:1068
BB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: bb2weight.cc:132
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:419