xapian-core  2.0.0
bb2weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013,2014 Aarsh Shah
5  * Copyright (C) 2014,2015,2016,2017,2024 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 
26 #include "weightinternal.h"
27 
28 #include "serialise-double.h"
29 
30 #include "xapian/error.h"
31 
32 #include <cmath>
33 
34 using namespace std;
35 
36 namespace Xapian {
37 
38 static double stirling_value(double difference, double y, double stirling_constant)
39 {
40  return ((y + 0.5) * (stirling_constant - log2(y)) + (difference * stirling_constant));
41 }
42 
43 BB2Weight::BB2Weight(double c) : param_c(c)
44 {
45  if (param_c <= 0)
46  throw Xapian::InvalidArgumentError("Parameter c is invalid");
53  need_stat(WDF);
55  need_stat(WQF);
57 }
58 
59 BB2Weight *
61 {
62  return new BB2Weight(param_c);
63 }
64 
65 void
66 BB2Weight::init(double factor)
67 {
68  if (factor == 0.0) {
69  // This object is for the term-independent contribution, and that's
70  // always zero for this scheme.
71  return;
72  }
73 
74  double wdfn_upper = get_wdf_upper_bound();
75 
76  if (wdfn_upper == 0) {
77  upper_bound = 0.0;
78  return;
79  }
80 
82  double wdfn_lower(1.0);
83  wdfn_lower *= log2(1 + c_product_avlen / get_doclength_upper_bound());
84  wdfn_upper *= log2(1 + c_product_avlen / get_doclength_lower_bound());
85 
86  double F = get_collection_freq();
87 
88  // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
89  // stirling_value().
90  if (rare(wdfn_lower >= F - 1))
91  wdfn_upper = F - 1;
92  if (rare(wdfn_upper >= F - 1))
93  wdfn_upper = F - 1;
94 
95  B_constant = get_wqf() * factor * (F + 1.0) / get_termfreq();
96 
97  // Clamp N to at least 2 to avoid ill-defined log calculations in
98  // stirling_value().
99  double N = rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
100 
101  wt = -1.0 / log(2.0) - log2(N - 1.0);
102  stirling_constant_1 = log2(N + F - 1.0);
103  stirling_constant_2 = log2(F);
104 
105  // Maximize the Stirling value to be used in the upper bound.
106  // Calculate the individual terms keeping the maximization of Stirling value
107  // in mind.
108  double y_min = F - wdfn_upper;
109  double y_max = N + F - wdfn_lower - 2.0;
110 
111  double stirling_max = stirling_value(wdfn_upper + 1.0, y_max,
113  stirling_value(wdfn_lower, y_min,
115 
116  double B_max = B_constant / (wdfn_lower + 1.0);
117  upper_bound = B_max * (wt + stirling_max);
118  if (rare(upper_bound < 0.0))
119  upper_bound = 0.0;
120 }
121 
122 string
124 {
125  return "bb2";
126 }
127 
128 string
130 {
131  return serialise_double(param_c);
132 }
133 
134 BB2Weight *
135 BB2Weight::unserialise(const string & s) const
136 {
137  const char *ptr = s.data();
138  const char *end = ptr + s.size();
139  double c = unserialise_double(&ptr, end);
140  if (rare(ptr != end))
141  throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
142  return new BB2Weight(c);
143 }
144 
145 double
148 {
149  if (wdf == 0) return 0.0;
150 
151  double wdfn = wdf * log2(1 + c_product_avlen / len);
152 
153  double F = get_collection_freq();
154 
155  // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
156  // stirling_value().
157  if (rare(wdfn >= F - 1))
158  wdfn = F - 1;
159 
160  // Clamp N to at least 2 to avoid ill-defined log calculations in
161  // stirling_value().
163  Xapian::doccount N_less_2 = rare(N <= 2) ? 0 : N - 2;
164 
165  double y2 = F - wdfn;
166  double y1 = N_less_2 + y2;
167  double stirling = stirling_value(wdfn + 1.0, y1, stirling_constant_1) -
169 
170  double B = B_constant / (wdfn + 1.0);
171  double final_weight = B * (wt + stirling);
172  if (rare(final_weight < 0.0))
173  final_weight = 0.0;
174  return final_weight;
175 }
176 
177 double
179 {
180  return upper_bound;
181 }
182 
183 [[noreturn]]
184 static inline void
185 parameter_error(const char* message, const char* params)
186 {
187  Xapian::Weight::Internal::parameter_error(message, "bb2", params);
188 }
189 
190 BB2Weight*
191 BB2Weight::create_from_parameters(const char* params) const
192 {
193  const char* p = params;
194  if (*p == '\0')
195  return new Xapian::BB2Weight();
196  double c = 1.0;
198  parameter_error("Parameter is invalid", params);
199  if (*p)
200  parameter_error("Extra data after parameter", params);
201  return new Xapian::BB2Weight(c);
202 }
203 
204 }
Definition: unittest.cc:660
This class implements the BB2 weighting scheme.
Definition: weight.h:1540
BB2Weight * clone() const
Clone this object.
Definition: bb2weight.cc:60
BB2Weight * create_from_parameters(const char *params) const
Create from a human-readable parameter string.
Definition: bb2weight.cc:191
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Definition: bb2weight.cc:178
double stirling_constant_2
Definition: weight.h:1552
std::string name() const
Return the name of this weighting scheme, e.g.
Definition: bb2weight.cc:123
std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: bb2weight.cc:129
double B_constant
Definition: weight.h:1549
double upper_bound
The upper bound on the weight.
Definition: weight.h:1545
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax) const
Calculate the weight contribution for this object's term to a document.
Definition: bb2weight.cc:146
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
Definition: bb2weight.cc:66
double stirling_constant_1
Definition: weight.h:1551
double c_product_avlen
The constant values to be used in get_sumpart().
Definition: weight.h:1548
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1542
BB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: bb2weight.cc:135
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool double_param(const char **p, double *ptr_val)
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:586
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:558
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:183
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:570
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:564
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:549
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:555
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:576
@ AVERAGE_LENGTH
Average length of documents in the collection.
Definition: weight.h:47
@ DOC_LENGTH_MAX
Upper bound on document lengths.
Definition: weight.h:73
@ DOC_LENGTH
Length of the current document (sum wdf).
Definition: weight.h:59
@ TERMFREQ
How many documents the current term is in.
Definition: weight.h:49
@ WQF
Within-query-frequency of the current term.
Definition: weight.h:55
@ COLLECTION_SIZE
Number of documents in the collection.
Definition: weight.h:43
@ WDF_MAX
Upper bound on wdf.
Definition: weight.h:81
@ DOC_LENGTH_MIN
Lower bound on (non-zero) document lengths.
Definition: weight.h:65
@ COLLECTION_FREQ
Sum of wdf over the whole collection for the current term.
Definition: weight.h:83
@ WDF
Within-document-frequency of the current term in the current document.
Definition: weight.h:57
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:594
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
static double stirling_value(double difference, double y, double stirling_constant)
Definition: bb2weight.cc:38
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
static void parameter_error(const char *message, const char *params)
Definition: bb2weight.cc:185
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.