xapian-core  2.0.0
pl2plusweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013 Aarsh Shah
5  * Copyright (C) 2013,2014,2016,2017,2024 Olly Betts
6  * Copyright (C) 2016 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 
27 #include "weightinternal.h"
28 
29 #include "serialise-double.h"
30 
31 #include "xapian/error.h"
32 
33 #include <algorithm>
34 #include <cmath>
35 
36 using namespace std;
37 
38 namespace Xapian {
39 
40 PL2PlusWeight::PL2PlusWeight(double c, double delta)
41  : param_c(c), param_delta(delta)
42 {
43  if (param_c <= 0)
44  throw Xapian::InvalidArgumentError("Parameter c is invalid");
45  if (param_delta <= 0)
46  throw Xapian::InvalidArgumentError("Parameter delta is invalid");
53  need_stat(WDF);
55  need_stat(WQF);
56 }
57 
60 {
61  return new PL2PlusWeight(param_c, param_delta);
62 }
63 
64 void
65 PL2PlusWeight::init(double factor_)
66 {
67  if (factor_ == 0.0) {
68  // This object is for the term-independent contribution, and that's
69  // always zero for this scheme.
70  return;
71  }
72 
73  factor = factor_ * get_wqf();
74 
75  auto wdf_upper_bound = get_wdf_upper_bound();
77  if (rare(wdf_upper_bound == 0 || mean > 1)) {
78  // PL2+ is based on a modified PL2 which "essentially ignores
79  // non-discriminative query terms".
80  upper_bound = 0;
81  return;
82  }
83 
84  double base_change(1.0 / log(2.0));
85  P1 = mean * base_change + 0.5 * log2(2.0 * M_PI);
86  P2 = log2(mean) + base_change;
87 
89 
90  double wdfn_lower = log2(1 + cl / get_doclength_upper_bound());
91  double divisior = max(get_wdf_upper_bound(), get_doclength_lower_bound());
92  double wdfn_upper = get_wdf_upper_bound() * log2(1 + cl / divisior);
93 
94  double P_delta = P1 + (param_delta + 0.5) * log2(param_delta) - P2 * param_delta;
95  dw = P_delta / (param_delta + 1.0);
96 
97  // Calculate an upper bound on the weights which get_sumpart() can return.
98  //
99  // We consider the equation for P as the sum of two parts which we
100  // maximise individually:
101  //
102  // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
103  // (b) (P1 - P2 * wdfn) / (wdfn + 1)
104  //
105  // To maximise (a), the fractional part is always positive (since wdfn>0)
106  // and is maximised by maximising wdfn - clearer when rewritten as:
107  // (1 - 0.5 / (wdfn + 1))
108  //
109  // The log part of (a) is clearly also maximised by maximising wdfn,
110  // so we want to evaluate (a) at wdfn=wdfn_upper.
111  double P_max2a = (wdfn_upper + 0.5) * log2(wdfn_upper) / (wdfn_upper + 1.0);
112  // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
113  //
114  // (P1 + P2)/x - P2
115  //
116  // Differentiating wrt x gives:
117  //
118  // -(P1 + P2)/x²
119  //
120  // So there are no local minima or maxima, and the function is continuous
121  // in the range of interest, so the sign of this differential tells us
122  // whether we want to maximise or minimise wdfn, and the denominator is
123  // always positive so we can just consider the sign of: (P1 + P2)
124  //
125  // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
126  // giving us a bound that can't be bettered if wdfn_upper is tight.
127  double wdfn_optb = P1 + P2 > 0 ? wdfn_upper : wdfn_lower;
128  double P_max2b = (P1 - P2 * wdfn_optb) / (wdfn_optb + 1.0);
129  upper_bound = factor * (P_max2a + P_max2b + dw);
130 
131  if (rare(upper_bound < 0)) upper_bound = 0;
132 }
133 
134 string
136 {
137  return "pl2+";
138 }
139 
140 string
142 {
143  string result = serialise_double(param_c);
144  result += serialise_double(param_delta);
145  return result;
146 }
147 
149 PL2PlusWeight::unserialise(const string & s) const
150 {
151  const char *ptr = s.data();
152  const char *end = ptr + s.size();
153  double c = unserialise_double(&ptr, end);
154  double delta = unserialise_double(&ptr, end);
155  if (rare(ptr != end))
156  throw Xapian::SerialisationError("Extra data in PL2PlusWeight::unserialise()");
157  return new PL2PlusWeight(c, delta);
158 }
159 
160 double
163 {
164  // Note: lambda_t in the paper is 1/mean.
165  if (wdf == 0 || mean > 1) {
166  // PL2+ is based on a modified PL2 which "essentially ignores
167  // non-discriminative query terms".
168  return 0.0;
169  }
170 
171  double wdfn = wdf * log2(1 + cl / len);
172 
173  double P = P1 + (wdfn + 0.5) * log2(wdfn) - P2 * wdfn;
174 
175  double wt = (P / (wdfn + 1.0)) + dw;
176  // FIXME: Is a negative wt possible here? It is with vanilla PL2, but for
177  // PL2+ we've added on dw, and bailed out early if mean < 1.
178  if (rare(wt <= 0)) return 0.0;
179 
180  return factor * wt;
181 }
182 
183 double
185 {
186  return upper_bound;
187 }
188 
189 [[noreturn]]
190 static inline void
191 parameter_error(const char* message, const char* params)
192 {
193  Xapian::Weight::Internal::parameter_error(message, "pl2+", params);
194 }
195 
196 PL2PlusWeight *
197 PL2PlusWeight::create_from_parameters(const char* params) const
198 {
199  const char* p = params;
200  if (*p == '\0')
201  return new Xapian::PL2PlusWeight();
202  double c = 1.0;
203  double delta = 0.8;
205  parameter_error("Parameter 1 (c) is invalid", params);
207  parameter_error("Parameter 2 (delta) is invalid", params);
208  if (*p)
209  parameter_error("Extra data after parameter 2", params);
210  return new Xapian::PL2PlusWeight(c, delta);
211 }
212 
213 }
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1731
std::string name() const
Return the name of this weighting scheme, e.g.
double mean
Set by init() to get_collection_freq()) / get_collection_size()
Definition: weight.h:1751
double dw
Weight contribution of delta term in the PL2+ function.
Definition: weight.h:1754
std::string serialise() const
Return this object's parameters serialised as a single string.
double factor
The factor to multiply weights by.
Definition: weight.h:1733
PL2PlusWeight * clone() const
Clone this object.
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax) const
Calculate the weight contribution for this object's term to a document.
double P1
Constants for a given term in a given query.
Definition: weight.h:1745
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1736
PL2PlusWeight * create_from_parameters(const char *params) const
Create from a human-readable parameter string.
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1748
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Definition: weight.h:1739
void init(double factor_)
Allow the subclass to perform any initialisation it needs to.
double upper_bound
The upper bound on the weight.
Definition: weight.h:1742
PL2PlusWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool double_param(const char **p, double *ptr_val)
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:586
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:183
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:570
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:564
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:549
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:555
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:576
@ AVERAGE_LENGTH
Average length of documents in the collection.
Definition: weight.h:47
@ DOC_LENGTH_MAX
Upper bound on document lengths.
Definition: weight.h:73
@ DOC_LENGTH
Length of the current document (sum wdf).
Definition: weight.h:59
@ WQF
Within-query-frequency of the current term.
Definition: weight.h:55
@ COLLECTION_SIZE
Number of documents in the collection.
Definition: weight.h:43
@ WDF_MAX
Upper bound on wdf.
Definition: weight.h:81
@ DOC_LENGTH_MIN
Lower bound on (non-zero) document lengths.
Definition: weight.h:65
@ COLLECTION_FREQ
Sum of wdf over the whole collection for the current term.
Definition: weight.h:83
@ WDF
Within-document-frequency of the current term in the current document.
Definition: weight.h:57
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:594
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
static void parameter_error(const char *message, const char *params)
Definition: bb2weight.cc:185
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.