xapian-core  1.4.25
pl2plusweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013 Aarsh Shah
5  * Copyright (C) 2013,2014,2016,2017 Olly Betts
6  * Copyright (C) 2016 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 #include "common/log2.h"
27 
28 #include "serialise-double.h"
29 
30 #include "xapian/error.h"
31 
32 #include <algorithm>
33 
34 using namespace std;
35 
36 namespace Xapian {
37 
38 PL2PlusWeight::PL2PlusWeight(double c, double delta)
39  : param_c(c), param_delta(delta)
40 {
41  if (param_c <= 0)
42  throw Xapian::InvalidArgumentError("Parameter c is invalid");
43  if (param_delta <= 0)
44  throw Xapian::InvalidArgumentError("Parameter delta is invalid");
51  need_stat(WDF);
53  need_stat(WQF);
54 }
55 
58 {
59  return new PL2PlusWeight(param_c, param_delta);
60 }
61 
62 void
63 PL2PlusWeight::init(double factor_)
64 {
65  if (factor_ == 0.0) {
66  // This object is for the term-independent contribution, and that's
67  // always zero for this scheme.
68  return;
69  }
70 
71  factor = factor_;
72 
73  if (get_wdf_upper_bound() == 0) {
74  // The "extra" weight object is cloned, init() called and then
75  // get_maxextra() is called and we discover that we don't need it.
76  // So we need to handle that case (which will give us 0 from
77  // get_wdf_upper_bound() here).
78  upper_bound = 0;
79  return;
80  }
81 
82  factor *= get_wqf();
83 
85 
86  double base_change(1.0 / log(2.0));
88  P1 = mean * base_change + 0.5 * log2(2.0 * M_PI);
89  P2 = log2(mean) + base_change;
90 
91  double wdfn_lower = log2(1 + cl / get_doclength_upper_bound());
92  double divisior = max(get_wdf_upper_bound(), get_doclength_lower_bound());
93  double wdfn_upper = get_wdf_upper_bound() * log2(1 + cl / divisior);
94 
95  double P_delta = P1 + (param_delta + 0.5) * log2(param_delta) - P2 * param_delta;
96  dw = P_delta / (param_delta + 1.0);
97 
98  // Calculate an upper bound on the weights which get_sumpart() can return.
99  //
100  // We consider the equation for P as the sum of two parts which we
101  // maximise individually:
102  //
103  // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
104  // (b) (P1 - P2 * wdfn) / (wdfn + 1)
105  //
106  // To maximise (a), the fractional part is always positive (since wdfn>0)
107  // and is maximised by maximising wdfn - clearer when rewritten as:
108  // (1 - 0.5 / (wdfn + 1))
109  //
110  // The log part of (a) is clearly also maximised by maximising wdfn,
111  // so we want to evaluate (a) at wdfn=wdfn_upper.
112  double P_max2a = (wdfn_upper + 0.5) * log2(wdfn_upper) / (wdfn_upper + 1.0);
113  // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
114  //
115  // (P1 + P2)/x - P2
116  //
117  // Differentiating wrt x gives:
118  //
119  // -(P1 + P2)/x²
120  //
121  // So there are no local minima or maxima, and the function is continuous
122  // in the range of interest, so the sign of this differential tells us
123  // whether we want to maximise or minimise wdfn, and since x>1, we can
124  // just consider the sign of: (P1 + P2)
125  //
126  // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
127  // giving us a bound that can't be bettered if wdfn_upper is tight.
128  double wdfn_optb = P1 + P2 > 0 ? wdfn_upper : wdfn_lower;
129  double P_max2b = (P1 - P2 * wdfn_optb) / (wdfn_optb + 1.0);
130  upper_bound = factor * (P_max2a + P_max2b + dw);
131 
132  if (rare(upper_bound <= 0)) upper_bound = 0;
133 }
134 
135 string
137 {
138  return "Xapian::PL2PlusWeight";
139 }
140 
141 string
143 {
144  string result = serialise_double(param_c);
145  result += serialise_double(param_delta);
146  return result;
147 }
148 
150 PL2PlusWeight::unserialise(const string & s) const
151 {
152  const char *ptr = s.data();
153  const char *end = ptr + s.size();
154  double c = unserialise_double(&ptr, end);
155  double delta = unserialise_double(&ptr, end);
156  if (rare(ptr != end))
157  throw Xapian::SerialisationError("Extra data in PL2PlusWeight::unserialise()");
158  return new PL2PlusWeight(c, delta);
159 }
160 
161 double
163  Xapian::termcount) const
164 {
165  if (wdf == 0 || mean < 1) return 0.0;
166 
167  double wdfn = wdf * log2(1 + cl / len);
168 
169  double P = P1 + (wdfn + 0.5) * log2(wdfn) - P2 * wdfn;
170 
171  double wt = (P / (wdfn + 1.0)) + dw;
172  // FIXME: Is a negative wt possible here? It is with vanilla PL2, but for
173  // PL2+ we've added on dw, and bailed out early if mean < 1.
174  if (rare(wt <= 0)) return 0.0;
175 
176  return factor * wt;
177 }
178 
179 double
181 {
182  return upper_bound;
183 }
184 
185 double
187 {
188  return 0;
189 }
190 
191 double
193 {
194  return 0;
195 }
196 
197 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
double factor
The factor to multiply weights by.
Definition: weight.h:1254
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Definition: weight.h:1260
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:363
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the term-independent weight component for a document.
double upper_bound
The upper bound on the weight.
Definition: weight.h:1263
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:378
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1257
Upper bound on document lengths.
Definition: weight.h:60
PL2PlusWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
STL namespace.
std::string serialise() const
Return this object&#39;s parameters serialised as a single string.
Lower bound on (non-zero) document lengths.
Definition: weight.h:58
double dw
Weight contribution of delta term in the PL2+ function.
Definition: weight.h:1275
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1252
#define rare(COND)
Definition: config.h:565
double P1
Constants for a given term in a given query.
Definition: weight.h:1266
std::string name() const
Return the name of this weighting scheme.
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
functions to serialise and unserialise a double
Length of the current document (sum wdf).
Definition: weight.h:56
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
Definition: weight.h:400
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
Within-query-frequency of the current term.
Definition: weight.h:52
Average length of documents in the collection.
Definition: weight.h:44
double mean
Set by init() to get_collection_freq()) / get_collection_size()
Definition: weight.h:1272
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:384
PL2PlusWeight * clone() const
Clone this object.
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the database.
Definition: weight.h:390
Sum of wdf over the whole collection for the current term.
Definition: weight.h:64
Weighting scheme API.
Within-document-frequency of the current term in the current document.
Definition: weight.h:54
Upper bound on wdf.
Definition: weight.h:62
double log2(double x)
Definition: log2.h:31
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:369
std::string serialise_double(double v)
Serialise a double to a string.
void init(double factor_)
Allow the subclass to perform any initialisation it needs to.
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1269
Number of documents in the collection.
Definition: weight.h:40
Defines a log2() function to find the logarithm to base 2 if not already defined in the library...
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the weight contribution for this object&#39;s term to a document.
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:83
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term.
Definition: weight.h:408