xapian-core  2.0.0
dlhweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5  * Copyright (C) 2016,2017,2019 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 
26 #include "xapian/error.h"
27 
28 #include <algorithm>
29 #include <cmath>
30 
31 using namespace std;
32 
33 namespace Xapian {
34 
35 DLHWeight *
36 DLHWeight::clone() const
37 {
38  return new DLHWeight();
39 }
40 
41 void
42 DLHWeight::init(double factor)
43 {
44  if (factor == 0.0) {
45  // This object is for the term-independent contribution, and that's
46  // always zero for this scheme.
47  return;
48  }
49 
50  double wdf_upper = get_wdf_upper_bound();
51  if (wdf_upper == 0) {
52  upper_bound = 0.0;
53  return;
54  }
55 
56  const double wdf_lower = 1.0;
57  double len_upper = get_doclength_upper_bound();
58  double len_lower = get_doclength_lower_bound();
59 
60  double F = get_collection_freq();
61 
62  // Calculate constant values to be used in get_sumpart().
63  log_constant = get_total_length() / F;
64  wqf_product_factor = get_wqf() * factor;
65 
66  // Calculate values for the upper bound.
67 
68  // w <= l, so if the allowed ranges overlap, max w/l is 1.0.
69  double max_wdf_over_l = wdf_upper < len_lower ? wdf_upper / len_lower : 1.0;
70 
71  // First term A: w/(w+.5)*log2(w/l*L) where L=total_len/coll_freq
72  // Assume log >= 0:
73  // w/(w+.5) = 1-1/(2w+1) and w >= 1 so max A at w=w_max
74  // log2(w/l*L) maximised when w/l maximised
75  // so max A at w=w_max, l=max(l_min, w_max)
76  // If log < 0 => then A <= 0, so max A <= 0 and we want to minimise the
77  // factor outside the log.
78  double logged_expr = max_wdf_over_l * log_constant;
79  double w_for_A = logged_expr > 1.0 ? wdf_upper : wdf_lower;
80  double A = w_for_A / (w_for_A + 0.5) * log2(logged_expr);
81 
82  // Second term B:
83  //
84  // (l-w)*log2(1-w/l)
85  //
86  // The log is negative, and w <= l so B <= 0 and its maximum is the value
87  // as close to zero as possible. So smaller (l-w) is better and smaller
88  // w/l is better.
89  //
90  // This function is ill defined at w=l, but -> 0 as w -> l.
91  //
92  // If w=l is valid (i.e. len_lower > wdf_upper) then B = 0.
93  double B = 0;
94  if (len_lower > wdf_upper) {
95  // If not, then minimising l-w gives us a candidate (i.e. w=wdf_upper
96  // and l=len_lower).
97  //
98  // The function is also 0 at w = 0 (there must be a local mimina at
99  // some value of w between 0 and l), so the other candidate is at
100  // w=wdf_lower.
101  //
102  // We need to find the optimum value of l in this case, so
103  // differentiate the formula by l:
104  //
105  // d/dl: log2(1-w/l) + (l-w)*(1-w/l)/(l*log(2))
106  // = (log(1-w/l) + (1-w/l)²)/log(2)
107  // = (log(x) + x²)/log(2) [x=1-w/l]
108  //
109  // which is 0 at approx x=0.65291864
110  //
111  // x=1-w/l <=> l*(1-x)=w <=> l=w/(1-x) <=> l ~= 0.34708136*w
112  //
113  // but l >= w so we can't attain that (and the log isn't valid there).
114  //
115  // Gradient at (without loss of generality) l=2*w is:
116  // (log(0.5) + 0.25)/log(2)
117  // which is < 0 so want to minimise l, i.e. l=len_lower, so the other
118  // candidate is w=wdf_lower and l=len_lower.
119  //
120  // So evaluate both candidates and pick the larger:
121  double B1 = (len_lower - wdf_lower) * log2(1.0 - wdf_lower / len_lower);
122  double B2 = (len_lower - wdf_upper) * log2(1.0 - wdf_upper / len_lower);
123  B = max(B1, B2);
124  }
125 
126  /* An upper bound of the term used in the third log can be obtained by:
127  *
128  * 0.5 * log2(2.0 * M_PI * wdf * (1 - wdf / len))
129  *
130  * An upper bound on wdf * (1 - wdf / len) (and hence on the log, since
131  * log is a monotonically increasing function on positive numbers) can
132  * be obtained by plugging in the upper bound of the length and
133  * differentiating the term w.r.t wdf which gives the value of wdf at which
134  * the function attains maximum value - at wdf = len_upper / 2 (or if the
135  * wdf can't be that large, at wdf = wdf_upper): */
136  double wdf_var = min(wdf_upper, len_upper / 2.0);
137  double max_product = wdf_var * (1.0 - wdf_var / len_upper);
138 #if 0
139  /* An upper bound can also be obtained by taking the minimum and maximum
140  * wdf value in the formula as shown (which isn't useful now as it's always
141  * >= the bound above, but could be useful if we tracked bounds on wdf/len):
142  */
143  double min_wdf_to_len = wdf_lower / len_upper;
144  double max_product_2 = wdf_upper * (1.0 - min_wdf_to_len);
145  /* Take the minimum of the two upper bounds. */
146  max_product = min(max_product, max_product_2);
147 #endif
148  double C = 0.5 * log2(2.0 * M_PI * max_product) / (wdf_lower + 0.5);
149  upper_bound = A + B + C;
150 
151  if (rare(upper_bound < 0.0))
152  upper_bound = 0.0;
153  else
154  upper_bound *= wqf_product_factor;
155 }
156 
157 string
159 {
160  return "dlh";
161 }
162 
163 string
164 DLHWeight::serialise() const
165 {
166  return string();
167 }
168 
169 DLHWeight *
170 DLHWeight::unserialise(const string& s) const
171 {
172  if (rare(!s.empty()))
173  throw Xapian::SerialisationError("Extra data in DLHWeight::unserialise()");
174  return new DLHWeight();
175 }
176 
177 double
178 DLHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
180 {
181  if (wdf == 0 || wdf == len) return 0.0;
182 
183  double wdf_to_len = double(wdf) / len;
184  double one_minus_wdf_to_len = 1.0 - wdf_to_len;
185 
186  double wt = wdf * log2(wdf_to_len * log_constant) +
187  (len - wdf) * log2(one_minus_wdf_to_len) +
188  0.5 * log2(2.0 * M_PI * wdf * one_minus_wdf_to_len);
189  if (rare(wt <= 0.0)) return 0.0;
190 
191  return wqf_product_factor * wt / (wdf + 0.5);
192 }
193 
194 double
195 DLHWeight::get_maxpart() const
196 {
197  return upper_bound;
198 }
199 
200 DLHWeight *
201 DLHWeight::create_from_parameters(const char * p) const
202 {
203  if (*p != '\0')
204  throw InvalidArgumentError("No parameters are required for DLHWeight");
205  return new Xapian::DLHWeight();
206 }
207 
208 }
char name[9]
Definition: dbcheck.cc:57
Definition: unittest.cc:650
Definition: unittest.cc:660
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1615
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
#define C(X)
Weighting scheme API.