xapian-core  1.4.27
dlhweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5  * Copyright (C) 2016,2017,2019 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 
26 #include "xapian/error.h"
27 #include "common/log2.h"
28 #include <algorithm>
29 
30 using namespace std;
31 
32 namespace Xapian {
33 
34 DLHWeight *
35 DLHWeight::clone() const
36 {
37  return new DLHWeight();
38 }
39 
40 void
41 DLHWeight::init(double factor)
42 {
43  // Avoid warnings about unused private member.
44  (void)lower_bound;
45 
46  if (factor == 0.0) {
47  // This object is for the term-independent contribution, and that's
48  // always zero for this scheme.
49  return;
50  }
51 
52  double wdf_upper = get_wdf_upper_bound();
53  if (wdf_upper == 0) {
54  upper_bound = 0.0;
55  return;
56  }
57 
58  const double wdf_lower = 1.0;
59  double len_upper = get_doclength_upper_bound();
60  double len_lower = get_doclength_lower_bound();
61 
62  double F = get_collection_freq();
63 
64  // Calculate constant values to be used in get_sumpart().
65  log_constant = get_total_length() / F;
66  wqf_product_factor = get_wqf() * factor;
67 
68  // Calculate values for the upper bound.
69 
70  // w <= l, so if the allowed ranges overlap, max w/l is 1.0.
71  double max_wdf_over_l = wdf_upper < len_lower ? wdf_upper / len_lower : 1.0;
72 
73  // First term A: w/(w+.5)*log2(w/l*L) where L=total_len/coll_freq
74  // Assume log >= 0:
75  // w/(w+.5) = 1-1/(2w+1) and w >= 1 so max A at w=w_max
76  // log2(w/l*L) maximised when w/l maximised
77  // so max A at w=w_max, l=max(l_min, w_max)
78  // If log < 0 => then A <= 0, so max A <= 0 and we want to minimise the
79  // factor outside the log.
80  double logged_expr = max_wdf_over_l * log_constant;
81  double w_for_A = logged_expr > 1.0 ? wdf_upper : wdf_lower;
82  double A = w_for_A / (w_for_A + 0.5) * log2(logged_expr);
83 
84  // Second term B:
85  //
86  // (l-w)*log2(1-w/l)
87  //
88  // The log is negative, and w <= l so B <= 0 and its maximum is the value
89  // as close to zero as possible. So smaller (l-w) is better and smaller
90  // w/l is better.
91  //
92  // This function is ill defined at w=l, but -> 0 as w -> l.
93  //
94  // If w=l is valid (i.e. len_lower > wdf_upper) then B = 0.
95  double B = 0;
96  if (len_lower > wdf_upper) {
97  // If not, then minimising l-w gives us a candidate (i.e. w=wdf_upper
98  // and l=len_lower).
99  //
100  // The function is also 0 at w = 0 (there must be a local mimina at
101  // some value of w between 0 and l), so the other candidate is at
102  // w=wdf_lower.
103  //
104  // We need to find the optimum value of l in this case, so
105  // differentiate the formula by l:
106  //
107  // d/dl: log2(1-w/l) + (l-w)*(1-w/l)/(l*log(2))
108  // = (log(1-w/l) + (1-w/l)²)/log(2)
109  // = (log(x) + x²)/log(2) [x=1-w/l]
110  //
111  // which is 0 at approx x=0.65291864
112  //
113  // x=1-w/l <=> l*(1-x)=w <=> l=w/(1-x) <=> l ~= 0.34708136*w
114  //
115  // but l >= w so we can't attain that (and the log isn't valid there).
116  //
117  // Gradient at (without loss of generality) l=2*w is:
118  // (log(0.5) + 0.25)/log(2)
119  // which is < 0 so want to minimise l, i.e. l=len_lower, so the other
120  // candidate is w=wdf_lower and l=len_lower.
121  //
122  // So evaluate both candidates and pick the larger:
123  double B1 = (len_lower - wdf_lower) * log2(1.0 - wdf_lower / len_lower);
124  double B2 = (len_lower - wdf_upper) * log2(1.0 - wdf_upper / len_lower);
125  B = max(B1, B2);
126  }
127 
128  /* An upper bound of the term used in the third log can be obtained by:
129  *
130  * 0.5 * log2(2.0 * M_PI * wdf * (1 - wdf / len))
131  *
132  * An upper bound on wdf * (1 - wdf / len) (and hence on the log, since
133  * log is a monotonically increasing function on positive numbers) can
134  * be obtained by plugging in the upper bound of the length and
135  * differentiating the term w.r.t wdf which gives the value of wdf at which
136  * the function attains maximum value - at wdf = len_upper / 2 (or if the
137  * wdf can't be that large, at wdf = wdf_upper): */
138  double wdf_var = min(wdf_upper, len_upper / 2.0);
139  double max_product = wdf_var * (1.0 - wdf_var / len_upper);
140 #if 0
141  /* An upper bound can also be obtained by taking the minimum and maximum
142  * wdf value in the formula as shown (which isn't useful now as it's always
143  * >= the bound above, but could be useful if we tracked bounds on wdf/len):
144  */
145  double min_wdf_to_len = wdf_lower / len_upper;
146  double max_product_2 = wdf_upper * (1.0 - min_wdf_to_len);
147  /* Take the minimum of the two upper bounds. */
148  max_product = min(max_product, max_product_2);
149 #endif
150  double C = 0.5 * log2(2.0 * M_PI * max_product) / (wdf_lower + 0.5);
151  upper_bound = A + B + C;
152 
153  if (rare(upper_bound < 0.0))
154  upper_bound = 0.0;
155  else
156  upper_bound *= wqf_product_factor;
157 }
158 
159 string
161 {
162  return "Xapian::DLHWeight";
163 }
164 
165 string
166 DLHWeight::serialise() const
167 {
168  return string();
169 }
170 
171 DLHWeight *
172 DLHWeight::unserialise(const string& s) const
173 {
174  if (rare(!s.empty()))
175  throw Xapian::SerialisationError("Extra data in DLHWeight::unserialise()");
176  return new DLHWeight();
177 }
178 
179 double
180 DLHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
181  Xapian::termcount) const
182 {
183  if (wdf == 0 || wdf == len) return 0.0;
184 
185  double wdf_to_len = double(wdf) / len;
186  double one_minus_wdf_to_len = 1.0 - wdf_to_len;
187 
188  double wt = wdf * log2(wdf_to_len * log_constant) +
189  (len - wdf) * log2(one_minus_wdf_to_len) +
190  0.5 * log2(2.0 * M_PI * wdf * one_minus_wdf_to_len);
191  if (rare(wt <= 0.0)) return 0.0;
192 
193  return wqf_product_factor * wt / (wdf + 0.5);
194 }
195 
196 double
197 DLHWeight::get_maxpart() const
198 {
199  return upper_bound;
200 }
201 
202 double
203 DLHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
204 {
205  return 0;
206 }
207 
208 double
209 DLHWeight::get_maxextra() const
210 {
211  return 0;
212 }
213 
214 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
Definition: unittest.cc:678
STL namespace.
Definition: unittest.cc:668
#define rare(COND)
Definition: config.h:575
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1136
Weighting scheme API.
double log2(double x)
Definition: log2.h:31
char name[9]
Definition: dbcheck.cc:55
Defines a log2() function to find the logarithm to base 2 if not already defined in the library...
#define C(X)