xapian-core  1.4.27
dphweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5  * Copyright (C) 2016,2017 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 
26 #include "xapian/error.h"
27 #include "common/log2.h"
28 #include <algorithm>
29 #include <cmath>
30 
31 using namespace std;
32 
33 namespace Xapian {
34 
35 DPHWeight *
36 DPHWeight::clone() const
37 {
38  return new DPHWeight();
39 }
40 
41 void
42 DPHWeight::init(double factor)
43 {
44  // Avoid warnings about unused private member.
45  (void)lower_bound;
46 
47  if (factor == 0.0) {
48  // This object is for the term-independent contribution, and that's
49  // always zero for this scheme.
50  return;
51  }
52 
53  double F = get_collection_freq();
54  double wdf_lower = 1.0;
55  double wdf_upper = get_wdf_upper_bound();
56 
57  double len_upper = get_doclength_upper_bound();
58 
59  if (wdf_upper == 0) {
60  upper_bound = 0.0;
61  return;
62  }
63 
64  double min_wdf_to_len = wdf_lower / len_upper;
65 
66  /* Calculate constant value to be used in get_sumpart(). */
67  log_constant = log2(get_total_length() / F);
68  wqf_product_factor = get_wqf() * factor;
69 
70  // Calculate the upper bound on the weight.
71 
72  /* Calculations to decide the values to be used for calculating upper bound. */
73  /* The upper bound of the term appearing in the second log is obtained
74  by taking the minimum and maximum wdf value in the formula as shown. */
75  double max_product_1 = wdf_upper * (1.0 - min_wdf_to_len);
76  /* A second upper bound of the term can be obtained by plugging in the
77  upper bound of the length and differentiating the term w.r.t wdf
78  to find the value of wdf at which function attains maximum value. */
79  double wdf_var = min(wdf_upper, len_upper / 2.0);
80  double max_product_2 = wdf_var * (1.0 - wdf_var / len_upper);
81  /* Take the minimum of the two upper bounds. */
82  double max_product = min(max_product_1, max_product_2);
83 
84  // Maximization of the product of wdf and normalized wdf.
85  /* The expression is (wdf * (1.0 - wdf / len) * (1.0 - wdf / len)) /
86  (wdf + 1.0). */
87  /* Now, assuming len to be len_upper for the purpose of maximization,
88  (d)/(dx) (x * (1 - x / c) * (1 - x / c)) / (x+1) =
89  ((c - x) * (c - x * (2 * x + 3))) / (c² * (x + 1)²)
90  Thus, if (c - x * (2 * x + 3)) is positive, the differentiation
91  value will be positive and hence the function will be an
92  increasing function. By finding the positive root of the equation
93  2 * x² + 3 * x - c = 0, we get the value of x(wdf)
94  at which the differentiation value turns to negative from positive,
95  and hence, the function will have maximum value for that value of wdf. */
96  double wdf_root = 0.25 * (sqrt(8.0 * len_upper + 9.0) - 3.0);
97 
98  // If wdf_root outside valid range, use nearest value in range.
99  if (wdf_root > wdf_upper) {
100  wdf_root = wdf_upper;
101  } else if (wdf_root < wdf_lower) {
102  wdf_root = wdf_lower;
103  }
104 
105  double x = 1 - wdf_root / len_upper;
106  double x_squared = x * x;
107  auto max_wdf_product_normalization = wdf_root / (wdf_root + 1) * x_squared;
108 
109  double max_weight = max_wdf_product_normalization *
110  (log_constant + (0.5 * log2(2 * M_PI * max_product)));
111 
112  upper_bound = wqf_product_factor * max_weight;
113  if (rare(upper_bound < 0.0)) upper_bound = 0.0;
114 }
115 
116 string
118 {
119  return "Xapian::DPHWeight";
120 }
121 
122 string
123 DPHWeight::serialise() const
124 {
125  return string();
126 }
127 
128 DPHWeight *
129 DPHWeight::unserialise(const string& s) const
130 {
131  if (rare(!s.empty()))
132  throw Xapian::SerialisationError("Extra data in DPHWeight::unserialise()");
133  return new DPHWeight();
134 }
135 
136 double
137 DPHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
138  Xapian::termcount) const
139 {
140  if (wdf == 0 || wdf == len) return 0.0;
141 
142  double wdf_to_len = double(wdf) / len;
143 
144  double x = 1 - wdf_to_len;
145  double normalization = x * x / (wdf + 1);
146 
147  double wt = normalization *
148  (wdf * (log2(wdf_to_len) + log_constant) +
149  (0.5 * log2(2 * M_PI * wdf * (1 - wdf_to_len))));
150  if (rare(wt <= 0.0)) return 0.0;
151 
152  return wqf_product_factor * wt;
153 }
154 
155 double
156 DPHWeight::get_maxpart() const
157 {
158  return upper_bound;
159 }
160 
161 double
162 DPHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
163 {
164  return 0;
165 }
166 
167 double
168 DPHWeight::get_maxextra() const
169 {
170  return 0;
171 }
172 
173 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
STL namespace.
#define rare(COND)
Definition: config.h:575
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
Weighting scheme API.
double log2(double x)
Definition: log2.h:31
char name[9]
Definition: dbcheck.cc:55
This class implements the DPH weighting scheme.
Definition: weight.h:1359
Defines a log2() function to find the logarithm to base 2 if not already defined in the library...