xapian-core  2.0.0
dphweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5  * Copyright (C) 2016,2017 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 
26 #include "xapian/error.h"
27 
28 #include <algorithm>
29 #include <cmath>
30 
31 using namespace std;
32 
33 namespace Xapian {
34 
35 DPHWeight *
36 DPHWeight::clone() const
37 {
38  return new DPHWeight();
39 }
40 
41 void
42 DPHWeight::init(double factor)
43 {
44  if (factor == 0.0) {
45  // This object is for the term-independent contribution, and that's
46  // always zero for this scheme.
47  return;
48  }
49 
50  double F = get_collection_freq();
51  double wdf_lower = 1.0;
52  double wdf_upper = get_wdf_upper_bound();
53 
54  double len_upper = get_doclength_upper_bound();
55 
56  if (wdf_upper == 0) {
57  upper_bound = 0.0;
58  return;
59  }
60 
61  double min_wdf_to_len = wdf_lower / len_upper;
62 
63  /* Calculate constant value to be used in get_sumpart(). */
64  log_constant = log2(get_total_length() / F);
65  wqf_product_factor = get_wqf() * factor;
66 
67  // Calculate the upper bound on the weight.
68 
69  /* Calculations to decide the values to be used for calculating upper bound. */
70  /* The upper bound of the term appearing in the second log is obtained
71  by taking the minimum and maximum wdf value in the formula as shown. */
72  double max_product_1 = wdf_upper * (1.0 - min_wdf_to_len);
73  /* A second upper bound of the term can be obtained by plugging in the
74  upper bound of the length and differentiating the term w.r.t wdf
75  to find the value of wdf at which function attains maximum value. */
76  double wdf_var = min(wdf_upper, len_upper / 2.0);
77  double max_product_2 = wdf_var * (1.0 - wdf_var / len_upper);
78  /* Take the minimum of the two upper bounds. */
79  double max_product = min(max_product_1, max_product_2);
80 
81  // Maximization of the product of wdf and normalized wdf.
82  /* The expression is (wdf * (1.0 - wdf / len) * (1.0 - wdf / len)) /
83  (wdf + 1.0). */
84  /* Now, assuming len to be len_upper for the purpose of maximization,
85  (d)/(dx) (x * (1 - x / c) * (1 - x / c)) / (x+1) =
86  ((c - x) * (c - x * (2 * x + 3))) / (c² * (x + 1)²)
87  Thus, if (c - x * (2 * x + 3)) is positive, the differentiation
88  value will be positive and hence the function will be an
89  increasing function. By finding the positive root of the equation
90  2 * x² + 3 * x - c = 0, we get the value of x(wdf)
91  at which the differentiation value turns to negative from positive,
92  and hence, the function will have maximum value for that value of wdf. */
93  double wdf_root = 0.25 * (sqrt(8.0 * len_upper + 9.0) - 3.0);
94 
95  // If wdf_root outside valid range, use nearest value in range.
96  if (wdf_root > wdf_upper) {
97  wdf_root = wdf_upper;
98  } else if (wdf_root < wdf_lower) {
99  wdf_root = wdf_lower;
100  }
101 
102  double x = 1 - wdf_root / len_upper;
103  double x_squared = x * x;
104  auto max_wdf_product_normalization = wdf_root / (wdf_root + 1) * x_squared;
105 
106  double max_weight = max_wdf_product_normalization *
107  (log_constant + (0.5 * log2(2 * M_PI * max_product)));
108 
109  upper_bound = wqf_product_factor * max_weight;
110  if (rare(upper_bound < 0.0)) upper_bound = 0.0;
111 }
112 
113 string
115 {
116  return "dph";
117 }
118 
119 string
120 DPHWeight::serialise() const
121 {
122  return string();
123 }
124 
125 DPHWeight *
126 DPHWeight::unserialise(const string& s) const
127 {
128  if (rare(!s.empty()))
129  throw Xapian::SerialisationError("Extra data in DPHWeight::unserialise()");
130  return new DPHWeight();
131 }
132 
133 double
134 DPHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
136 {
137  if (wdf == 0 || wdf == len) return 0.0;
138 
139  double wdf_to_len = double(wdf) / len;
140 
141  double x = 1 - wdf_to_len;
142  double normalization = x * x / (wdf + 1);
143 
144  double wt = normalization *
145  (wdf * (log2(wdf_to_len) + log_constant) +
146  (0.5 * log2(2 * M_PI * wdf * (1 - wdf_to_len))));
147  if (rare(wt <= 0.0)) return 0.0;
148 
149  return wqf_product_factor * wt;
150 }
151 
152 double
153 DPHWeight::get_maxpart() const
154 {
155  return upper_bound;
156 }
157 
158 DPHWeight *
159 DPHWeight::create_from_parameters(const char * p) const
160 {
161  if (*p != '\0')
162  throw InvalidArgumentError("No parameters are required for DPHWeight");
163  return new Xapian::DPHWeight();
164 }
165 
166 }
char name[9]
Definition: dbcheck.cc:57
This class implements the DPH weighting scheme.
Definition: weight.h:1826
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
Weighting scheme API.