xapian-core  1.4.21
lmweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2012 Gaurav Arora
5  * Copyright (C) 2016 Olly Betts
6  * Copyright (C) 2016 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "xapian/weight.h"
26 
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "serialise-double.h"
30 
31 #include "xapian/error.h"
32 
33 #include <cmath>
34 
35 using namespace std;
36 
37 namespace Xapian {
38 
39 LMWeight *
40 LMWeight::clone() const {
41  return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
42 }
43 
44 void
45 LMWeight::init(double factor_)
46 {
47  // weight_collection is really factor.
48  weight_collection = factor_;
49 
50  /* Setting default values of the param_log to handle negative value of log.
51  * It is considered to be upperbound of document length.
52  * initializing param_log to upperbound of document_length.
53  */
54 
55  if (param_log == 0.0) {
56  param_log = get_doclength_upper_bound();
57  }
58 
59  /* Since the optimal parameter for Jelinek mercer smoothing
60  * is based on query length, so if query is title query changing
61  * default value of smoothing parameter.
62  */
63 
64  if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
65  select_smoothing == TWO_STAGE_SMOOTHING) {
66  if (param_smoothing1 == 0.7) {
67  if (get_query_length() <= 2) {
68  param_smoothing1 = 0.1;
69  }
70  }
71  }
72 
73  /* param_smoothing1 default value should be 2000 in case
74  * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
75  * if user supply his own value for param_smoothing1 value will not be set
76  * to 2000(default value)
77  */
78  if (select_smoothing == DIRICHLET_SMOOTHING) {
79  if (param_smoothing1 == 0.7) {
80  param_smoothing1 = 2000;
81  }
82  }
83 
84  /* Setting param_smoothing1 and param_smoothing2 default value to used when
85  * DIRICHLET_PLUS_SMOOTHING is selected.*/
86  if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
87  if (param_smoothing1 == 0.7) {
88  param_smoothing1 = 2000;
89  }
90  }
91 }
92 
93 string
95 {
96  return "Xapian::LMWeight";
97 }
98 
99 string
100 LMWeight::serialise() const
101 {
102  string result = serialise_double(param_log);
103  result += static_cast<unsigned char>(select_smoothing);
104  result += serialise_double(param_smoothing1);
105  result += serialise_double(param_smoothing2);
106  return result;
107 }
108 
109 LMWeight *
110 LMWeight::unserialise(const string & s) const
111 {
112  const char *ptr = s.data();
113  const char *end = ptr + s.size();
114  double param_log_ = unserialise_double(&ptr, end);
115  type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
116  double param_smoothing1_ = unserialise_double(&ptr, end);
117  double param_smoothing2_ = unserialise_double(&ptr, end);
118  if (rare(ptr != end))
119  throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
120  return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
121 }
122 
123 double
124 LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
125  Xapian::termcount uniqterm) const
126 {
127  // Within Document Frequency of the term in document being considered.
128  double wdf_double = wdf;
129  // Length of the Document in terms of number of terms.
130  double len_double = len;
131  // variable to store weight contribution of term in the document scoring for LM.
132  double weight_sum;
133 
134  /* In case the within document frequency of term is zero smoothing will
135  * be required and should be return instead of returning zero, as returning
136  * LM score are multiplication of contribution of all terms, due to absence
137  * of single term whole document is scored zero, hence apply collection
138  * frequency smoothing.
139  */
140  double wt_coll = get_collection_freq() / double(get_total_length());
141 
142  // Calculating weights considering different smoothing option available to user.
143  if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
144  /* Maximum likelihood of current term, weight contribution of term in
145  * case query term is present in the document.
146  */
147  double weight_document = wdf_double / len_double;
148  weight_sum = (param_smoothing1 * wt_coll) +
149  ((1 - param_smoothing1) * weight_document);
150  } else if (select_smoothing == DIRICHLET_SMOOTHING) {
151  weight_sum = (wdf_double + (param_smoothing1 * wt_coll)) /
152  (len_double + param_smoothing1);
153  } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
154  /* In the Dir+ weighting formula, sumpart weight contribution is :-
155  *
156  * sum of log of (1 + (wdf/(param_smoothing1 * wt_coll))) and
157  * log of (1 + (delta/param_smoothing1 * wt_coll))).
158  * Since, sum of logs is log of product so weight_sum is calculated as product
159  * of terms in log in the Dir+ formula.
160  */
161  weight_sum = (1 + (wdf_double / (param_smoothing1 * wt_coll))) *
162  (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
163  } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
164  double uniqterm_double = uniqterm;
165  weight_sum = ((((wdf_double - param_smoothing1) > 0) ? (wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * wt_coll * uniqterm_double) / len_double);
166  } else {
167  weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * wt_coll)) / (len_double + param_smoothing2)) + (param_smoothing1 * wt_coll));
168  }
169 
170  /* Since LM score is calculated with multiplication, instead of changing
171  * the current implementation log trick have been used to calculate the
172  * product since (sum of log is log of product and since aim is ranking
173  * ranking document by product or log of product won't make a large
174  * difference hence log(product) will be used for ranking.
175  */
176  double product = weight_sum * param_log;
177  // weight_collection is really factor.
178  return (product > 1.0) ? weight_collection * log(product) : 0;
179 }
180 
181 double
182 LMWeight::get_maxpart() const
183 {
184  // Variable to store the collection frequency
185  double upper_bound;
186  // Store upper bound on wdf in variable wdf_max
187  double wdf_max = get_wdf_upper_bound();
188 
189  /* In case the within document frequency of term is zero smoothing will
190  * be required and should be return instead of returning zero, as
191  * returning LM score are multiplication of contribution of all terms,
192  * due to absence of single term whole document is scored zero, hence
193  * apply collection frequency smoothing.
194  */
195  double wt_coll = get_collection_freq() / double(get_total_length());
196 
197  // Calculating upper bound considering different smoothing option available to user.
198  if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
199  upper_bound = (param_smoothing1 * wt_coll) + (1 - param_smoothing1);
200  } else if (select_smoothing == DIRICHLET_SMOOTHING) {
201  upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * wt_coll)) / (get_doclength_upper_bound() + param_smoothing1);
202  } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
203  upper_bound = (1 + (wdf_max / (param_smoothing1 * wt_coll))) *
204  (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
205  } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
206  upper_bound = param_smoothing1 * wt_coll + 1;
207  } else {
208  upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * wt_coll)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * wt_coll));
209  }
210 
211  /* Since weight are calculated using log trick, using same with the bounds. Refer
212  * comment in get_sumpart for the details.
213  */
214  double product = upper_bound * param_log;
215  // weight_collection is really factor.
216  return (product > 1.0) ? weight_collection * log(product) : 1.0;
217 }
218 
219 /* The extra weight component in the Dir+ formula is :-
220  *
221  * |Q| * log (param_smoothing1 / (|D| + param_smoothing1))
222  *
223  * where, |Q| is total query length.
224  * |D| is total document length.
225  */
226 double
227 LMWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
228 {
229  if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
230  double extra_weight = param_smoothing1 / (len + param_smoothing1);
231  return get_query_length() * log(extra_weight);
232  }
233  return 0;
234 }
235 
236 double
237 LMWeight::get_maxextra() const
238 {
239  if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
240  double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1);
241  return get_query_length() * log(extra_weight);
242  }
243  return 0;
244 }
245 
246 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
STL namespace.
#define rare(COND)
Definition: config.h:573
Hierarchy of classes which Xapian can throw as exceptions.
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
Definition: weight.h:147
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
functions to serialise and unserialise a double
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
Weighting scheme API.
std::string serialise_double(double v)
Serialise a double to a string.
char name[9]
Definition: dbcheck.cc:55
Various assertion macros.
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1406
Debug logging macros.