xapian-core  1.4.26
ifb2weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013,2014 Aarsh Shah
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "xapian/weight.h"
24 #include "common/log2.h"
25 
26 #include "serialise-double.h"
27 
28 #include "xapian/error.h"
29 
30 using namespace std;
31 
32 namespace Xapian {
33 
34 IfB2Weight::IfB2Weight(double c)
35  : param_c(c)
36 {
37  if (param_c <= 0)
38  throw Xapian::InvalidArgumentError("Parameter c is invalid");
44  need_stat(WDF);
46  need_stat(WQF);
48 }
49 
50 IfB2Weight *
52 {
53  return new IfB2Weight(param_c);
54 }
55 
56 void
57 IfB2Weight::init(double factor)
58 {
59  if (factor == 0.0) {
60  // This object is for the term-independent contribution, and that's
61  // always zero for this scheme.
62  return;
63  }
64 
65  double wdfn_upper = get_wdf_upper_bound();
66  if (wdfn_upper == 0) {
67  upper_bound = 0.0;
68  return;
69  }
70 
71  double F = get_collection_freq();
72  double N = get_collection_size();
73 
74  wdfn_upper *= log2(1 + (param_c * get_average_length()) /
76 
77  // This term is constant for all documents.
78  double idf_max = log2((N + 1.0) / (F + 0.5));
79 
80  /* Calculate constant values to be used in get_sumpart(). */
81  wqf_product_idf = get_wqf() * idf_max * factor;
83  B_constant = (F + 1.0) / get_termfreq();
84 
85  // wdfn * B = wdfn * (F + 1.0) / (get_termfreq() * (wdfn + 1.0)).
86  // By cancelling out wdfn, we get (F + 1.0) / (get_termfreq() * (1.0 + 1.0 / wdfn)).
87  // In order to maximize the product, we need to minimize the denominator, and so we use wdfn_upper.
88  double max_wdfn_product_B = wdfn_upper * B_constant / (wdfn_upper + 1.0);
89 
90  upper_bound = wqf_product_idf * max_wdfn_product_B * factor;
91 }
92 
93 string
95 {
96  return "Xapian::IfB2Weight";
97 }
98 
99 string
101 {
102  return serialise_double(param_c);
103 }
104 
105 IfB2Weight *
106 IfB2Weight::unserialise(const string & s) const
107 {
108  const char *ptr = s.data();
109  const char *end = ptr + s.size();
110  double c = unserialise_double(&ptr, end);
111  if (rare(ptr != end))
112  throw Xapian::SerialisationError("Extra data in IfB2Weight::unserialise()");
113  return new IfB2Weight(c);
114 }
115 
116 double
118  Xapian::termcount) const
119 {
120  if (wdf == 0) return 0.0;
121  double wdfn = wdf;
122  wdfn *= log2(1 + c_product_avlen / len);
123 
124  double wdfn_product_B = wdfn * B_constant / (wdfn + 1.0);
125 
126  return (wqf_product_idf * wdfn_product_B);
127 }
128 
129 double
131 {
132  return upper_bound;
133 }
134 
135 double
137 {
138  return 0;
139 }
140 
141 double
143 {
144  return 0;
145 }
146 
147 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:374
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:389
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:917
STL namespace.
Lower bound on (non-zero) document lengths.
Definition: weight.h:62
#define rare(COND)
Definition: config.h:575
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Definition: ifb2weight.cc:130
functions to serialise and unserialise a double
IfB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: ifb2weight.cc:106
Length of the current document (sum wdf).
Definition: weight.h:56
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
double c_product_avlen
Definition: weight.h:924
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:411
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Definition: weight.h:923
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
Within-query-frequency of the current term.
Definition: weight.h:52
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
Definition: ifb2weight.cc:57
IfB2Weight * clone() const
Clone this object.
Definition: ifb2weight.cc:51
Average length of documents in the collection.
Definition: weight.h:44
std::string name() const
Return the name of this weighting scheme.
Definition: ifb2weight.cc:94
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:395
std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: ifb2weight.cc:100
Sum of wdf over the whole collection for the current term.
Definition: weight.h:76
Weighting scheme API.
Within-document-frequency of the current term in the current document.
Definition: weight.h:54
double B_constant
Definition: weight.h:925
This class implements the IfB2 weighting scheme.
Definition: weight.h:915
Upper bound on wdf.
Definition: weight.h:74
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:383
How many documents the current term is in.
Definition: weight.h:46
double log2(double x)
Definition: log2.h:31
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:380
std::string serialise_double(double v)
Serialise a double to a string.
Number of documents in the collection.
Definition: weight.h:40
Defines a log2() function to find the logarithm to base 2 if not already defined in the library...
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm) const
Calculate the weight contribution for this object's term to a document.
Definition: ifb2weight.cc:117
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:94
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the term-independent weight component for a document.
Definition: ifb2weight.cc:136
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
Definition: ifb2weight.cc:142
double upper_bound
The upper bound on the weight.
Definition: weight.h:920
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:419