xapian-core  2.0.0
ifb2weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2013,2014 Aarsh Shah
5  * Copyright (C) 2024 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "xapian/weight.h"
25 
26 #include "weightinternal.h"
27 
28 #include "serialise-double.h"
29 
30 #include "xapian/error.h"
31 
32 #include <cmath>
33 
34 using namespace std;
35 
36 namespace Xapian {
37 
38 IfB2Weight::IfB2Weight(double c)
39  : param_c(c)
40 {
41  if (param_c <= 0)
42  throw Xapian::InvalidArgumentError("Parameter c is invalid");
48  need_stat(WDF);
50  need_stat(WQF);
52 }
53 
54 IfB2Weight *
56 {
57  return new IfB2Weight(param_c);
58 }
59 
60 void
61 IfB2Weight::init(double factor)
62 {
63  if (factor == 0.0) {
64  // This object is for the term-independent contribution, and that's
65  // always zero for this scheme.
66  return;
67  }
68 
69  double wdfn_upper = get_wdf_upper_bound();
70  if (wdfn_upper == 0) {
71  upper_bound = 0.0;
72  return;
73  }
74 
75  double F = get_collection_freq();
76  double N = get_collection_size();
77 
78  wdfn_upper *= log2(1 + (param_c * get_average_length()) /
80 
81  // This term is constant for all documents.
82  double idf_max = log2((N + 1.0) / (F + 0.5));
83 
84  /* Calculate constant values to be used in get_sumpart(). */
85  wqf_product_idf = get_wqf() * idf_max * factor;
87  B_constant = (F + 1.0) / get_termfreq();
88 
89  // wdfn * B = wdfn * (F + 1.0) / (get_termfreq() * (wdfn + 1.0)).
90  // By cancelling out wdfn, we get (F + 1.0) / (get_termfreq() * (1.0 + 1.0 / wdfn)).
91  // In order to maximize the product, we need to minimize the denominator, and so we use wdfn_upper.
92  double max_wdfn_product_B = wdfn_upper * B_constant / (wdfn_upper + 1.0);
93 
94  upper_bound = wqf_product_idf * max_wdfn_product_B * factor;
95 }
96 
97 string
99 {
100  return "ifb2";
101 }
102 
103 string
105 {
106  return serialise_double(param_c);
107 }
108 
109 IfB2Weight *
110 IfB2Weight::unserialise(const string & s) const
111 {
112  const char *ptr = s.data();
113  const char *end = ptr + s.size();
114  double c = unserialise_double(&ptr, end);
115  if (rare(ptr != end))
116  throw Xapian::SerialisationError("Extra data in IfB2Weight::unserialise()");
117  return new IfB2Weight(c);
118 }
119 
120 double
123 {
124  if (wdf == 0) return 0.0;
125  double wdfn = wdf;
126  wdfn *= log2(1 + c_product_avlen / len);
127 
128  double wdfn_product_B = wdfn * B_constant / (wdfn + 1.0);
129 
130  return (wqf_product_idf * wdfn_product_B);
131 }
132 
133 double
135 {
136  return upper_bound;
137 }
138 
139 [[noreturn]]
140 static inline void
141 parameter_error(const char* message, const char* params)
142 {
143  Xapian::Weight::Internal::parameter_error(message, "ifb2", params);
144 }
145 
146 IfB2Weight*
147 IfB2Weight::create_from_parameters(const char* params) const
148 {
149  const char* p = params;
150  if (*p == '\0')
151  return new Xapian::IfB2Weight();
152  double c = 1.0;
154  parameter_error("Parameter is invalid", params);
155  if (*p)
156  parameter_error("Extra data after parameter", params);
157  return new Xapian::IfB2Weight(c);
158 }
159 
160 }
This class implements the IfB2 weighting scheme.
Definition: weight.h:1397
IfB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: ifb2weight.cc:110
double c_product_avlen
Definition: weight.h:1406
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm, Xapian::termcount wdfdocmax) const
Calculate the weight contribution for this object's term to a document.
Definition: ifb2weight.cc:121
double B_constant
Definition: weight.h:1407
double upper_bound
The upper bound on the weight.
Definition: weight.h:1402
std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: ifb2weight.cc:104
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1399
std::string name() const
Return the name of this weighting scheme, e.g. "ifb2" for IfB2Weight.
Definition: ifb2weight.cc:98
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Definition: ifb2weight.cc:134
IfB2Weight * create_from_parameters(const char *params) const
Create from a human-readable parameter string.
Definition: ifb2weight.cc:147
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Definition: weight.h:1405
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
Definition: ifb2weight.cc:61
IfB2Weight * clone() const
Clone this object.
Definition: ifb2weight.cc:55
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool double_param(const char **p, double *ptr_val)
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:586
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:558
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:183
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:570
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:564
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:549
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:555
@ AVERAGE_LENGTH
Average length of documents in the collection.
Definition: weight.h:47
@ DOC_LENGTH
Length of the current document (sum wdf).
Definition: weight.h:59
@ TERMFREQ
How many documents the current term is in.
Definition: weight.h:49
@ WQF
Within-query-frequency of the current term.
Definition: weight.h:55
@ COLLECTION_SIZE
Number of documents in the collection.
Definition: weight.h:43
@ WDF_MAX
Upper bound on wdf.
Definition: weight.h:81
@ DOC_LENGTH_MIN
Lower bound on (non-zero) document lengths.
Definition: weight.h:65
@ COLLECTION_FREQ
Sum of wdf over the whole collection for the current term.
Definition: weight.h:83
@ WDF
Within-document-frequency of the current term in the current document.
Definition: weight.h:57
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:594
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
static void parameter_error(const char *message, const char *params)
Definition: bb2weight.cc:185
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
Weighting scheme API.
Xapian::Weight::Internal class, holding database and term statistics.