40 LMWeight::clone()
const {
41 return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
45 LMWeight::init(
double factor_)
48 weight_collection = factor_;
55 if (param_log == 0.0) {
56 param_log = get_doclength_upper_bound();
64 if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
65 select_smoothing == TWO_STAGE_SMOOTHING) {
66 if (param_smoothing1 == 0.7) {
67 if (get_query_length() <= 2) {
68 param_smoothing1 = 0.1;
78 if (select_smoothing == DIRICHLET_SMOOTHING) {
79 if (param_smoothing1 == 0.7) {
80 param_smoothing1 = 2000;
86 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
87 if (param_smoothing1 == 0.7) {
88 param_smoothing1 = 2000;
96 return "Xapian::LMWeight";
100 LMWeight::serialise()
const
103 result +=
static_cast<unsigned char>(select_smoothing);
110 LMWeight::unserialise(
const string & s)
const
112 const char *ptr = s.data();
113 const char *end = ptr + s.size();
118 if (
rare(ptr != end))
120 return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
128 double wdf_double = wdf;
130 double len_double = len;
140 double wt_coll = get_collection_freq() / double(get_total_length());
143 if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
147 double weight_document = wdf_double / len_double;
148 weight_sum = (param_smoothing1 * wt_coll) +
149 ((1 - param_smoothing1) * weight_document);
150 }
else if (select_smoothing == DIRICHLET_SMOOTHING) {
151 weight_sum = (wdf_double + (param_smoothing1 * wt_coll)) /
152 (len_double + param_smoothing1);
153 }
else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
161 weight_sum = (1 + (wdf_double / (param_smoothing1 * wt_coll))) *
162 (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
163 }
else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
164 double uniqterm_double = uniqterm;
165 weight_sum = ((((wdf_double - param_smoothing1) > 0) ? (wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * wt_coll * uniqterm_double) / len_double);
167 weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * wt_coll)) / (len_double + param_smoothing2)) + (param_smoothing1 * wt_coll));
176 double product = weight_sum * param_log;
178 return (product > 1.0) ? weight_collection * log(product) : 0;
182 LMWeight::get_maxpart()
const
187 double wdf_max = get_wdf_upper_bound();
195 double wt_coll = get_collection_freq() / double(get_total_length());
198 if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
199 upper_bound = (param_smoothing1 * wt_coll) + (1 - param_smoothing1);
200 }
else if (select_smoothing == DIRICHLET_SMOOTHING) {
201 upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * wt_coll)) / (get_doclength_upper_bound() + param_smoothing1);
202 }
else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
203 upper_bound = (1 + (wdf_max / (param_smoothing1 * wt_coll))) *
204 (1 + (param_smoothing2 / (param_smoothing1 * wt_coll)));
205 }
else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
206 upper_bound = param_smoothing1 * wt_coll + 1;
208 upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * wt_coll)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * wt_coll));
214 double product = upper_bound * param_log;
216 return (product > 1.0) ? weight_collection * log(product) : 1.0;
229 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
230 double extra_weight = param_smoothing1 / (len + param_smoothing1);
231 return get_query_length() * log(extra_weight);
237 LMWeight::get_maxextra()
const
239 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
240 double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1);
241 return get_query_length() * log(extra_weight);
Xapian::Weight subclass implementing the Language Model formula.
Indicates an error in the std::string serialisation of an object.
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
Hierarchy of classes which Xapian can throw as exceptions.
The Xapian namespace contains public interfaces for the Xapian library.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Various assertion macros.
std::string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Functions to serialise and unserialise a double.