43 static TfIdfWeight::wdf_norm
46 if (normalizations.length() != 3)
48 switch (normalizations[0]) {
52 return TfIdfWeight::wdf_norm::SQUARE;
54 return TfIdfWeight::wdf_norm::LOG;
56 return TfIdfWeight::wdf_norm::PIVOTED;
58 return TfIdfWeight::wdf_norm::LOG_AVERAGE;
62 return TfIdfWeight::wdf_norm::MAX;
64 return TfIdfWeight::wdf_norm::AUG;
69 static TfIdfWeight::idf_norm
72 if (normalizations.length() != 3)
74 switch (normalizations[1]) {
78 return TfIdfWeight::idf_norm::SQUARE;
80 return TfIdfWeight::idf_norm::FREQ;
82 return TfIdfWeight::idf_norm::PIVOTED;
84 return TfIdfWeight::idf_norm::PROB;
86 return TfIdfWeight::idf_norm::TFIDF;
91 static TfIdfWeight::wt_norm
94 if (normalizations.length() != 3)
96 switch (normalizations[2]) {
103 TfIdfWeight::TfIdfWeight(
const std::string& normals,
104 double slope,
double delta)
113 double slope,
double delta)
114 : wdf_norm_(wdf_normalization), idf_norm_(idf_normalization),
115 wt_norm_(wt_normalization), param_slope(slope), param_delta(delta)
161 if (factor_ == 0.0) {
182 result +=
static_cast<unsigned char>(
wdf_norm_);
183 result +=
static_cast<unsigned char>(
idf_norm_);
184 result +=
static_cast<unsigned char>(
wt_norm_);
191 const char *ptr = s.data();
192 const char *end = ptr + s.size();
195 if (
rare(end - ptr != 3))
197 (
"Incorrect data in TfIdfWeight::unserialise()");
201 return new TfIdfWeight(wdf_normalization, idf_normalization,
202 wt_normalization, slope, delta);
233 switch (wdf_normalization) {
235 if (wdf == 0)
return 0;
240 if (wdf == 0)
return 0;
241 return (1 + log(
double(wdf)));
243 if (wdf == 0)
return 0;
246 return ((1 + log(1 + log(
double(wdf)))) * norm_factor +
param_delta);
249 if (wdf == 0)
return 0;
250 double uniqterm_double = uniqterms;
251 double doclen_double = doclen;
253 if (doclen_double == 0 || uniqterm_double == 0)
256 wdf_avg = doclen_double / uniqterm_double;
257 double num = 1 + log(
double(wdf));
258 double den = 1 + log(wdf_avg);
262 if (wdf == 0)
return 0;
263 return (0.2 + 0.8 * log(1.0 + wdf));
266 if (wdf == 0)
return 0;
267 return (sqrt(wdf - 0.5) + 1);
270 if (wdf == 0)
return 0;
271 return 0.9 + 0.1 * (double(wdf) / (double(doclen) / uniqterms));
274 if (
rare(wdfdocmax == 0))
return 0;
275 return double(wdf) / wdfdocmax;
277 if (wdf == 0)
return 0;
278 return 0.5 + 0.5 * (double(wdf) / wdfdocmax);
298 switch (idf_normalization) {
303 if (N == termfreq)
return 0;
304 return log((N - termfreq) / termfreq);
306 return (1.0 / termfreq);
308 double x = log(N / termfreq);
312 return log((N + 1) / termfreq);
315 return (
double(collfreq) / termfreq);
319 return log(
double(collfreq) / termfreq + 1);
323 return (
double(collfreq) / termfreq + 1);
327 return sqrt(
double(collfreq) / termfreq - 0.9);
332 return (log(N / termfreq));
338 (void)wt_normalization;
352 const char*
p = params;
361 if (code < 0 && s.size() == 3 && *
p ==
'\0') {
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Indicates an error in the std::string serialisation of an object.
Xapian::Weight subclass implementing the tf-idf weighting scheme.
double wqf_factor
The factor to multiply with the weight.
idf_norm
Idf normalizations.
@ GLOBAL_FREQ
Global frequency IDF.
@ LOG_GLOBAL_FREQ
Log global frequency IDF.
@ SQRT_GLOBAL_FREQ
Square root global frequency IDF.
@ INCREMENTED_GLOBAL_FREQ
Incremented global frequency IDF.
double get_wdfn(Xapian::termcount wdf, Xapian::termcount len, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax, wdf_norm wdf_normalization) const
TfIdfWeight * create_from_parameters(const char *params) const
Create from a human-readable parameter string.
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
wt_norm wt_norm_
The parameter for normalization for the document weight.
TfIdfWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm, Xapian::termcount wdfdocmax) const
Calculate the weight contribution for this object's term to a document.
void init(double factor)
Allow the subclass to perform any initialisation it needs to.
TfIdfWeight * clone() const
Clone this object.
wdf_norm wdf_norm_
The parameter for normalization for the wdf.
wt_norm
Weight normalizations.
idf_norm idf_norm_
The parameter for normalization for the idf.
double idfn
Normalised IDF value (document-independent).
std::string name() const
Return the name of this weighting scheme.
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
double get_idfn(idf_norm idf_normalization) const
wdf_norm
Wdf normalizations.
@ LOG_AVERAGE
Log average.
@ AUG_AVERAGE
Augmented average term frequency.
std::string serialise() const
Return this object's parameters serialised as a single string.
double get_wtn(double wt, wt_norm wt_normalization) const
double param_slope
Parameters slope and delta in the Piv+ normalization weighting formula.
static void parameter_error(const char *msg, const std::string &scheme, const char *params)
static bool param_name(const char **p, std::string &name)
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Xapian::doclength get_average_length() const
The average length of a document in the collection.
@ WDF_DOC_MAX
Maximum wdf in the current document.
@ UNIQUE_TERMS
Number of unique terms in the current document.
@ AVERAGE_LENGTH
Average length of documents in the collection.
@ DOC_LENGTH_MAX
Upper bound on document lengths.
@ DOC_LENGTH
Length of the current document (sum wdf).
@ TERMFREQ
How many documents the current term is in.
@ WQF
Within-query-frequency of the current term.
@ COLLECTION_SIZE
Number of documents in the collection.
@ WDF_MAX
Upper bound on wdf.
@ DOC_LENGTH_MIN
Lower bound on (non-zero) document lengths.
@ COLLECTION_FREQ
Sum of wdf over the whole collection for the current term.
@ WDF
Within-document-frequency of the current term in the current document.
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Hierarchy of classes which Xapian can throw as exceptions.
Map string to idf normalisation code.
static const unsigned char idf_norm_tab[]
int keyword(const unsigned char *p, const char *s, size_t len)
Efficient keyword to enum lookup.
The Xapian namespace contains public interfaces for the Xapian library.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
static TfIdfWeight::wdf_norm decode_wdf_norm(const string &normalizations)
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
static TfIdfWeight::idf_norm decode_idf_norm(const string &normalizations)
static void parameter_error(const char *message, const char *params)
static TfIdfWeight::wt_norm decode_wt_norm(const string &normalizations)
Various assertion macros.
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Functions to serialise and unserialise a double.
Map string to wdf normalisation code.
static const unsigned char wdf_norm_tab[]
Xapian::Weight::Internal class, holding database and term statistics.