24 #ifndef XAPIAN_INCLUDED_WEIGHT_H 25 #define XAPIAN_INCLUDED_WEIGHT_H 68 DOC_LENGTH_MAX = 1024,
76 COLLECTION_FREQ = 4096,
82 TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
95 stats_needed = stat_flags(stats_needed | flag);
107 virtual void init(
double factor) = 0;
111 void operator=(
const Weight &);
159 TWO_STAGE_SMOOTHING = 1,
160 DIRICHLET_SMOOTHING = 2,
161 ABSOLUTE_DISCOUNT_SMOOTHING = 3,
162 JELINEK_MERCER_SMOOTHING = 4,
163 DIRICHLET_PLUS_SMOOTHING = 5
187 virtual Weight * clone()
const = 0;
202 virtual std::string
name()
const;
209 virtual std::string serialise()
const;
228 virtual Weight * unserialise(
const std::string & serialised)
const;
249 virtual double get_maxpart()
const = 0;
268 virtual double get_maxextra()
const = 0;
303 double factor,
void* postlist);
334 return stats_needed & DOC_LENGTH;
343 return stats_needed & WDF;
353 return stats_needed & UNIQUE_TERMS;
362 return stats_needed == 0 &&
name() ==
"Xapian::BoolWeight";
402 return doclength_upper_bound_;
412 return doclength_lower_bound_;
420 return wdf_upper_bound_;
436 void init(
double factor);
442 std::string
name()
const;
444 std::string serialise()
const;
445 BoolWeight * unserialise(
const std::string & serialised)
const;
450 double get_maxpart()
const;
454 double get_maxextra()
const;
468 void init(
double factor);
474 double get_wtn(
double wt,
char c)
const;
517 explicit TfIdfWeight(
const std::string &normalizations);
521 : normalizations(
"ntn")
526 need_stat(COLLECTION_SIZE);
529 std::string
name()
const;
531 std::string serialise()
const;
532 TfIdfWeight * unserialise(
const std::string & serialised)
const;
537 double get_maxpart()
const;
541 double get_maxextra()
const;
561 void init(
double factor);
591 BM25Weight(
double k1,
double k2,
double k3,
double b,
double min_normlen)
592 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
593 param_min_normlen(min_normlen)
595 if (param_k1 < 0) param_k1 = 0;
596 if (param_k2 < 0) param_k2 = 0;
597 if (param_k3 < 0) param_k3 = 0;
600 }
else if (param_b > 1) {
603 need_stat(COLLECTION_SIZE);
604 need_stat(RSET_SIZE);
606 need_stat(RELTERMFREQ);
609 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
610 need_stat(DOC_LENGTH_MIN);
611 need_stat(AVERAGE_LENGTH);
613 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
614 if (param_k2 != 0) need_stat(QUERY_LENGTH);
615 if (param_k3 != 0) need_stat(WQF);
619 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
620 param_min_normlen(0.5)
622 need_stat(COLLECTION_SIZE);
623 need_stat(RSET_SIZE);
625 need_stat(RELTERMFREQ);
628 need_stat(DOC_LENGTH_MIN);
629 need_stat(AVERAGE_LENGTH);
630 need_stat(DOC_LENGTH);
634 std::string
name()
const;
636 std::string serialise()
const;
637 BM25Weight * unserialise(
const std::string & serialised)
const;
642 double get_maxpart()
const;
646 double get_maxextra()
const;
668 void init(
double factor);
706 double min_normlen,
double delta)
707 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
708 param_min_normlen(min_normlen), param_delta(delta)
710 if (param_k1 < 0) param_k1 = 0;
711 if (param_k2 < 0) param_k2 = 0;
712 if (param_k3 < 0) param_k3 = 0;
713 if (param_delta < 0) param_delta = 0;
716 }
else if (param_b > 1) {
719 need_stat(COLLECTION_SIZE);
720 need_stat(RSET_SIZE);
722 need_stat(RELTERMFREQ);
725 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
726 need_stat(DOC_LENGTH_MIN);
727 need_stat(AVERAGE_LENGTH);
729 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
730 if (param_k2 != 0) need_stat(QUERY_LENGTH);
731 if (param_k3 != 0) need_stat(WQF);
735 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
736 param_min_normlen(0.5), param_delta(1)
738 need_stat(COLLECTION_SIZE);
739 need_stat(RSET_SIZE);
741 need_stat(RELTERMFREQ);
744 need_stat(DOC_LENGTH_MIN);
745 need_stat(AVERAGE_LENGTH);
746 need_stat(DOC_LENGTH);
750 std::string
name()
const;
752 std::string serialise()
const;
753 BM25PlusWeight * unserialise(
const std::string & serialised)
const;
758 double get_maxpart()
const;
762 double get_maxextra()
const;
786 void init(
double factor);
797 if (param_k < 0) param_k = 0;
798 if (param_k != 0.0) {
799 need_stat(AVERAGE_LENGTH);
800 need_stat(DOC_LENGTH);
802 need_stat(COLLECTION_SIZE);
803 need_stat(RSET_SIZE);
805 need_stat(RELTERMFREQ);
806 need_stat(DOC_LENGTH_MIN);
811 std::string
name()
const;
813 std::string serialise()
const;
814 TradWeight * unserialise(
const std::string & serialised)
const;
819 double get_maxpart()
const;
823 double get_maxextra()
const;
857 void init(
double factor);
873 need_stat(AVERAGE_LENGTH);
874 need_stat(DOC_LENGTH);
875 need_stat(DOC_LENGTH_MIN);
876 need_stat(DOC_LENGTH_MAX);
877 need_stat(COLLECTION_SIZE);
884 std::string
name()
const;
886 std::string serialise()
const;
887 InL2Weight * unserialise(
const std::string & serialised)
const;
892 double get_maxpart()
const;
896 double get_maxextra()
const;
929 void init(
double factor);
945 need_stat(AVERAGE_LENGTH);
946 need_stat(DOC_LENGTH);
947 need_stat(DOC_LENGTH_MIN);
948 need_stat(DOC_LENGTH_MAX);
949 need_stat(COLLECTION_SIZE);
950 need_stat(COLLECTION_FREQ);
957 std::string
name()
const;
959 std::string serialise()
const;
960 IfB2Weight * unserialise(
const std::string & serialised)
const;
965 double get_maxpart()
const;
969 double get_maxextra()
const;
1002 void init(
double factor);
1016 need_stat(AVERAGE_LENGTH);
1017 need_stat(DOC_LENGTH);
1018 need_stat(DOC_LENGTH_MIN);
1019 need_stat(DOC_LENGTH_MAX);
1020 need_stat(COLLECTION_SIZE);
1024 need_stat(COLLECTION_FREQ);
1025 need_stat(TERMFREQ);
1028 std::string
name()
const;
1030 std::string serialise()
const;
1031 IneB2Weight * unserialise(
const std::string & serialised)
const;
1036 double get_maxpart()
const;
1040 double get_maxextra()
const;
1076 void init(
double factor);
1092 need_stat(AVERAGE_LENGTH);
1093 need_stat(DOC_LENGTH);
1094 need_stat(DOC_LENGTH_MIN);
1095 need_stat(DOC_LENGTH_MAX);
1096 need_stat(COLLECTION_SIZE);
1097 need_stat(COLLECTION_FREQ);
1101 need_stat(TERMFREQ);
1104 std::string
name()
const;
1106 std::string serialise()
const;
1107 BB2Weight * unserialise(
const std::string & serialised)
const;
1112 double get_maxpart()
const;
1116 double get_maxextra()
const;
1149 void init(
double factor);
1153 need_stat(DOC_LENGTH);
1154 need_stat(COLLECTION_FREQ);
1158 need_stat(DOC_LENGTH_MIN);
1159 need_stat(DOC_LENGTH_MAX);
1160 need_stat(TOTAL_LENGTH);
1163 std::string
name()
const;
1165 std::string serialise()
const;
1166 DLHWeight * unserialise(
const std::string & serialised)
const;
1171 double get_maxpart()
const;
1175 double get_maxextra()
const;
1220 void init(
double factor);
1236 need_stat(AVERAGE_LENGTH);
1237 need_stat(DOC_LENGTH);
1238 need_stat(DOC_LENGTH_MIN);
1239 need_stat(DOC_LENGTH_MAX);
1240 need_stat(COLLECTION_SIZE);
1241 need_stat(COLLECTION_FREQ);
1247 std::string
name()
const;
1249 std::string serialise()
const;
1250 PL2Weight * unserialise(
const std::string & serialised)
const;
1255 double get_maxpart()
const;
1259 double get_maxextra()
const;
1290 void init(
double factor_);
1313 : param_c(1.0), param_delta(0.8) {
1314 need_stat(AVERAGE_LENGTH);
1315 need_stat(DOC_LENGTH);
1316 need_stat(DOC_LENGTH_MIN);
1317 need_stat(DOC_LENGTH_MAX);
1318 need_stat(COLLECTION_SIZE);
1319 need_stat(COLLECTION_FREQ);
1325 std::string
name()
const;
1327 std::string serialise()
const;
1328 PL2PlusWeight * unserialise(
const std::string & serialised)
const;
1333 double get_maxpart()
const;
1337 double get_maxextra()
const;
1372 void init(
double factor);
1377 need_stat(DOC_LENGTH);
1378 need_stat(COLLECTION_FREQ);
1382 need_stat(DOC_LENGTH_MIN);
1383 need_stat(DOC_LENGTH_MAX);
1384 need_stat(TOTAL_LENGTH);
1387 std::string
name()
const;
1389 std::string serialise()
const;
1390 DPHWeight * unserialise(
const std::string & serialised)
const;
1395 double get_maxpart()
const;
1399 double get_maxextra()
const;
1433 void init(
double factor);
1469 type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1470 double param_smoothing1_ = -1.0,
1471 double param_smoothing2_ = -1.0)
1472 : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1473 param_smoothing2(param_smoothing2_)
1475 if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1476 if (param_smoothing2 < 0) {
1477 if (select_smoothing == TWO_STAGE_SMOOTHING)
1478 param_smoothing2 = 2000.0;
1480 param_smoothing2 = 0.05;
1482 need_stat(DOC_LENGTH);
1483 need_stat(RSET_SIZE);
1484 need_stat(TERMFREQ);
1485 need_stat(RELTERMFREQ);
1486 need_stat(DOC_LENGTH_MAX);
1489 need_stat(COLLECTION_FREQ);
1490 need_stat(TOTAL_LENGTH);
1491 if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1492 need_stat(UNIQUE_TERMS);
1493 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1494 need_stat(DOC_LENGTH_MIN);
1497 std::string
name()
const;
1499 std::string serialise()
const;
1500 LMWeight * unserialise(
const std::string & serialised)
const;
1505 double get_maxpart()
const;
1508 double get_maxextra()
const;
1523 void init(
double factor_);
1528 std::string
name()
const;
1530 std::string serialise()
const;
1531 CoordWeight * unserialise(
const std::string & serialised)
const;
1536 double get_maxpart()
const;
1539 double get_maxextra()
const;
1544 #endif // XAPIAN_INCLUDED_WEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
type_smoothing select_smoothing
The type of smoothing to use.
double factor
The factor to multiply weights by.
Xapian::doccount termfreq_
The number of documents which this term indexes.
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Xapian::doccount get_collection_size() const
The number of documents in the collection.
std::string normalizations
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the shard.
double factor
The factor to multiply with the weight.
double upper_bound
The upper bound on the weight.
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
double param_c
The wdf normalization parameter in the formula.
double param_c
The wdf normalization parameter in the formula.
double param_c
The wdf normalization parameter in the formula.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the shard.
double stirling_constant_1
double upper_bound
The upper bound on the weight.
Xapian::doclength param_min_normlen
The minimum normalised document length value.
bool get_sumpart_needs_wdf_() const
double param_c
The wdf normalization parameter in the formula.
This class implements the InL2 weighting scheme.
double param_k
The parameter in the formula.
double dw
Weight contribution of delta term in the PL2+ function.
bool get_sumpart_needs_uniqueterms_() const
bool is_bool_weight_() const
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
bool get_sumpart_needs_doclength_() const
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
#define XAPIAN_VISIBILITY_DEFAULT
double upper_bound
The upper bound on the weight.
double lower_bound
The factor to multiply weights by.
DPHWeight()
Construct a DPHWeight.
Xapian::doccount collection_size_
The number of documents in the collection.
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term in the shard.
CoordWeight()
Construct a CoordWeight.
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
This class implements the BB2 weighting scheme.
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Xapian::termcount wqf_
The within-query-frequency of this term.
Xapian::Weight subclass implementing Coordinate Matching.
double param_c
The wdf normalization parameter in the formula.
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Class implementing a "boolean" weighting scheme.
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Define XAPIAN_VISIBILITY_* macros.
double doclength
A normalised document length.
double stirling_constant_2
Xapian::termcount collectionfreq_
The collection frequency of the term.
Xapian::doclength average_length_
The average length of a document in the collection.
double termweight
Factor combining all the document independent factors.
double mean
Set by init() to (get_collection_freq() / get_collection_size())
double termweight
Factor combining all the document independent factors.
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Xapian::termcount get_query_length() const
The length of the query.
Xapian::doclength len_factor
Factor to multiply the document length by.
BoolWeight()
Construct a BoolWeight.
Xapian::Weight subclass implementing the traditional probabilistic formula.
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence from Randomness framework.
This class implements the PL2 weighting scheme.
Xapian::doccount rset_size_
The number of documents marked as relevant.
LMWeight(double param_log_=0.0, type_smoothing select_smoothing_=TWO_STAGE_SMOOTHING, double param_smoothing1_=-1.0, double param_smoothing2_=-1.0)
Construct a LMWeight.
This class implements the IneB2 weighting scheme.
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
double upper_bound
The upper bound on the weight a term can give to a document.
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Xapian::termcount query_length_
The length of the query.
double upper_bound
The upper bound on the weight.
double log_constant
The constant value to be used in get_sumpart().
double wqf_product_factor
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the shard.
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
TradWeight(double k=1.0)
Construct a TradWeight.
This class implements the IfB2 weighting scheme.
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
double wqf_product_factor
Xapian::doclength get_average_length() const
The average length of a document in the collection.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
double termweight
Factor combining all the document independent factors.
double cl
Set by init() to (param_c * get_average_length())
Xapian::doclength len_factor
Factor to multiply the document length by.
double param_c
The wdf normalization parameter in the formula.
double cl
Set by init() to (param_c * get_average_length())
This class implements the DPH weighting scheme.
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Xapian::doclength len_factor
Factor to multiply the document length by.
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
double weight_collection
The factor to multiply weights by.
double upper_bound
The upper bound on the weight.
double log_constant
The constant value used in get_sumpart().
double c_product_avlen
The constant values to be used in get_sumpart().
double param_delta
Additional parameter delta in the BM25+ formula.
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Xapian::Weight subclass implementing the Language Model formula.
Weight()
Default constructor, needed by subclass constructors.
double upper_bound
The upper bound on the weight.
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
double wqf_product_idf
Constant values used in get_sumpart().
double upper_bound
The upper bound of the weight.
double factor
The factor to multiply weights by.
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Abstract base class for weighting schemes.