24 #ifndef XAPIAN_INCLUDED_WEIGHT_H 25 #define XAPIAN_INCLUDED_WEIGHT_H 60 DOC_LENGTH_MAX = 1024,
64 COLLECTION_FREQ = 4096,
71 TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
84 stats_needed = stat_flags(stats_needed | flag);
96 virtual void init(
double factor) = 0;
100 void operator=(
const Weight &);
148 TWO_STAGE_SMOOTHING = 1,
149 DIRICHLET_SMOOTHING = 2,
150 ABSOLUTE_DISCOUNT_SMOOTHING = 3,
151 JELINEK_MERCER_SMOOTHING = 4,
152 DIRICHLET_PLUS_SMOOTHING = 5
176 virtual Weight * clone()
const = 0;
191 virtual std::string
name()
const;
198 virtual std::string serialise()
const;
217 virtual Weight * unserialise(
const std::string & serialised)
const;
238 virtual double get_maxpart()
const = 0;
257 virtual double get_maxextra()
const = 0;
292 double factor,
void* postlist);
323 return stats_needed & DOC_LENGTH;
332 return stats_needed & WDF;
342 return stats_needed & UNIQUE_TERMS;
351 return stats_needed == 0 &&
name() ==
"Xapian::BoolWeight";
391 return doclength_upper_bound_;
401 return doclength_lower_bound_;
409 return wdf_upper_bound_;
425 void init(
double factor);
431 std::string
name()
const;
433 std::string serialise()
const;
434 BoolWeight * unserialise(
const std::string & serialised)
const;
439 double get_maxpart()
const;
443 double get_maxextra()
const;
457 void init(
double factor);
463 double get_wtn(
double wt,
char c)
const;
506 explicit TfIdfWeight(
const std::string &normalizations);
510 : normalizations(
"ntn")
515 need_stat(COLLECTION_SIZE);
518 std::string
name()
const;
520 std::string serialise()
const;
521 TfIdfWeight * unserialise(
const std::string & serialised)
const;
526 double get_maxpart()
const;
530 double get_maxextra()
const;
550 void init(
double factor);
580 BM25Weight(
double k1,
double k2,
double k3,
double b,
double min_normlen)
581 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
582 param_min_normlen(min_normlen)
584 if (param_k1 < 0) param_k1 = 0;
585 if (param_k2 < 0) param_k2 = 0;
586 if (param_k3 < 0) param_k3 = 0;
589 }
else if (param_b > 1) {
592 need_stat(COLLECTION_SIZE);
593 need_stat(RSET_SIZE);
595 need_stat(RELTERMFREQ);
598 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
599 need_stat(DOC_LENGTH_MIN);
600 need_stat(AVERAGE_LENGTH);
602 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
603 if (param_k2 != 0) need_stat(QUERY_LENGTH);
604 if (param_k3 != 0) need_stat(WQF);
608 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
609 param_min_normlen(0.5)
611 need_stat(COLLECTION_SIZE);
612 need_stat(RSET_SIZE);
614 need_stat(RELTERMFREQ);
617 need_stat(DOC_LENGTH_MIN);
618 need_stat(AVERAGE_LENGTH);
619 need_stat(DOC_LENGTH);
623 std::string
name()
const;
625 std::string serialise()
const;
626 BM25Weight * unserialise(
const std::string & serialised)
const;
631 double get_maxpart()
const;
635 double get_maxextra()
const;
657 void init(
double factor);
695 double min_normlen,
double delta)
696 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
697 param_min_normlen(min_normlen), param_delta(delta)
699 if (param_k1 < 0) param_k1 = 0;
700 if (param_k2 < 0) param_k2 = 0;
701 if (param_k3 < 0) param_k3 = 0;
702 if (param_delta < 0) param_delta = 0;
705 }
else if (param_b > 1) {
708 need_stat(COLLECTION_SIZE);
709 need_stat(RSET_SIZE);
711 need_stat(RELTERMFREQ);
714 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
715 need_stat(DOC_LENGTH_MIN);
716 need_stat(AVERAGE_LENGTH);
718 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
719 if (param_k2 != 0) need_stat(QUERY_LENGTH);
720 if (param_k3 != 0) need_stat(WQF);
724 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
725 param_min_normlen(0.5), param_delta(1)
727 need_stat(COLLECTION_SIZE);
728 need_stat(RSET_SIZE);
730 need_stat(RELTERMFREQ);
733 need_stat(DOC_LENGTH_MIN);
734 need_stat(AVERAGE_LENGTH);
735 need_stat(DOC_LENGTH);
739 std::string
name()
const;
741 std::string serialise()
const;
742 BM25PlusWeight * unserialise(
const std::string & serialised)
const;
747 double get_maxpart()
const;
751 double get_maxextra()
const;
775 void init(
double factor);
786 if (param_k < 0) param_k = 0;
787 if (param_k != 0.0) {
788 need_stat(AVERAGE_LENGTH);
789 need_stat(DOC_LENGTH);
791 need_stat(COLLECTION_SIZE);
792 need_stat(RSET_SIZE);
794 need_stat(RELTERMFREQ);
795 need_stat(DOC_LENGTH_MIN);
800 std::string
name()
const;
802 std::string serialise()
const;
803 TradWeight * unserialise(
const std::string & serialised)
const;
808 double get_maxpart()
const;
812 double get_maxextra()
const;
846 void init(
double factor);
862 need_stat(AVERAGE_LENGTH);
863 need_stat(DOC_LENGTH);
864 need_stat(DOC_LENGTH_MIN);
865 need_stat(DOC_LENGTH_MAX);
866 need_stat(COLLECTION_SIZE);
873 std::string
name()
const;
875 std::string serialise()
const;
876 InL2Weight * unserialise(
const std::string & serialised)
const;
881 double get_maxpart()
const;
885 double get_maxextra()
const;
918 void init(
double factor);
934 need_stat(AVERAGE_LENGTH);
935 need_stat(DOC_LENGTH);
936 need_stat(DOC_LENGTH_MIN);
937 need_stat(DOC_LENGTH_MAX);
938 need_stat(COLLECTION_SIZE);
939 need_stat(COLLECTION_FREQ);
946 std::string
name()
const;
948 std::string serialise()
const;
949 IfB2Weight * unserialise(
const std::string & serialised)
const;
954 double get_maxpart()
const;
958 double get_maxextra()
const;
991 void init(
double factor);
1005 need_stat(AVERAGE_LENGTH);
1006 need_stat(DOC_LENGTH);
1007 need_stat(DOC_LENGTH_MIN);
1008 need_stat(DOC_LENGTH_MAX);
1009 need_stat(COLLECTION_SIZE);
1013 need_stat(COLLECTION_FREQ);
1014 need_stat(TERMFREQ);
1017 std::string
name()
const;
1019 std::string serialise()
const;
1020 IneB2Weight * unserialise(
const std::string & serialised)
const;
1025 double get_maxpart()
const;
1029 double get_maxextra()
const;
1065 void init(
double factor);
1081 need_stat(AVERAGE_LENGTH);
1082 need_stat(DOC_LENGTH);
1083 need_stat(DOC_LENGTH_MIN);
1084 need_stat(DOC_LENGTH_MAX);
1085 need_stat(COLLECTION_SIZE);
1086 need_stat(COLLECTION_FREQ);
1090 need_stat(TERMFREQ);
1093 std::string
name()
const;
1095 std::string serialise()
const;
1096 BB2Weight * unserialise(
const std::string & serialised)
const;
1101 double get_maxpart()
const;
1105 double get_maxextra()
const;
1138 void init(
double factor);
1142 need_stat(DOC_LENGTH);
1143 need_stat(COLLECTION_FREQ);
1147 need_stat(DOC_LENGTH_MIN);
1148 need_stat(DOC_LENGTH_MAX);
1149 need_stat(TOTAL_LENGTH);
1152 std::string
name()
const;
1154 std::string serialise()
const;
1155 DLHWeight * unserialise(
const std::string & serialised)
const;
1160 double get_maxpart()
const;
1164 double get_maxextra()
const;
1209 void init(
double factor);
1225 need_stat(AVERAGE_LENGTH);
1226 need_stat(DOC_LENGTH);
1227 need_stat(DOC_LENGTH_MIN);
1228 need_stat(DOC_LENGTH_MAX);
1229 need_stat(COLLECTION_SIZE);
1230 need_stat(COLLECTION_FREQ);
1236 std::string
name()
const;
1238 std::string serialise()
const;
1239 PL2Weight * unserialise(
const std::string & serialised)
const;
1244 double get_maxpart()
const;
1248 double get_maxextra()
const;
1279 void init(
double factor_);
1302 : param_c(1.0), param_delta(0.8) {
1303 need_stat(AVERAGE_LENGTH);
1304 need_stat(DOC_LENGTH);
1305 need_stat(DOC_LENGTH_MIN);
1306 need_stat(DOC_LENGTH_MAX);
1307 need_stat(COLLECTION_SIZE);
1308 need_stat(COLLECTION_FREQ);
1314 std::string
name()
const;
1316 std::string serialise()
const;
1317 PL2PlusWeight * unserialise(
const std::string & serialised)
const;
1322 double get_maxpart()
const;
1326 double get_maxextra()
const;
1361 void init(
double factor);
1366 need_stat(DOC_LENGTH);
1367 need_stat(COLLECTION_FREQ);
1371 need_stat(DOC_LENGTH_MIN);
1372 need_stat(DOC_LENGTH_MAX);
1373 need_stat(TOTAL_LENGTH);
1376 std::string
name()
const;
1378 std::string serialise()
const;
1379 DPHWeight * unserialise(
const std::string & serialised)
const;
1384 double get_maxpart()
const;
1388 double get_maxextra()
const;
1421 void init(
double factor);
1457 type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1458 double param_smoothing1_ = -1.0,
1459 double param_smoothing2_ = -1.0)
1460 : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1461 param_smoothing2(param_smoothing2_)
1463 if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1464 if (param_smoothing2 < 0) {
1465 if (select_smoothing == TWO_STAGE_SMOOTHING)
1466 param_smoothing2 = 2000.0;
1468 param_smoothing2 = 0.05;
1470 need_stat(DOC_LENGTH);
1471 need_stat(RSET_SIZE);
1472 need_stat(TERMFREQ);
1473 need_stat(RELTERMFREQ);
1474 need_stat(DOC_LENGTH_MAX);
1477 need_stat(COLLECTION_FREQ);
1478 need_stat(TOTAL_LENGTH);
1479 if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1480 need_stat(UNIQUE_TERMS);
1481 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1482 need_stat(DOC_LENGTH_MIN);
1485 std::string
name()
const;
1487 std::string serialise()
const;
1488 LMWeight * unserialise(
const std::string & serialised)
const;
1493 double get_maxpart()
const;
1496 double get_maxextra()
const;
1511 void init(
double factor_);
1516 std::string
name()
const;
1518 std::string serialise()
const;
1519 CoordWeight * unserialise(
const std::string & serialised)
const;
1524 double get_maxpart()
const;
1527 double get_maxextra()
const;
1532 #endif // XAPIAN_INCLUDED_WEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
type_smoothing select_smoothing
The type of smoothing to use.
double factor
The factor to multiply weights by.
Xapian::doccount termfreq_
The number of documents which this term indexes.
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Xapian::doccount get_collection_size() const
The number of documents in the collection.
std::string normalizations
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the database.
double factor
The factor to multiply with the weight.
double upper_bound
The upper bound on the weight.
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
double param_c
The wdf normalization parameter in the formula.
double param_c
The wdf normalization parameter in the formula.
double param_c
The wdf normalization parameter in the formula.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the database.
double stirling_constant_1
double upper_bound
The upper bound on the weight.
Xapian::doclength param_min_normlen
The minimum normalised document length value.
bool get_sumpart_needs_wdf_() const
double param_c
The wdf normalization parameter in the formula.
This class implements the InL2 weighting scheme.
double param_k
The parameter in the formula.
double dw
Weight contribution of delta term in the PL2+ function.
bool get_sumpart_needs_uniqueterms_() const
bool is_bool_weight_() const
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
bool get_sumpart_needs_doclength_() const
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
#define XAPIAN_VISIBILITY_DEFAULT
double upper_bound
The upper bound on the weight.
double lower_bound
The factor to multiply weights by.
DPHWeight()
Construct a DPHWeight.
Xapian::doccount collection_size_
The number of documents in the collection.
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term.
CoordWeight()
Construct a CoordWeight.
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
This class implements the BB2 weighting scheme.
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Xapian::termcount wqf_
The within-query-frequency of this term.
Xapian::Weight subclass implementing Coordinate Matching.
double param_c
The wdf normalization parameter in the formula.
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Class implementing a "boolean" weighting scheme.
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Define XAPIAN_VISIBILITY_* macros.
double doclength
A normalised document length.
double stirling_constant_2
Xapian::termcount collectionfreq_
Class to hold statistics for a given collection.
Xapian::doclength average_length_
The average length of a document in the collection.
double termweight
Factor combining all the document independent factors.
double mean
Set by init() to (get_collection_freq() / get_collection_size())
double termweight
Factor combining all the document independent factors.
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Xapian::termcount get_query_length() const
The length of the query.
Xapian::doclength len_factor
Factor to multiply the document length by.
BoolWeight()
Construct a BoolWeight.
Xapian::Weight subclass implementing the traditional probabilistic formula.
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence from Randomness framework.
This class implements the PL2 weighting scheme.
Xapian::doccount rset_size_
The number of documents marked as relevant.
LMWeight(double param_log_=0.0, type_smoothing select_smoothing_=TWO_STAGE_SMOOTHING, double param_smoothing1_=-1.0, double param_smoothing2_=-1.0)
Construct a LMWeight.
This class implements the IneB2 weighting scheme.
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
double upper_bound
The upper bound on the weight a term can give to a document.
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Xapian::termcount query_length_
The length of the query.
double upper_bound
The upper bound on the weight.
double log_constant
The constant value to be used in get_sumpart().
double wqf_product_factor
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the database.
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
TradWeight(double k=1.0)
Construct a TradWeight.
This class implements the IfB2 weighting scheme.
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
double wqf_product_factor
Xapian::doclength get_average_length() const
The average length of a document in the collection.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
double termweight
Factor combining all the document independent factors.
double cl
Set by init() to (param_c * get_average_length())
Xapian::doclength len_factor
Factor to multiply the document length by.
double param_c
The wdf normalization parameter in the formula.
double cl
Set by init() to (param_c * get_average_length())
This class implements the DPH weighting scheme.
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Xapian::doclength len_factor
Factor to multiply the document length by.
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
double weight_collection
The factor to multiply weights by.
double upper_bound
The upper bound on the weight.
double log_constant
The constant value used in get_sumpart().
double c_product_avlen
The constant values to be used in get_sumpart().
double param_delta
Additional parameter delta in the BM25+ formula.
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Xapian::Weight subclass implementing the Language Model formula.
Weight()
Default constructor, needed by subclass constructors.
double upper_bound
The upper bound on the weight.
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term.
double wqf_product_idf
Constant values used in get_sumpart().
double upper_bound
The upper bound of the weight.
double factor
The factor to multiply weights by.
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Abstract base class for weighting schemes.