24 #ifndef XAPIAN_INCLUDED_WEIGHT_H 25 #define XAPIAN_INCLUDED_WEIGHT_H 60 DOC_LENGTH_MAX = 1024,
64 COLLECTION_FREQ = 4096,
71 TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
84 stats_needed = stat_flags(stats_needed | flag);
96 virtual void init(
double factor) = 0;
100 void operator=(
const Weight &);
148 TWO_STAGE_SMOOTHING = 1,
149 DIRICHLET_SMOOTHING = 2,
150 ABSOLUTE_DISCOUNT_SMOOTHING = 3,
151 JELINEK_MERCER_SMOOTHING = 4,
152 DIRICHLET_PLUS_SMOOTHING = 5
176 virtual Weight * clone()
const = 0;
191 virtual std::string
name()
const;
198 virtual std::string serialise()
const;
217 virtual Weight * unserialise(
const std::string & serialised)
const;
238 virtual double get_maxpart()
const = 0;
257 virtual double get_maxextra()
const = 0;
292 double factor,
void* postlist);
323 return stats_needed & DOC_LENGTH;
332 return stats_needed & WDF;
342 return stats_needed & UNIQUE_TERMS;
351 return stats_needed == 0 &&
name() ==
"Xapian::BoolWeight";
391 return doclength_upper_bound_;
401 return doclength_lower_bound_;
409 return wdf_upper_bound_;
425 void init(
double factor);
431 std::string
name()
const;
433 std::string serialise()
const;
434 BoolWeight * unserialise(
const std::string & serialised)
const;
439 double get_maxpart()
const;
443 double get_maxextra()
const;
457 void init(
double factor);
463 double get_wtn(
double wt,
char c)
const;
506 explicit TfIdfWeight(
const std::string &normalizations);
510 : normalizations(
"ntn")
515 need_stat(COLLECTION_SIZE);
518 std::string
name()
const;
520 std::string serialise()
const;
521 TfIdfWeight * unserialise(
const std::string & serialised)
const;
526 double get_maxpart()
const;
530 double get_maxextra()
const;
550 void init(
double factor);
580 BM25Weight(
double k1,
double k2,
double k3,
double b,
double min_normlen)
581 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
582 param_min_normlen(min_normlen)
584 if (param_k1 < 0) param_k1 = 0;
585 if (param_k2 < 0) param_k2 = 0;
586 if (param_k3 < 0) param_k3 = 0;
589 }
else if (param_b > 1) {
592 need_stat(COLLECTION_SIZE);
593 need_stat(RSET_SIZE);
595 need_stat(RELTERMFREQ);
598 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
599 need_stat(DOC_LENGTH_MIN);
600 need_stat(AVERAGE_LENGTH);
602 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
603 if (param_k2 != 0) need_stat(QUERY_LENGTH);
604 if (param_k3 != 0) need_stat(WQF);
608 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
609 param_min_normlen(0.5)
611 need_stat(COLLECTION_SIZE);
612 need_stat(RSET_SIZE);
614 need_stat(RELTERMFREQ);
617 need_stat(DOC_LENGTH_MIN);
618 need_stat(AVERAGE_LENGTH);
619 need_stat(DOC_LENGTH);
623 std::string
name()
const;
625 std::string serialise()
const;
626 BM25Weight * unserialise(
const std::string & serialised)
const;
631 double get_maxpart()
const;
635 double get_maxextra()
const;
657 void init(
double factor);
695 double min_normlen,
double delta)
696 : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
697 param_min_normlen(min_normlen), param_delta(delta)
699 if (param_k1 < 0) param_k1 = 0;
700 if (param_k2 < 0) param_k2 = 0;
701 if (param_k3 < 0) param_k3 = 0;
702 if (param_delta < 0) param_delta = 0;
705 }
else if (param_b > 1) {
708 need_stat(COLLECTION_SIZE);
709 need_stat(RSET_SIZE);
711 need_stat(RELTERMFREQ);
714 if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
715 need_stat(DOC_LENGTH_MIN);
716 need_stat(AVERAGE_LENGTH);
718 if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
719 if (param_k2 != 0) need_stat(QUERY_LENGTH);
720 if (param_k3 != 0) need_stat(WQF);
721 if (param_delta != 0) {
722 need_stat(AVERAGE_LENGTH);
723 need_stat(DOC_LENGTH);
729 : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
730 param_min_normlen(0.5), param_delta(1)
732 need_stat(COLLECTION_SIZE);
733 need_stat(RSET_SIZE);
735 need_stat(RELTERMFREQ);
738 need_stat(DOC_LENGTH_MIN);
739 need_stat(AVERAGE_LENGTH);
740 need_stat(DOC_LENGTH);
744 std::string
name()
const;
746 std::string serialise()
const;
747 BM25PlusWeight * unserialise(
const std::string & serialised)
const;
752 double get_maxpart()
const;
756 double get_maxextra()
const;
780 void init(
double factor);
791 if (param_k < 0) param_k = 0;
792 if (param_k != 0.0) {
793 need_stat(AVERAGE_LENGTH);
794 need_stat(DOC_LENGTH);
796 need_stat(COLLECTION_SIZE);
797 need_stat(RSET_SIZE);
799 need_stat(RELTERMFREQ);
800 need_stat(DOC_LENGTH_MIN);
805 std::string
name()
const;
807 std::string serialise()
const;
808 TradWeight * unserialise(
const std::string & serialised)
const;
813 double get_maxpart()
const;
817 double get_maxextra()
const;
851 void init(
double factor);
867 need_stat(AVERAGE_LENGTH);
868 need_stat(DOC_LENGTH);
869 need_stat(DOC_LENGTH_MIN);
870 need_stat(DOC_LENGTH_MAX);
871 need_stat(COLLECTION_SIZE);
878 std::string
name()
const;
880 std::string serialise()
const;
881 InL2Weight * unserialise(
const std::string & serialised)
const;
886 double get_maxpart()
const;
890 double get_maxextra()
const;
923 void init(
double factor);
939 need_stat(AVERAGE_LENGTH);
940 need_stat(DOC_LENGTH);
941 need_stat(DOC_LENGTH_MIN);
942 need_stat(DOC_LENGTH_MAX);
943 need_stat(COLLECTION_SIZE);
944 need_stat(COLLECTION_FREQ);
951 std::string
name()
const;
953 std::string serialise()
const;
954 IfB2Weight * unserialise(
const std::string & serialised)
const;
959 double get_maxpart()
const;
963 double get_maxextra()
const;
996 void init(
double factor);
1010 need_stat(AVERAGE_LENGTH);
1011 need_stat(DOC_LENGTH);
1012 need_stat(DOC_LENGTH_MIN);
1013 need_stat(DOC_LENGTH_MAX);
1014 need_stat(COLLECTION_SIZE);
1018 need_stat(COLLECTION_FREQ);
1019 need_stat(TERMFREQ);
1022 std::string
name()
const;
1024 std::string serialise()
const;
1025 IneB2Weight * unserialise(
const std::string & serialised)
const;
1030 double get_maxpart()
const;
1034 double get_maxextra()
const;
1070 void init(
double factor);
1086 need_stat(AVERAGE_LENGTH);
1087 need_stat(DOC_LENGTH);
1088 need_stat(DOC_LENGTH_MIN);
1089 need_stat(DOC_LENGTH_MAX);
1090 need_stat(COLLECTION_SIZE);
1091 need_stat(COLLECTION_FREQ);
1095 need_stat(TERMFREQ);
1098 std::string
name()
const;
1100 std::string serialise()
const;
1101 BB2Weight * unserialise(
const std::string & serialised)
const;
1106 double get_maxpart()
const;
1110 double get_maxextra()
const;
1143 void init(
double factor);
1147 need_stat(DOC_LENGTH);
1148 need_stat(COLLECTION_FREQ);
1152 need_stat(DOC_LENGTH_MIN);
1153 need_stat(DOC_LENGTH_MAX);
1154 need_stat(TOTAL_LENGTH);
1157 std::string
name()
const;
1159 std::string serialise()
const;
1160 DLHWeight * unserialise(
const std::string & serialised)
const;
1165 double get_maxpart()
const;
1169 double get_maxextra()
const;
1214 void init(
double factor);
1230 need_stat(AVERAGE_LENGTH);
1231 need_stat(DOC_LENGTH);
1232 need_stat(DOC_LENGTH_MIN);
1233 need_stat(DOC_LENGTH_MAX);
1234 need_stat(COLLECTION_SIZE);
1235 need_stat(COLLECTION_FREQ);
1241 std::string
name()
const;
1243 std::string serialise()
const;
1244 PL2Weight * unserialise(
const std::string & serialised)
const;
1249 double get_maxpart()
const;
1253 double get_maxextra()
const;
1284 void init(
double factor_);
1307 : param_c(1.0), param_delta(0.8) {
1308 need_stat(AVERAGE_LENGTH);
1309 need_stat(DOC_LENGTH);
1310 need_stat(DOC_LENGTH_MIN);
1311 need_stat(DOC_LENGTH_MAX);
1312 need_stat(COLLECTION_SIZE);
1313 need_stat(COLLECTION_FREQ);
1319 std::string
name()
const;
1321 std::string serialise()
const;
1322 PL2PlusWeight * unserialise(
const std::string & serialised)
const;
1327 double get_maxpart()
const;
1331 double get_maxextra()
const;
1366 void init(
double factor);
1371 need_stat(DOC_LENGTH);
1372 need_stat(COLLECTION_FREQ);
1376 need_stat(DOC_LENGTH_MIN);
1377 need_stat(DOC_LENGTH_MAX);
1378 need_stat(TOTAL_LENGTH);
1381 std::string
name()
const;
1383 std::string serialise()
const;
1384 DPHWeight * unserialise(
const std::string & serialised)
const;
1389 double get_maxpart()
const;
1393 double get_maxextra()
const;
1426 void init(
double factor);
1462 type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1463 double param_smoothing1_ = -1.0,
1464 double param_smoothing2_ = -1.0)
1465 : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1466 param_smoothing2(param_smoothing2_)
1468 if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1469 if (param_smoothing2 < 0) {
1470 if (select_smoothing == TWO_STAGE_SMOOTHING)
1471 param_smoothing2 = 2000.0;
1473 param_smoothing2 = 0.05;
1475 need_stat(DOC_LENGTH);
1476 need_stat(RSET_SIZE);
1477 need_stat(TERMFREQ);
1478 need_stat(RELTERMFREQ);
1479 need_stat(DOC_LENGTH_MAX);
1482 need_stat(COLLECTION_FREQ);
1483 need_stat(TOTAL_LENGTH);
1484 if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1485 need_stat(UNIQUE_TERMS);
1486 if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1487 need_stat(DOC_LENGTH_MIN);
1490 std::string
name()
const;
1492 std::string serialise()
const;
1493 LMWeight * unserialise(
const std::string & serialised)
const;
1498 double get_maxpart()
const;
1501 double get_maxextra()
const;
1516 void init(
double factor_);
1521 std::string
name()
const;
1523 std::string serialise()
const;
1524 CoordWeight * unserialise(
const std::string & serialised)
const;
1529 double get_maxpart()
const;
1532 double get_maxextra()
const;
1537 #endif // XAPIAN_INCLUDED_WEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
type_smoothing select_smoothing
The type of smoothing to use.
double factor
The factor to multiply weights by.
Xapian::doccount termfreq_
The number of documents which this term indexes.
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Xapian::doccount get_collection_size() const
The number of documents in the collection.
std::string normalizations
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the database.
double factor
The factor to multiply with the weight.
double upper_bound
The upper bound on the weight.
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
double param_c
The wdf normalization parameter in the formula.
double param_c
The wdf normalization parameter in the formula.
double param_c
The wdf normalization parameter in the formula.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the database.
double stirling_constant_1
double upper_bound
The upper bound on the weight.
Xapian::doclength param_min_normlen
The minimum normalised document length value.
bool get_sumpart_needs_wdf_() const
double param_c
The wdf normalization parameter in the formula.
This class implements the InL2 weighting scheme.
double param_k
The parameter in the formula.
double dw
Weight contribution of delta term in the PL2+ function.
bool get_sumpart_needs_uniqueterms_() const
bool is_bool_weight_() const
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
bool get_sumpart_needs_doclength_() const
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
#define XAPIAN_VISIBILITY_DEFAULT
double upper_bound
The upper bound on the weight.
double lower_bound
The factor to multiply weights by.
DPHWeight()
Construct a DPHWeight.
Xapian::doccount collection_size_
The number of documents in the collection.
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term.
CoordWeight()
Construct a CoordWeight.
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
This class implements the BB2 weighting scheme.
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Xapian::termcount wqf_
The within-query-frequency of this term.
Xapian::Weight subclass implementing Coordinate Matching.
double param_c
The wdf normalization parameter in the formula.
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Class implementing a "boolean" weighting scheme.
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Define XAPIAN_VISIBILITY_* macros.
double doclength
A normalised document length.
double stirling_constant_2
Xapian::termcount collectionfreq_
Class to hold statistics for a given collection.
Xapian::doclength average_length_
The average length of a document in the collection.
double termweight
Factor combining all the document independent factors.
double mean
Set by init() to get_collection_freq() / get_collection_size()
double termweight
Factor combining all the document independent factors.
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Xapian::termcount get_query_length() const
The length of the query.
Xapian::doclength len_factor
Factor to multiply the document length by.
BoolWeight()
Construct a BoolWeight.
Xapian::Weight subclass implementing the traditional probabilistic formula.
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence from Randomness framework.
This class implements the PL2 weighting scheme.
Xapian::doccount rset_size_
The number of documents marked as relevant.
LMWeight(double param_log_=0.0, type_smoothing select_smoothing_=TWO_STAGE_SMOOTHING, double param_smoothing1_=-1.0, double param_smoothing2_=-1.0)
Construct a LMWeight.
This class implements the IneB2 weighting scheme.
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
double upper_bound
The upper bound on the weight a term can give to a document.
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Xapian::termcount query_length_
The length of the query.
double upper_bound
The upper bound on the weight.
double log_constant
The constant value to be used in get_sumpart().
double wqf_product_factor
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the database.
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
TradWeight(double k=1.0)
Construct a TradWeight.
This class implements the IfB2 weighting scheme.
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
double wqf_product_factor
Xapian::doclength get_average_length() const
The average length of a document in the collection.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
double termweight
Factor combining all the document independent factors.
double cl
Set by init() to (param_c * get_average_length())
Xapian::doclength len_factor
Factor to multiply the document length by.
double param_c
The wdf normalization parameter in the formula.
double cl
Set by init() to (param_c * get_average_length())
This class implements the DPH weighting scheme.
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Xapian::doclength len_factor
Factor to multiply the document length by.
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
double weight_collection
The factor to multiply weights by.
double upper_bound
The upper bound on the weight.
double log_constant
The constant value used in get_sumpart().
double c_product_avlen
The constant values to be used in get_sumpart().
double param_delta
Additional parameter delta in the BM25+ formula.
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Xapian::Weight subclass implementing the Language Model formula.
Weight()
Default constructor, needed by subclass constructors.
double upper_bound
The upper bound on the weight.
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term.
double wqf_product_idf
Constant values used in get_sumpart().
double upper_bound
The upper bound of the weight.
double factor
The factor to multiply weights by.
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Abstract base class for weighting schemes.