xapian-core  2.0.0
weight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2024 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  * Copyright (C) 2013,2014 Aarsh Shah
7  * Copyright (C) 2016,2017 Vivek Pal
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, see
21  * <https://www.gnu.org/licenses/>.
22  */
23 
24 #ifndef XAPIAN_INCLUDED_WEIGHT_H
25 #define XAPIAN_INCLUDED_WEIGHT_H
26 
27 #include <string>
28 
29 #include <xapian/database.h>
30 #include <xapian/deprecated.h>
31 #include <xapian/registry.h>
32 #include <xapian/types.h>
33 #include <xapian/visibility.h>
34 
35 namespace Xapian {
36 
39  protected:
/// Bit flags identifying the statistics a weighting scheme may request via
/// need_stat(), which ORs the chosen flags into stats_needed.
///
/// Several enumerators share a value: COLLECTION_SIZE, RSET_SIZE,
/// QUERY_LENGTH and WQF are all 0, so requesting them sets no bit, while
/// TERMFREQ, RELTERMFREQ and COLLECTION_FREQ all map to bit value 1.
/// NOTE(review): presumably the zero-valued statistics are always made
/// available regardless of need_stat() - confirm against the implementation.
41  typedef enum {
43  COLLECTION_SIZE = 0,
45  RSET_SIZE = 0,
47  AVERAGE_LENGTH = 4,
49  TERMFREQ = 1,
51  RELTERMFREQ = 1,
53  QUERY_LENGTH = 0,
55  WQF = 0,
57  WDF = 2,
59  DOC_LENGTH = 8,
65  DOC_LENGTH_MIN = 16,
73  DOC_LENGTH_MAX = 32,
81  WDF_MAX = 64,
83  COLLECTION_FREQ = 1,
85  UNIQUE_TERMS = 128,
89  TOTAL_LENGTH = 256,
94  WDF_DOC_MAX = 512,
104  UNIQUE_TERMS_MIN = 1024,
114  UNIQUE_TERMS_MAX = 2048,
121  DB_DOC_LENGTH_MIN = 4096,
128  DB_DOC_LENGTH_MAX = 8192,
135  DB_UNIQUE_TERMS_MIN = 16384,
142  DB_UNIQUE_TERMS_MAX = 32768,
149  DB_WDF_MAX = 65536,
/// Internal marker (bit 0x80000000) rather than a statistic: it is the flag
/// tested by is_bool_weight_().
153  IS_BOOLWEIGHT_ = static_cast<int>(0x80000000)
154  } stat_flags;
155 
/// Record that this weighting scheme needs the statistic(s) named by @a flag.
///
/// The flag is OR-ed into stats_needed, so repeated calls accumulate and
/// passing a zero-valued flag leaves stats_needed unchanged.
///
/// @param flag  One of the stat_flags enumerators.
183  void need_stat(stat_flags flag) {
184  stats_needed = stat_flags(stats_needed | flag);
185  }
186 
196  virtual void init(double factor) = 0;
197 
198  private:
200  void operator=(const Weight &);
201 
204 
207 
210 
213 
216 
217  // The collection frequency of the term.
219 
222 
225 
228 
231 
234 
237 
240 
245 
250 
253 
256 
259 
264 
269 
270  public:
271 
273  Weight() : stats_needed() { }
274 
275  class Internal;
276 
278  virtual ~Weight();
279 
296  virtual Weight * clone() const = 0;
297 
321  virtual std::string name() const;
322 
328  virtual std::string serialise() const;
329 
347  virtual Weight * unserialise(const std::string & serialised) const;
348 
374  virtual double get_sumpart(Xapian::termcount wdf,
375  Xapian::termcount doclen,
376  Xapian::termcount uniqterms,
377  Xapian::termcount wdfdocmax) const = 0;
378 
384  virtual double get_maxpart() const = 0;
385 
408  virtual double get_sumextra(Xapian::termcount doclen,
409  Xapian::termcount uniqterms,
410  Xapian::termcount wdfdocmax) const;
411 
421  virtual double get_maxextra() const;
422 
437  void init_(const Internal & stats, Xapian::termcount query_len_,
438  const std::string & term, Xapian::termcount wqf_,
439  double factor,
440  const Xapian::Database::Internal* shard,
441  void* postlist);
442 
454  void init_(const Internal & stats, Xapian::termcount query_len_,
455  double factor, Xapian::doccount termfreq,
456  Xapian::doccount reltermfreq, Xapian::termcount collection_freq,
457  const Xapian::Database::Internal* shard);
458 
466  void init_(const Internal & stats, Xapian::termcount query_len_,
467  const Xapian::Database::Internal* shard);
468 
476  return stats_needed & DOC_LENGTH;
477  }
478 
/// Return true if the WDF flag is set in stats_needed (i.e. it has been
/// requested with need_stat(WDF)).
///
/// NOTE(review): the name suggests callers use this to decide whether a real
/// wdf value must be supplied to get_sumpart() - confirm against callers.
484  bool get_sumpart_needs_wdf_() const {
485  return stats_needed & WDF;
486  }
487 
495  return stats_needed & UNIQUE_TERMS;
496  }
497 
508  static const Weight * create(const std::string & scheme,
509  const Registry & reg = Registry());
510 
517  virtual Weight * create_from_parameters(const char * params) const;
518 
/// Return true if the IS_BOOLWEIGHT_ marker bit is set in stats_needed.
///
/// The result depends only on stats_needed, so it does not require init()
/// to have been called first.
520  bool is_bool_weight_() const {
521  // We use a special flag bit to make this check efficient. Note we
522  // can't use (get_maxpart() == 0.0) since that's not required to work
523  // without init() having been called.
524  return stats_needed & IS_BOOLWEIGHT_;
525  }
526 
536  return stats_needed & WDF_DOC_MAX;
537  }
538 
539  protected:
546  Weight(const Weight &);
547 
/// Read-only accessors for statistics stored on the Weight object. Each one
/// simply returns the data member of the matching name.
/// NOTE(review): presumably each value is only meaningful when the
/// corresponding stat_flags entry was requested via need_stat() - confirm.

/// Return the stored collection_size_ value.
549  Xapian::doccount get_collection_size() const { return collection_size_; }
550 
/// Return the stored rset_size_ value.
552  Xapian::doccount get_rset_size() const { return rset_size_; }
553 
/// Return the stored average_length_ value.
555  Xapian::doclength get_average_length() const { return average_length_; }
556 
/// Return the stored termfreq_ value.
558  Xapian::doccount get_termfreq() const { return termfreq_; }
559 
/// Return the stored reltermfreq_ value.
561  Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
562 
/// Return the stored collectionfreq_ value (the collection frequency of the
/// term).
564  Xapian::termcount get_collection_freq() const { return collectionfreq_; }
565 
/// Return the stored query_length_ value.
567  Xapian::termcount get_query_length() const { return query_length_; }
568 
/// Return the stored wqf_ value.
570  Xapian::termcount get_wqf() const { return wqf_; }
571 
577  return doclength_upper_bound_;
578  }
579 
587  return doclength_lower_bound_;
588  }
589 
595  return wdf_upper_bound_;
596  }
597 
600  return total_length_;
601  }
602 
613  return unique_terms_upper_bound_;
614  }
615 
624  return unique_terms_lower_bound_;
625  }
626 
632  return db_doclength_upper_bound_;
633  }
634 
642  return db_doclength_lower_bound_;
643  }
644 
653  return db_unique_terms_upper_bound_;
654  }
655 
662  return db_unique_terms_lower_bound_;
663  }
664 
670  return db_wdf_upper_bound_;
671  }
672 };
673 
679  BoolWeight * clone() const;
680 
681  void init(double factor);
682 
683  public:
686  need_stat(IS_BOOLWEIGHT_);
687  }
688 
689  std::string name() const;
690 
691  std::string serialise() const;
692  BoolWeight * unserialise(const std::string & serialised) const;
693 
694  double get_sumpart(Xapian::termcount wdf,
695  Xapian::termcount doclen,
696  Xapian::termcount uniqterms,
697  Xapian::termcount wdfdocmax) const;
698  double get_maxpart() const;
699 
700  BoolWeight * create_from_parameters(const char * params) const;
701 };
702 
705  public:
/// Selects the wdf normalization; this is the type of the
/// wdf_normalization parameter taken by the enum-based TfIdfWeight
/// constructors and by get_wdfn().
///
/// Stored in an unsigned char; values start at 1 and are contiguous.
/// NOTE(review): the numeric values may be relied upon by serialise() /
/// unserialise() - confirm before renumbering.
710  enum class wdf_norm : unsigned char {
715  NONE = 1,
716 
721  BOOLEAN = 2,
722 
727  SQUARE = 3,
728 
733  LOG = 4,
734 
740  PIVOTED = 5,
741 
747  LOG_AVERAGE = 6,
748 
753  AUG_LOG = 7,
754 
759  SQRT = 8,
760 
765  AUG_AVERAGE = 9,
766 
771  MAX = 10,
772 
777  AUG = 11
778  };
779 
/// Selects the idf normalization; this is the type of the
/// idf_normalization parameter taken by the enum-based TfIdfWeight
/// constructors and by get_idfn().
///
/// Stored in an unsigned char; values start at 1 and are contiguous.
/// NOTE(review): the numeric values may be relied upon by serialise() /
/// unserialise() - confirm before renumbering.
784  enum class idf_norm : unsigned char {
789  NONE = 1,
790 
797  TFIDF = 2,
798 
803  SQUARE = 3,
804 
809  FREQ = 4,
810 
815  PROB = 5,
816 
821  PIVOTED = 6,
822 
827  GLOBAL_FREQ = 7,
828 
833  LOG_GLOBAL_FREQ = 8,
834 
839  INCREMENTED_GLOBAL_FREQ = 9,
840 
845  SQRT_GLOBAL_FREQ = 10
846  };
847 
/// Selects the normalization applied to the final weight; this is the type
/// of the wt_normalization parameter taken by the enum-based TfIdfWeight
/// constructors and by get_wtn().
///
/// NONE is currently the only enumerator.
852  enum class wt_norm : unsigned char {
857  NONE = 1
858  };
859  private:
866 
868  double wqf_factor;
869 
871  double idfn;
872 
874  double param_slope, param_delta;
875 
876  TfIdfWeight * clone() const;
877 
878  void init(double factor);
879 
880  /* When additional normalizations are implemented in the future, the additional statistics for them
881  should be accessed by these functions. */
882  double get_wdfn(Xapian::termcount wdf,
883  Xapian::termcount len,
884  Xapian::termcount uniqterms,
885  Xapian::termcount wdfdocmax,
886  wdf_norm wdf_normalization) const;
887  double get_idfn(idf_norm idf_normalization) const;
888  double get_wtn(double wt, wt_norm wt_normalization) const;
889 
890  public:
/// Construct a TfIdfWeight from a normalization string, using the default
/// slope and delta.
///
/// Delegates to the (normalizations, slope, delta) constructor with
/// slope = 0.2 and delta = 1.0.
///
/// @param normalizations  String selecting the normalizations; it is passed
///                        through unchanged to the delegated constructor.
936  explicit TfIdfWeight(const std::string& normalizations)
937  : TfIdfWeight(normalizations, 0.2, 1.0) {}
938 
985  TfIdfWeight(const std::string &normalizations, double slope, double delta);
986 
/// Construct a TfIdfWeight from explicit normalization enumerators, using
/// the default slope and delta.
///
/// Delegates to the five-argument constructor with slope = 0.2 and
/// delta = 1.0.
///
/// @param wdf_normalization  The wdf normalization to use.
/// @param idf_normalization  The idf normalization to use.
/// @param wt_normalization   The weight normalization to use.
998  TfIdfWeight(wdf_norm wdf_normalization,
999  idf_norm idf_normalization,
1000  wt_norm wt_normalization)
1001  : TfIdfWeight(wdf_normalization, idf_normalization,
1002  wt_normalization, 0.2, 1.0) {}
1003 
1019  TfIdfWeight(wdf_norm wdf_norm_, idf_norm idf_norm_,
1020  wt_norm wt_norm_, double slope, double delta);
1021 
1024  : wdf_norm_(wdf_norm::NONE), idf_norm_(idf_norm::TFIDF),
1025  wt_norm_(wt_norm::NONE), param_slope(0.2), param_delta(1.0)
1026  {
1027  need_stat(WQF);
1028  need_stat(TERMFREQ);
1029  need_stat(WDF);
1030  need_stat(WDF_MAX);
1031  need_stat(COLLECTION_SIZE);
1032  }
1033 
1034  std::string name() const;
1035 
1036  std::string serialise() const;
1037  TfIdfWeight * unserialise(const std::string & serialised) const;
1038 
1039  double get_sumpart(Xapian::termcount wdf,
1040  Xapian::termcount doclen,
1041  Xapian::termcount uniqterm,
1042  Xapian::termcount wdfdocmax) const;
1043  double get_maxpart() const;
1044 
1045  TfIdfWeight * create_from_parameters(const char * params) const;
1046 };
1047 
1048 
1053 
1055  mutable double termweight;
1056 
1058  double param_k1, param_k2, param_k3, param_b;
1059 
1062 
1063  BM25Weight * clone() const;
1064 
1065  void init(double factor);
1066 
1067  public:
/// Construct a BM25Weight.
///
/// Out-of-range parameters are clamped rather than rejected.
///
/// @param k1           Stored as param_k1; negative values become 0.
/// @param k2           Stored as param_k2; negative values become 0.
/// @param k3           Stored as param_k3; negative values become 0.
/// @param b            Stored as param_b; clamped into the range [0, 1].
/// @param min_normlen  Stored as param_min_normlen (the minimum normalised
///                     document length value) without any clamping.
1095  BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
1096  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
1097  param_min_normlen(min_normlen)
1098  {
1099  if (param_k1 < 0) param_k1 = 0;
1100  if (param_k2 < 0) param_k2 = 0;
1101  if (param_k3 < 0) param_k3 = 0;
1102  if (param_b < 0) {
1103  param_b = 0;
1104  } else if (param_b > 1) {
1105  param_b = 1;
1106  }
// These statistics are requested whatever the parameter values are.
1107  need_stat(COLLECTION_SIZE);
1108  need_stat(RSET_SIZE);
1109  need_stat(TERMFREQ);
1110  need_stat(RELTERMFREQ);
1111  need_stat(WDF);
1112  need_stat(WDF_MAX);
// The remaining statistics are requested only for the (clamped) parameter
// combinations tested below.
1113  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
1114  need_stat(DOC_LENGTH_MIN);
1115  need_stat(AVERAGE_LENGTH);
1116  }
1117  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
1118  if (param_k2 != 0) {
1119  need_stat(DOC_LENGTH);
1120  need_stat(QUERY_LENGTH);
1121  }
1122  if (param_k3 != 0) need_stat(WQF);
1123  }
1124 
1126  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
1127  param_min_normlen(0.5)
1128  {
1129  need_stat(COLLECTION_SIZE);
1130  need_stat(RSET_SIZE);
1131  need_stat(TERMFREQ);
1132  need_stat(RELTERMFREQ);
1133  need_stat(WDF);
1134  need_stat(WDF_MAX);
1135  need_stat(DOC_LENGTH_MIN);
1136  need_stat(AVERAGE_LENGTH);
1137  need_stat(DOC_LENGTH);
1138  need_stat(WQF);
1139  }
1140 
1141  std::string name() const;
1142 
1143  std::string serialise() const;
1144  BM25Weight * unserialise(const std::string & serialised) const;
1145 
1146  double get_sumpart(Xapian::termcount wdf,
1147  Xapian::termcount doclen,
1148  Xapian::termcount uniqterm,
1149  Xapian::termcount wdfdocmax) const;
1150  double get_maxpart() const;
1151 
1152  double get_sumextra(Xapian::termcount doclen,
1153  Xapian::termcount uniqterms,
1154  Xapian::termcount wdfdocmax) const;
1155  double get_maxextra() const;
1156 
1157  BM25Weight * create_from_parameters(const char * params) const;
1158 };
1159 
1164 
1166  mutable double termweight;
1167 
1169  double param_k1, param_k2, param_k3, param_b;
1170 
1173 
1175  double param_delta;
1176 
1177  BM25PlusWeight * clone() const;
1178 
1179  void init(double factor);
1180 
1181  public:
/// Construct a BM25PlusWeight.
///
/// Out-of-range parameters are clamped rather than rejected.
///
/// @param k1           Stored as param_k1; negative values become 0.
/// @param k2           Stored as param_k2; negative values become 0.
/// @param k3           Stored as param_k3; negative values become 0.
/// @param b            Stored as param_b; clamped into the range [0, 1].
/// @param min_normlen  Stored as param_min_normlen (the minimum normalised
///                     document length value) without any clamping.
/// @param delta        Stored as param_delta (the additional parameter
///                     delta in the BM25+ formula); negative values
///                     become 0.
1216  BM25PlusWeight(double k1, double k2, double k3, double b,
1217  double min_normlen, double delta)
1218  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
1219  param_min_normlen(min_normlen), param_delta(delta)
1220  {
1221  if (param_k1 < 0) param_k1 = 0;
1222  if (param_k2 < 0) param_k2 = 0;
1223  if (param_k3 < 0) param_k3 = 0;
1224  if (param_delta < 0) param_delta = 0;
1225  if (param_b < 0) {
1226  param_b = 0;
1227  } else if (param_b > 1) {
1228  param_b = 1;
1229  }
// These statistics are requested whatever the parameter values are.
1230  need_stat(COLLECTION_SIZE);
1231  need_stat(RSET_SIZE);
1232  need_stat(TERMFREQ);
1233  need_stat(RELTERMFREQ);
1234  need_stat(WDF);
1235  need_stat(WDF_MAX);
// The remaining statistics are requested only for the (clamped) parameter
// combinations tested below.
1236  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
1237  need_stat(DOC_LENGTH_MIN);
1238  need_stat(AVERAGE_LENGTH);
1239  }
1240  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
1241  if (param_k2 != 0) {
1242  need_stat(DOC_LENGTH);
1243  need_stat(QUERY_LENGTH);
1244  }
1245  if (param_k3 != 0) need_stat(WQF);
1246  }
1247 
1249  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
1250  param_min_normlen(0.5), param_delta(1)
1251  {
1252  need_stat(COLLECTION_SIZE);
1253  need_stat(RSET_SIZE);
1254  need_stat(TERMFREQ);
1255  need_stat(RELTERMFREQ);
1256  need_stat(WDF);
1257  need_stat(WDF_MAX);
1258  need_stat(DOC_LENGTH_MIN);
1259  need_stat(AVERAGE_LENGTH);
1260  need_stat(DOC_LENGTH);
1261  need_stat(WQF);
1262  }
1263 
1264  std::string name() const;
1265 
1266  std::string serialise() const;
1267  BM25PlusWeight * unserialise(const std::string & serialised) const;
1268 
1269  double get_sumpart(Xapian::termcount wdf,
1270  Xapian::termcount doclen,
1271  Xapian::termcount uniqterms,
1272  Xapian::termcount wdfdocmax) const;
1273  double get_maxpart() const;
1274 
1275  double get_sumextra(Xapian::termcount doclen,
1276  Xapian::termcount uniqterms,
1277  Xapian::termcount wdfdocmax) const;
1278  double get_maxextra() const;
1279 
1280  BM25PlusWeight * create_from_parameters(const char * params) const;
1281 };
1282 
/// Deprecated weighting class (marked XAPIAN_DEPRECATED_CLASS) implemented
/// as a thin subclass of BM25Weight.
///
/// It adds no members of its own; its only constructor forwards to
/// BM25Weight with fixed values for every parameter except the first.
1296 class XAPIAN_DEPRECATED_CLASS TradWeight : public BM25Weight
1297 {
1298  public:
/// Construct a TradWeight.
///
/// @param k  Passed as BM25Weight's k1 (default 1.0); the other BM25
///           parameters are fixed at k2 = 0, k3 = 0, b = 1 and
///           min_normlen = 0.
1306  explicit TradWeight(double k = 1.0) : BM25Weight(k, 0.0, 0.0, 1.0, 0.0) { }
1307 };
1308 
1329  double param_c;
1330 
1332  double upper_bound;
1333 
1337 
1338  InL2Weight * clone() const;
1339 
1340  void init(double factor);
1341 
1342  public:
1351  explicit InL2Weight(double c);
1352 
1354  : param_c(1.0)
1355  {
1356  need_stat(AVERAGE_LENGTH);
1357  need_stat(DOC_LENGTH);
1358  need_stat(DOC_LENGTH_MIN);
1359  need_stat(DOC_LENGTH_MAX);
1360  need_stat(COLLECTION_SIZE);
1361  need_stat(WDF);
1362  need_stat(WDF_MAX);
1363  need_stat(WQF);
1364  need_stat(TERMFREQ);
1365  }
1366 
1367  std::string name() const;
1368 
1369  std::string serialise() const;
1370  InL2Weight * unserialise(const std::string & serialised) const;
1371 
1372  double get_sumpart(Xapian::termcount wdf,
1373  Xapian::termcount doclen,
1374  Xapian::termcount uniqterms,
1375  Xapian::termcount wdfdocmax) const;
1376  double get_maxpart() const;
1377 
1378  InL2Weight * create_from_parameters(const char * params) const;
1379 };
1380 
1399  double param_c;
1400 
1402  double upper_bound;
1403 
1407  double B_constant;
1408 
1409  IfB2Weight * clone() const;
1410 
1411  void init(double factor);
1412 
1413  public:
1424  explicit IfB2Weight(double c);
1425 
/// Default-construct an IfB2Weight with the wdf normalization parameter
/// param_c set to 1.0, and request the statistics listed in the body.
///
/// Only param_c is initialised here; the other data members are left for
/// later. NOTE(review): presumably init() assigns them - confirm.
1426  IfB2Weight() : param_c(1.0) {
1427  need_stat(AVERAGE_LENGTH);
1428  need_stat(DOC_LENGTH);
1429  need_stat(DOC_LENGTH_MIN);
1430  need_stat(DOC_LENGTH_MAX);
1431  need_stat(COLLECTION_SIZE);
1432  need_stat(COLLECTION_FREQ);
1433  need_stat(WDF);
1434  need_stat(WDF_MAX);
1435  need_stat(WQF);
1436  need_stat(TERMFREQ);
1437  }
1438 
1439  std::string name() const;
1440 
1441  std::string serialise() const;
1442  IfB2Weight * unserialise(const std::string & serialised) const;
1443 
1444  double get_sumpart(Xapian::termcount wdf,
1445  Xapian::termcount doclen,
1446  Xapian::termcount uniqterm,
1447  Xapian::termcount wdfdocmax) const;
1448  double get_maxpart() const;
1449 
1450  IfB2Weight * create_from_parameters(const char * params) const;
1451 };
1452 
1471  double param_c;
1472 
1474  double upper_bound;
1475 
1479  double B_constant;
1480 
1481  IneB2Weight * clone() const;
1482 
1483  void init(double factor);
1484 
1485  public:
1494  explicit IneB2Weight(double c);
1495 
/// Default-construct an IneB2Weight with the wdf normalization parameter
/// param_c set to 1.0, and request the statistics listed in the body.
///
/// Only param_c is initialised here; the other data members are left for
/// later. NOTE(review): presumably init() assigns them - confirm.
1496  IneB2Weight() : param_c(1.0) {
1497  need_stat(AVERAGE_LENGTH);
1498  need_stat(DOC_LENGTH);
1499  need_stat(DOC_LENGTH_MIN);
1500  need_stat(DOC_LENGTH_MAX);
1501  need_stat(COLLECTION_SIZE);
1502  need_stat(WDF);
1503  need_stat(WDF_MAX);
1504  need_stat(WQF);
1505  need_stat(COLLECTION_FREQ);
1506  need_stat(TERMFREQ);
1507  }
1508 
1509  std::string name() const;
1510 
1511  std::string serialise() const;
1512  IneB2Weight * unserialise(const std::string & serialised) const;
1513 
1514  double get_sumpart(Xapian::termcount wdf,
1515  Xapian::termcount doclen,
1516  Xapian::termcount uniqterms,
1517  Xapian::termcount wdfdocmax) const;
1518  double get_maxpart() const;
1519 
1520  IneB2Weight * create_from_parameters(const char * params) const;
1521 };
1522 
1542  double param_c;
1543 
1545  double upper_bound;
1546 
1549  double B_constant;
1550  double wt;
1553 
1554  BB2Weight * clone() const;
1555 
1556  void init(double factor);
1557 
1558  public:
1569  explicit BB2Weight(double c);
1570 
/// Default-construct a BB2Weight with the wdf normalization parameter
/// param_c set to 1.0, and request the statistics listed in the body.
///
/// Only param_c is initialised here; the other data members are left for
/// later. NOTE(review): presumably init() assigns them - confirm.
1571  BB2Weight() : param_c(1.0) {
1572  need_stat(AVERAGE_LENGTH);
1573  need_stat(DOC_LENGTH);
1574  need_stat(DOC_LENGTH_MIN);
1575  need_stat(DOC_LENGTH_MAX);
1576  need_stat(COLLECTION_SIZE);
1577  need_stat(COLLECTION_FREQ);
1578  need_stat(WDF);
1579  need_stat(WDF_MAX);
1580  need_stat(WQF);
1581  need_stat(TERMFREQ);
1582  }
1583 
1584  std::string name() const;
1585 
1586  std::string serialise() const;
1587  BB2Weight * unserialise(const std::string & serialised) const;
1588 
1589  double get_sumpart(Xapian::termcount wdf,
1590  Xapian::termcount doclen,
1591  Xapian::termcount uniqterms,
1592  Xapian::termcount wdfdocmax) const;
1593  double get_maxpart() const;
1594 
1595  BB2Weight * create_from_parameters(const char * params) const;
1596 };
1597 
1617  double upper_bound;
1618 
1622 
1623  DLHWeight * clone() const;
1624 
1625  void init(double factor);
1626 
1627  public:
1629  need_stat(DOC_LENGTH);
1630  need_stat(COLLECTION_FREQ);
1631  need_stat(WDF);
1632  need_stat(WQF);
1633  need_stat(WDF_MAX);
1634  need_stat(DOC_LENGTH_MIN);
1635  need_stat(DOC_LENGTH_MAX);
1636  need_stat(TOTAL_LENGTH);
1637  }
1638 
1639  std::string name() const;
1640 
1641  std::string serialise() const;
1642  DLHWeight * unserialise(const std::string & serialised) const;
1643 
1644  double get_sumpart(Xapian::termcount wdf,
1645  Xapian::termcount doclen,
1646  Xapian::termcount uniqterms,
1647  Xapian::termcount wdfdocmax) const;
1648  double get_maxpart() const;
1649 
1650  DLHWeight * create_from_parameters(const char * params) const;
1651 };
1652 
1673  double factor;
1674 
1676  double param_c;
1677 
1679  double upper_bound;
1680 
1682  double P1, P2;
1683 
1685  double cl;
1686 
1687  PL2Weight * clone() const;
1688 
1689  void init(double factor_);
1690 
1691  public:
1702  explicit PL2Weight(double c);
1703 
/// Default-construct a PL2Weight with param_c set to 1.0, and request the
/// statistics listed in the body.
///
/// Unlike the other default constructors in this file that set param_c,
/// this one does not request TERMFREQ.
/// Only param_c is initialised here; the other data members are left for
/// later. NOTE(review): presumably init() assigns them - confirm.
1704  PL2Weight() : param_c(1.0) {
1705  need_stat(AVERAGE_LENGTH);
1706  need_stat(DOC_LENGTH);
1707  need_stat(DOC_LENGTH_MIN);
1708  need_stat(DOC_LENGTH_MAX);
1709  need_stat(COLLECTION_SIZE);
1710  need_stat(COLLECTION_FREQ);
1711  need_stat(WDF);
1712  need_stat(WDF_MAX);
1713  need_stat(WQF);
1714  }
1715 
1716  std::string name() const;
1717 
1718  std::string serialise() const;
1719  PL2Weight * unserialise(const std::string & serialised) const;
1720 
1721  double get_sumpart(Xapian::termcount wdf,
1722  Xapian::termcount doclen,
1723  Xapian::termcount uniqterms,
1724  Xapian::termcount wdfdocmax) const;
1725  double get_maxpart() const;
1726 
1727  PL2Weight * create_from_parameters(const char * params) const;
1728 };
1729 
1733  double factor;
1734 
1736  double param_c;
1737 
1739  double param_delta;
1740 
1742  double upper_bound;
1743 
1745  double P1, P2;
1746 
1748  double cl;
1749 
1751  double mean;
1752 
1754  double dw;
1755 
1756  PL2PlusWeight * clone() const;
1757 
1758  void init(double factor_);
1759 
1760  public:
1778  PL2PlusWeight(double c, double delta);
1779 
1781  : param_c(1.0), param_delta(0.8) {
1782  need_stat(AVERAGE_LENGTH);
1783  need_stat(DOC_LENGTH);
1784  need_stat(DOC_LENGTH_MIN);
1785  need_stat(DOC_LENGTH_MAX);
1786  need_stat(COLLECTION_SIZE);
1787  need_stat(COLLECTION_FREQ);
1788  need_stat(WDF);
1789  need_stat(WDF_MAX);
1790  need_stat(WQF);
1791  }
1792 
1793  std::string name() const;
1794 
1795  std::string serialise() const;
1796  PL2PlusWeight * unserialise(const std::string & serialised) const;
1797 
1798  double get_sumpart(Xapian::termcount wdf,
1799  Xapian::termcount doclen,
1800  Xapian::termcount uniqterms,
1801  Xapian::termcount wdfdocmax) const;
1802  double get_maxpart() const;
1803 
1804  PL2PlusWeight * create_from_parameters(const char * params) const;
1805 };
1806 
1828  double upper_bound;
1829 
1833 
1834  DPHWeight * clone() const;
1835 
1836  void init(double factor);
1837 
1838  public:
1841  need_stat(DOC_LENGTH);
1842  need_stat(COLLECTION_FREQ);
1843  need_stat(WDF);
1844  need_stat(WQF);
1845  need_stat(WDF_MAX);
1846  need_stat(DOC_LENGTH_MIN);
1847  need_stat(DOC_LENGTH_MAX);
1848  need_stat(TOTAL_LENGTH);
1849  }
1850 
1851  std::string name() const;
1852 
1853  std::string serialise() const;
1854  DPHWeight * unserialise(const std::string & serialised) const;
1855 
1856  double get_sumpart(Xapian::termcount wdf,
1857  Xapian::termcount doclen,
1858  Xapian::termcount uniqterms,
1859  Xapian::termcount wdfdocmax) const;
1860  double get_maxpart() const;
1861 
1862  DPHWeight * create_from_parameters(const char * params) const;
1863 };
1864 
1865 
1877  double factor;
1878 
1881 
1883  double multiplier;
1884 
1885  LMJMWeight* clone() const;
1886 
1887  void init(double factor_);
1888 
1889  public:
/// Construct a LMJMWeight.
///
/// @param lambda  Stored unchanged as param_lambda, the parameter
///                controlling the smoothing (default 0.0). No range check
///                is performed here.
///                NOTE(review): any validation or substitution of a default
///                presumably happens in init() - confirm.
1908  explicit LMJMWeight(double lambda = 0.0) : param_lambda(lambda) {
1909  need_stat(WQF);
1910  need_stat(QUERY_LENGTH);
1911  need_stat(DOC_LENGTH);
1912  need_stat(WDF);
1913  need_stat(WDF_MAX);
1914  need_stat(COLLECTION_FREQ);
1915  need_stat(TOTAL_LENGTH);
1916  need_stat(DOC_LENGTH_MIN);
1917  }
1918 
1919  double get_sumpart(Xapian::termcount wdf,
1920  Xapian::termcount doclen,
1921  Xapian::termcount uniqterm,
1922  Xapian::termcount wdfdocmax) const;
1923 
1924  double get_maxpart() const;
1925 
1926  std::string name() const;
1927 
1928  std::string serialise() const;
1929  LMJMWeight* unserialise(const std::string& serialised) const;
1930 
1931  LMJMWeight* create_from_parameters(const char* params) const;
1932 };
1933 
1950  double factor;
1951 
1953  double param_mu;
1954 
1956  double param_delta;
1957 
1959  double multiplier;
1960 
1967 
1968  LMDirichletWeight* clone() const;
1969 
1970  void init(double factor_);
1971 
1972  public:
/// Construct a LMDirichletWeight.
///
/// Both parameters are stored unchanged; no range check is performed here.
///
/// @param mu     Stored as param_mu, the parameter controlling the
///               smoothing (default 2000.0).
/// @param delta  Stored as param_delta, a pseudo TF value to control the
///               scale of the TF lower bound (default 0.05).
1981  explicit LMDirichletWeight(double mu = 2000.0, double delta = 0.05)
1982  : param_mu(mu), param_delta(delta) {
1983  need_stat(WQF);
1984  need_stat(QUERY_LENGTH);
1985  need_stat(DOC_LENGTH);
1986  need_stat(WDF);
1987  need_stat(WDF_MAX);
1988  need_stat(COLLECTION_FREQ);
1989  need_stat(TOTAL_LENGTH);
1990  need_stat(DOC_LENGTH_MIN);
1991  need_stat(DOC_LENGTH_MAX);
1992  }
1993 
1994  double get_sumpart(Xapian::termcount wdf,
1995  Xapian::termcount doclen,
1996  Xapian::termcount uniqterm,
1997  Xapian::termcount wdfdocmax) const;
1998 
1999  double get_maxpart() const;
2000 
2001  double get_sumextra(Xapian::termcount doclen,
2003  Xapian::termcount) const;
2004 
2005  double get_maxextra() const;
2006 
2007  std::string name() const;
2008 
2009  std::string serialise() const;
2010  LMDirichletWeight* unserialise(const std::string& serialised) const;
2011 
2012  LMDirichletWeight* create_from_parameters(const char* params) const;
2013 };
2014 
2026  double factor;
2027 
2029  double param_delta;
2030 
2032  double multiplier;
2033 
2040 
2041  LMAbsDiscountWeight* clone() const;
2042 
2043  void init(double factor_);
2044 
2045  public:
/// Construct a LMAbsDiscountWeight.
///
/// @param delta  Stored unchanged as param_delta, the parameter
///               controlling the smoothing (default 0.7). No range check
///               is performed here.
///
/// This is the only language-model constructor in this file that also
/// requests UNIQUE_TERMS.
2050  explicit LMAbsDiscountWeight(double delta = 0.7) : param_delta(delta) {
2051  need_stat(WQF);
2052  need_stat(QUERY_LENGTH);
2053  need_stat(DOC_LENGTH);
2054  need_stat(WDF);
2055  need_stat(WDF_MAX);
2056  need_stat(COLLECTION_FREQ);
2057  need_stat(TOTAL_LENGTH);
2058  need_stat(DOC_LENGTH_MIN);
2059  need_stat(UNIQUE_TERMS);
2060  need_stat(DOC_LENGTH_MAX);
2061  }
2062 
2063  double get_sumpart(Xapian::termcount wdf,
2065  Xapian::termcount uniqterm,
2066  Xapian::termcount wdfdocmax) const;
2067 
2068  double get_maxpart() const;
2069 
2070  double get_sumextra(Xapian::termcount doclen,
2072  Xapian::termcount) const;
2073 
2074  double get_maxextra() const;
2075 
2076  std::string name() const;
2077 
2078  std::string serialise() const;
2079  LMAbsDiscountWeight* unserialise(const std::string& serialised) const;
2080 
2081  LMAbsDiscountWeight* create_from_parameters(const char* params) const;
2082 };
2083 
2095  double factor;
2096 
2099 
2101  double param_mu;
2102 
2104  double multiplier;
2105 
2112 
2113  LM2StageWeight* clone() const;
2114 
2115  void init(double factor_);
2116 
2117  public:
/// Construct a LM2StageWeight.
///
/// Both parameters are stored unchanged; no range check is performed here.
///
/// @param lambda  Stored as param_lambda, a parameter controlling the
///                smoothing (default 0.7).
/// @param mu      Stored as param_mu, a parameter controlling the
///                smoothing (default 2000.0).
2125  explicit LM2StageWeight(double lambda = 0.7, double mu = 2000.0)
2126  : param_lambda(lambda), param_mu(mu)
2127  {
2128  need_stat(WQF);
2129  need_stat(QUERY_LENGTH);
2130  need_stat(DOC_LENGTH);
2131  need_stat(WDF);
2132  need_stat(WDF_MAX);
2133  need_stat(COLLECTION_FREQ);
2134  need_stat(TOTAL_LENGTH);
2135  need_stat(DOC_LENGTH_MIN);
2136  need_stat(DOC_LENGTH_MAX);
2137  }
2138 
2139  double get_sumpart(Xapian::termcount wdf,
2140  Xapian::termcount doclen,
2141  Xapian::termcount uniqterm,
2142  Xapian::termcount wdfdocmax) const;
2143  double get_maxpart() const;
2144 
2145  double get_sumextra(Xapian::termcount doclen,
2146  Xapian::termcount uniqterm,
2147  Xapian::termcount wdfdocmax) const;
2148  double get_maxextra() const;
2149 
2150  std::string name() const;
2151 
2152  std::string serialise() const;
2153  LM2StageWeight* unserialise(const std::string& serialised) const;
2154 
2155  LM2StageWeight* create_from_parameters(const char* params) const;
2156 };
2157 
2165  double factor;
2166 
2167  public:
2168  CoordWeight * clone() const;
2169 
2170  void init(double factor_);
2171 
2174 
2175  std::string name() const;
2176 
2177  std::string serialise() const;
2178  CoordWeight * unserialise(const std::string & serialised) const;
2179 
2180  double get_sumpart(Xapian::termcount wdf,
2181  Xapian::termcount doclen,
2182  Xapian::termcount uniqterms,
2183  Xapian::termcount wdfdocmax) const;
2184  double get_maxpart() const;
2185 
2186  CoordWeight * create_from_parameters(const char * params) const;
2187 };
2188 
2209  double numerator;
2210 
2212  double upper_bound;
2213 
2214  void init(double factor);
2215 
2216  public:
2217  DiceWeight* clone() const;
2218 
2221  need_stat(WQF);
2222  need_stat(QUERY_LENGTH);
2223  need_stat(UNIQUE_TERMS);
2224  need_stat(UNIQUE_TERMS_MIN);
2225  }
2226 
2227  std::string name() const;
2228 
2229  std::string serialise() const;
2230  DiceWeight* unserialise(const std::string& serialised) const;
2231 
2232  double get_sumpart(Xapian::termcount wdf,
2233  Xapian::termcount doclen,
2234  Xapian::termcount uniqterm,
2235  Xapian::termcount wdfdocmax) const;
2236  double get_maxpart() const;
2237 
2238  DiceWeight* create_from_parameters(const char* params) const;
2239 };
2240 }
2241 
2242 #endif // XAPIAN_INCLUDED_WEIGHT_H
char name[9]
Definition: dbcheck.cc:57
This class implements the BB2 weighting scheme.
Definition: weight.h:1540
double stirling_constant_2
Definition: weight.h:1552
double B_constant
Definition: weight.h:1549
double upper_bound
The upper bound on the weight.
Definition: weight.h:1545
double stirling_constant_1
Definition: weight.h:1551
double c_product_avlen
The constant values to be used in get_sumpart().
Definition: weight.h:1548
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1542
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:1161
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:1163
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:1172
double param_delta
Additional parameter delta in the BM25+ formula.
Definition: weight.h:1175
double termweight
Factor combining all the document independent factors.
Definition: weight.h:1166
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Definition: weight.h:1216
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:1050
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:1061
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
Definition: weight.h:1095
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:1052
double termweight
Factor combining all the document independent factors.
Definition: weight.h:1055
Class implementing a "boolean" weighting scheme.
Definition: weight.h:678
BoolWeight()
Construct a BoolWeight.
Definition: weight.h:685
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:2163
double factor
The factor to multiply weights by.
Definition: weight.h:2165
CoordWeight()
Construct a CoordWeight.
Definition: weight.h:2173
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1615
double upper_bound
The upper bound on the weight.
Definition: weight.h:1617
double log_constant
The constant value to be used in get_sumpart().
Definition: weight.h:1620
double wqf_product_factor
Definition: weight.h:1621
This class implements the DPH weighting scheme.
Definition: weight.h:1826
double wqf_product_factor
Definition: weight.h:1832
DPHWeight()
Construct a DPHWeight.
Definition: weight.h:1840
double upper_bound
The upper bound on the weight.
Definition: weight.h:1828
double log_constant
The constant value used in get_sumpart().
Definition: weight.h:1831
Virtual base class for Database internals.
Xapian::Weight subclass implementing Dice Coefficient.
Definition: weight.h:2207
double upper_bound
Upper bound on the weight.
Definition: weight.h:2212
double numerator
The numerator in the weight calculation.
Definition: weight.h:2209
DiceWeight()
Construct a DiceWeight.
Definition: weight.h:2220
This class implements the IfB2 weighting scheme.
Definition: weight.h:1397
double c_product_avlen
Definition: weight.h:1406
double B_constant
Definition: weight.h:1407
double upper_bound
The upper bound on the weight.
Definition: weight.h:1402
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1399
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Definition: weight.h:1405
This class implements the InL2 weighting scheme.
Definition: weight.h:1327
double c_product_avlen
Definition: weight.h:1336
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
Definition: weight.h:1335
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1329
double upper_bound
The upper bound on the weight a term can give to a document.
Definition: weight.h:1332
This class implements the IneB2 weighting scheme.
Definition: weight.h:1469
double upper_bound
The upper bound of the weight.
Definition: weight.h:1474
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1471
double c_product_avlen
Definition: weight.h:1478
double wqf_product_idf
Constant values used in get_sumpart().
Definition: weight.h:1477
Language Model weighting with Two Stage smoothing.
Definition: weight.h:2093
LM2StageWeight(double lambda=0.7, double mu=2000.0)
Construct a LM2StageWeight.
Definition: weight.h:2125
double factor
The factor to multiply weights by.
Definition: weight.h:2095
double multiplier
Precalculated multiplier for use in weight calculations.
Definition: weight.h:2104
double extra_offset
Precalculated offset to add to every sumextra.
Definition: weight.h:2111
double param_mu
Parameter controlling the smoothing.
Definition: weight.h:2101
double param_lambda
Parameter controlling the smoothing.
Definition: weight.h:2098
Language Model weighting with Absolute Discount smoothing.
Definition: weight.h:2024
double factor
The factor to multiply weights by.
Definition: weight.h:2026
double param_delta
Parameter controlling the smoothing.
Definition: weight.h:2029
LMAbsDiscountWeight(double delta=0.7)
Construct a LMAbsDiscountWeight.
Definition: weight.h:2050
double multiplier
Precalculated multiplier for use in weight calculations.
Definition: weight.h:2032
double extra_offset
Precalculated offset to add to every sumextra.
Definition: weight.h:2039
Language Model weighting with Dirichlet or Dir+ smoothing.
Definition: weight.h:1948
double param_delta
A pseudo TF value to control the scale of the TF lower bound.
Definition: weight.h:1956
double param_mu
Parameter controlling the smoothing.
Definition: weight.h:1953
double factor
The factor to multiply weights by.
Definition: weight.h:1950
double extra_offset
Precalculated offset to add to every sumextra.
Definition: weight.h:1966
LMDirichletWeight(double mu=2000.0, double delta=0.05)
Construct a LMDirichletWeight.
Definition: weight.h:1981
double multiplier
Precalculated multiplier for use in weight calculations.
Definition: weight.h:1959
Language Model weighting with Jelinek-Mercer smoothing.
Definition: weight.h:1875
LMJMWeight(double lambda=0.0)
Construct a LMJMWeight.
Definition: weight.h:1908
double multiplier
Precalculated multiplier for use in weight calculations.
Definition: weight.h:1883
double factor
The factor to multiply weights by.
Definition: weight.h:1877
double param_lambda
Parameter controlling the smoothing.
Definition: weight.h:1880
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1731
double mean
Set by init() to get_collection_freq() / get_collection_size()
Definition: weight.h:1751
double dw
Weight contribution of delta term in the PL2+ function.
Definition: weight.h:1754
double factor
The factor to multiply weights by.
Definition: weight.h:1733
double P1
Constants for a given term in a given query.
Definition: weight.h:1745
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1736
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1748
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Definition: weight.h:1739
double upper_bound
The upper bound on the weight.
Definition: weight.h:1742
This class implements the PL2 weighting scheme.
Definition: weight.h:1671
double upper_bound
The upper bound on the weight.
Definition: weight.h:1679
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1676
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1685
double P1
Constants for a given term in a given query.
Definition: weight.h:1682
double factor
The factor to multiply weights by.
Definition: weight.h:1673
Registry for user subclasses.
Definition: registry.h:47
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:704
double wqf_factor
The factor to multiply with the weight.
Definition: weight.h:868
TfIdfWeight(const std::string &normalizations)
Construct a TfIdfWeight.
Definition: weight.h:936
idf_norm
Idf normalizations.
Definition: weight.h:784
double param_delta
Definition: weight.h:874
wt_norm wt_norm_
The parameter for normalization for the document weight.
Definition: weight.h:865
wdf_norm wdf_norm_
The parameter for normalization for the wdf.
Definition: weight.h:861
wt_norm
Weight normalizations.
Definition: weight.h:852
idf_norm idf_norm_
The parameter for normalization for the idf.
Definition: weight.h:863
double idfn
Normalised IDF value (document-independent).
Definition: weight.h:871
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
Definition: weight.h:1023
wdf_norm
Wdf normalizations.
Definition: weight.h:710
TfIdfWeight(wdf_norm wdf_normalization, idf_norm idf_normalization, wt_norm wt_normalization)
Construct a TfIdfWeight.
Definition: weight.h:998
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:1297
TradWeight(double k=1.0)
Construct a TradWeight.
Definition: weight.h:1306
Class to hold statistics for a given collection.
Abstract base class for weighting schemes.
Definition: weight.h:38
Xapian::termcount unique_terms_upper_bound_
An upper bound on the number of unique terms in any document in the shard.
Definition: weight.h:249
Xapian::termcount get_db_wdf_upper_bound() const
An upper bound on the wdf of this term in the database.
Definition: weight.h:669
Xapian::termcount db_doclength_upper_bound_
An upper bound on the maximum length of any document in the database.
Definition: weight.h:255
Xapian::termcount wqf_
The within-query-frequency of this term.
Definition: weight.h:227
Xapian::termcount get_query_length() const
The length of the query.
Definition: weight.h:567
bool get_sumpart_needs_wdfdocmax_() const
Definition: weight.h:535
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:586
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Definition: weight.h:561
Xapian::totallength total_length_
Total length of all documents in the collection.
Definition: weight.h:239
void operator=(const Weight &)
Don't allow assignment.
Xapian::termcount db_wdf_upper_bound_
An upper bound on the wdf of this term in the database.
Definition: weight.h:258
Xapian::doccount collection_size_
The number of documents in the collection.
Definition: weight.h:206
Weight()
Default constructor, needed by subclass constructors.
Definition: weight.h:273
Xapian::termcount query_length_
The length of the query.
Definition: weight.h:224
Xapian::termcount get_db_unique_terms_upper_bound() const
An upper bound on the number of unique terms in any document in the database.
Definition: weight.h:652
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:558
bool is_bool_weight_() const
Definition: weight.h:520
virtual Weight * clone() const =0
Clone this object.
Xapian::doclength average_length_
The average length of a document in the collection.
Definition: weight.h:212
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:233
Xapian::doccount rset_size_
The number of documents marked as relevant.
Definition: weight.h:209
bool get_sumpart_needs_wdf_() const
Definition: weight.h:484
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
Definition: weight.h:599
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Definition: weight.h:221
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term in the shard.
Definition: weight.h:236
Xapian::termcount get_db_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
Definition: weight.h:641
virtual void init(double factor)=0
Allow the subclass to perform any initialisation it needs to.
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:183
virtual double get_maxpart() const =0
Return an upper bound on what get_sumpart() can return for any document.
Xapian::termcount get_db_unique_terms_lower_bound() const
A lower bound on the number of unique terms in any document in the database.
Definition: weight.h:661
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Definition: weight.h:552
Xapian::termcount db_doclength_lower_bound_
A lower bound on the minimum length of any document in the database.
Definition: weight.h:252
Xapian::doccount termfreq_
The number of documents which this term indexes.
Definition: weight.h:215
Xapian::termcount get_unique_terms_lower_bound() const
A lower bound on the number of unique terms in any document in the shard.
Definition: weight.h:623
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:570
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:564
Xapian::termcount collectionfreq_
Definition: weight.h:218
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:549
Weight(const Weight &)
Don't allow copying.
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:555
Xapian::termcount get_unique_terms_upper_bound() const
An upper bound on the number of unique terms in any document in the shard.
Definition: weight.h:612
Xapian::termcount db_unique_terms_upper_bound_
An upper bound on the number of unique terms in any document in the database.
Definition: weight.h:268
bool get_sumpart_needs_uniqueterms_() const
Definition: weight.h:494
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:576
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Definition: weight.h:203
Xapian::termcount get_db_doclength_upper_bound() const
An upper bound on the maximum length of any document in the database.
Definition: weight.h:631
bool get_sumpart_needs_doclength_() const
Definition: weight.h:475
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:230
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Definition: weight.h:41
Xapian::termcount db_unique_terms_lower_bound_
A lower bound on the number of unique terms in any document in the database.
Definition: weight.h:263
virtual double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax) const =0
Calculate the weight contribution for this object's term to a document.
Xapian::termcount unique_terms_lower_bound_
A lower bound on the number of unique terms in any document in the shard.
Definition: weight.h:244
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:594
An indexed database of documents.
string term
Define XAPIAN_DEPRECATED() and related macros.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
double doclength
A normalised document length.
Definition: types.h:58
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
Class for looking up user subclasses during unserialisation.
@ NONE
Definition: sbl-dispatch.h:26
typedefs for Xapian
Define XAPIAN_VISIBILITY_* macros.
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28
#define XAPIAN_VISIBILITY_INTERNAL
Definition: visibility.h:29