xapian-core  1.4.25
weight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016,2019 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  * Copyright (C) 2013,2014 Aarsh Shah
7  * Copyright (C) 2016 Vivek Pal
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #ifndef XAPIAN_INCLUDED_WEIGHT_H
25 #define XAPIAN_INCLUDED_WEIGHT_H
26 
27 #include <string>
28 
29 #include <xapian/types.h>
30 #include <xapian/visibility.h>
31 
32 namespace Xapian {
33 
36  protected:
/// Stats which the weighting scheme can use (see need_stat()).
///
/// Each enumerator other than TOTAL_LENGTH is a distinct single bit, so
/// values can be OR-ed together into the stats_needed bitmask.
38  typedef enum {
40  COLLECTION_SIZE = 1,
42  RSET_SIZE = 2,
44  AVERAGE_LENGTH = 4,
46  TERMFREQ = 8,
48  RELTERMFREQ = 16,
50  QUERY_LENGTH = 32,
52  WQF = 64,
54  WDF = 128,
56  DOC_LENGTH = 256,
58  DOC_LENGTH_MIN = 512,
60  DOC_LENGTH_MAX = 1024,
62  WDF_MAX = 2048,
64  COLLECTION_FREQ = 4096,
66  UNIQUE_TERMS = 8192,
// Composite flag: sets both bits, matching get_total_length(), which is
// computed from the average length and the collection size.
71  TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
72  } stat_flags;
73 
/// Tell Xapian that your subclass will want a particular statistic.
///
/// Calls accumulate: each call ORs @a flag into the stats_needed bitmask and
/// never clears bits that were set earlier.
///
/// @param flag  The stat_flags value (or OR-ed combination such as
///              TOTAL_LENGTH) to request.
83  void need_stat(stat_flags flag) {
84  stats_needed = stat_flags(stats_needed | flag); // | on the unscoped enum yields an integer, hence the cast back
85  }
86 
96  virtual void init(double factor) = 0;
97 
98  private:
100  void operator=(const Weight &);
101 
103  stat_flags stats_needed;
104 
107 
110 
113 
116 
117  // The collection frequency of the term.
119 
122 
125 
128 
131 
134 
137 
138  public:
139 
/// Default constructor, needed by subclass constructors.
///
/// Value-initialises stats_needed, so no statistics are requested until a
/// subclass calls need_stat().
141  Weight() : stats_needed() { }
142 
/// Type of smoothing to use with the Language Model Weighting scheme.
///
/// LMWeight's constructor takes one of these values; TWO_STAGE_SMOOTHING is
/// its default argument.
147  typedef enum {
148  TWO_STAGE_SMOOTHING = 1,
149  DIRICHLET_SMOOTHING = 2,
150  ABSOLUTE_DISCOUNT_SMOOTHING = 3,
151  JELINEK_MERCER_SMOOTHING = 4,
152  DIRICHLET_PLUS_SMOOTHING = 5
153  } type_smoothing;
154 
155  class Internal;
156 
158  virtual ~Weight();
159 
176  virtual Weight * clone() const = 0;
177 
191  virtual std::string name() const;
192 
198  virtual std::string serialise() const;
199 
217  virtual Weight * unserialise(const std::string & serialised) const;
218 
229  virtual double get_sumpart(Xapian::termcount wdf,
230  Xapian::termcount doclen,
231  Xapian::termcount uniqterms) const = 0;
232 
238  virtual double get_maxpart() const = 0;
239 
248  virtual double get_sumextra(Xapian::termcount doclen,
249  Xapian::termcount uniqterms) const = 0;
250 
257  virtual double get_maxextra() const = 0;
258 
273  void init_(const Internal & stats, Xapian::termcount query_len_,
274  const std::string & term, Xapian::termcount wqf_,
275  double factor);
276 
290  void init_(const Internal & stats, Xapian::termcount query_len_,
291  const std::string & term, Xapian::termcount wqf_,
292  double factor, void* postlist);
293 
304  void init_(const Internal & stats, Xapian::termcount query_len_,
305  double factor, Xapian::doccount termfreq,
306  Xapian::doccount reltermfreq, Xapian::termcount collection_freq);
307 
314  void init_(const Internal & stats, Xapian::termcount query_len_);
315 
323  return stats_needed & DOC_LENGTH;
324  }
325 
/// Report whether the WDF statistic has been requested via need_stat().
///
/// @return true if the WDF bit is set in stats_needed, false otherwise.
331  bool get_sumpart_needs_wdf_() const {
332  return stats_needed & WDF; // non-zero masked value converts to true
333  }
334 
342  return stats_needed & UNIQUE_TERMS;
343  }
344 
/// Report whether this object is a BoolWeight.
///
/// @return true only if no statistics have been requested and name()
///         returns exactly "Xapian::BoolWeight".
346  bool is_bool_weight_() const {
347  // Checking the name isn't ideal, but (get_maxpart() == 0.0) isn't
348  // required to work without init() having been called. We can at
349  // least avoid the virtual method call in most non-BoolWeight cases
350  // as most other classes will need at least some stats.
351  return stats_needed == 0 && name() == "Xapian::BoolWeight"; // && short-circuits: name() is only called when no stats are needed
352  }
353 
354  protected:
360  Weight(const Weight &);
361 
/// The number of documents in the collection.
363  Xapian::doccount get_collection_size() const { return collection_size_; }
364 
/// The number of documents marked as relevant.
366  Xapian::doccount get_rset_size() const { return rset_size_; }
367 
/// The average length of a document in the collection.
369  Xapian::doclength get_average_length() const { return average_length_; }
370 
/// The number of documents which this term indexes.
372  Xapian::doccount get_termfreq() const { return termfreq_; }
373 
/// The number of relevant documents which this term indexes.
375  Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
376 
/// The collection frequency of the term.
378  Xapian::termcount get_collection_freq() const { return collectionfreq_; }
379 
/// The length of the query.
381  Xapian::termcount get_query_length() const { return query_length_; }
382 
/// The within-query-frequency of this term.
384  Xapian::termcount get_wqf() const { return wqf_; }
385 
391  return doclength_upper_bound_;
392  }
393 
401  return doclength_lower_bound_;
402  }
403 
409  return wdf_upper_bound_;
410  }
411 
414  return Xapian::totallength(average_length_ * collection_size_ + 0.5);
415  }
416 };
417 
423  BoolWeight * clone() const;
424 
425  void init(double factor);
426 
427  public:
430 
431  std::string name() const;
432 
433  std::string serialise() const;
434  BoolWeight * unserialise(const std::string & serialised) const;
435 
436  double get_sumpart(Xapian::termcount wdf,
437  Xapian::termcount doclen,
438  Xapian::termcount uniqterms) const;
439  double get_maxpart() const;
440 
441  double get_sumextra(Xapian::termcount doclen,
442  Xapian::termcount uniqterms) const;
443  double get_maxextra() const;
444 };
445 
448  /* Three character string indicating the normalizations for tf(wdf), idf and
449  tfidf weight. */
450  std::string normalizations;
451 
453  double factor;
454 
455  TfIdfWeight * clone() const;
456 
457  void init(double factor);
458 
459  /* When additional normalizations are implemented in the future, the additional statistics for them
460  should be accessed by these functions. */
461  double get_wdfn(Xapian::termcount wdf, char c) const;
462  double get_idfn(Xapian::doccount termfreq, char c) const;
463  double get_wtn(double wt, char c) const;
464 
465  public:
506  explicit TfIdfWeight(const std::string &normalizations);
507 
510  : normalizations("ntn")
511  {
512  need_stat(TERMFREQ);
513  need_stat(WDF);
514  need_stat(WDF_MAX);
515  need_stat(COLLECTION_SIZE);
516  }
517 
518  std::string name() const;
519 
520  std::string serialise() const;
521  TfIdfWeight * unserialise(const std::string & serialised) const;
522 
523  double get_sumpart(Xapian::termcount wdf,
524  Xapian::termcount doclen,
525  Xapian::termcount uniqterm) const;
526  double get_maxpart() const;
527 
528  double get_sumextra(Xapian::termcount doclen,
529  Xapian::termcount uniqterms) const;
530  double get_maxextra() const;
531 };
532 
533 
538 
540  mutable double termweight;
541 
543  double param_k1, param_k2, param_k3, param_b;
544 
547 
548  BM25Weight * clone() const;
549 
550  void init(double factor);
551 
552  public:
/// Construct a BM25Weight.
///
/// Out-of-range parameters are clamped rather than rejected, and the set of
/// statistics requested depends on which parameters end up non-zero.
///
/// @param k1           Stored in param_k1; values below 0 are clamped to 0.
/// @param k2           Stored in param_k2; values below 0 are clamped to 0.
/// @param k3           Stored in param_k3; values below 0 are clamped to 0.
/// @param b            Stored in param_b; clamped to the range [0, 1].
/// @param min_normlen  The minimum normalised document length value; stored
///                     as given (no clamping is applied here).
580  BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
581  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
582  param_min_normlen(min_normlen)
583  {
584  if (param_k1 < 0) param_k1 = 0;
585  if (param_k2 < 0) param_k2 = 0;
586  if (param_k3 < 0) param_k3 = 0;
587  if (param_b < 0) {
588  param_b = 0;
589  } else if (param_b > 1) {
590  param_b = 1;
591  }
// These six statistics are requested whatever the parameter values are.
592  need_stat(COLLECTION_SIZE);
593  need_stat(RSET_SIZE);
594  need_stat(TERMFREQ);
595  need_stat(RELTERMFREQ);
596  need_stat(WDF);
597  need_stat(WDF_MAX);
// The remaining statistics are requested only when the (clamped) parameters
// that gate them are non-zero.
598  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
599  need_stat(DOC_LENGTH_MIN);
600  need_stat(AVERAGE_LENGTH);
601  }
602  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
603  if (param_k2 != 0) need_stat(QUERY_LENGTH);
604  if (param_k3 != 0) need_stat(WQF);
605  }
606 
608  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
609  param_min_normlen(0.5)
610  {
611  need_stat(COLLECTION_SIZE);
612  need_stat(RSET_SIZE);
613  need_stat(TERMFREQ);
614  need_stat(RELTERMFREQ);
615  need_stat(WDF);
616  need_stat(WDF_MAX);
617  need_stat(DOC_LENGTH_MIN);
618  need_stat(AVERAGE_LENGTH);
619  need_stat(DOC_LENGTH);
620  need_stat(WQF);
621  }
622 
623  std::string name() const;
624 
625  std::string serialise() const;
626  BM25Weight * unserialise(const std::string & serialised) const;
627 
628  double get_sumpart(Xapian::termcount wdf,
629  Xapian::termcount doclen,
630  Xapian::termcount uniqterm) const;
631  double get_maxpart() const;
632 
633  double get_sumextra(Xapian::termcount doclen,
634  Xapian::termcount uniqterms) const;
635  double get_maxextra() const;
636 };
637 
642 
644  mutable double termweight;
645 
647  double param_k1, param_k2, param_k3, param_b;
648 
651 
653  double param_delta;
654 
655  BM25PlusWeight * clone() const;
656 
657  void init(double factor);
658 
659  public:
/// Construct a BM25PlusWeight.
///
/// Out-of-range parameters are clamped rather than rejected, and the set of
/// statistics requested depends on which parameters end up non-zero.
///
/// @param k1           Stored in param_k1; values below 0 are clamped to 0.
/// @param k2           Stored in param_k2; values below 0 are clamped to 0.
/// @param k3           Stored in param_k3; values below 0 are clamped to 0.
/// @param b            Stored in param_b; clamped to the range [0, 1].
/// @param min_normlen  The minimum normalised document length value; stored
///                     as given (no clamping is applied here).
/// @param delta        Additional parameter delta in the BM25+ formula;
///                     values below 0 are clamped to 0.
694  BM25PlusWeight(double k1, double k2, double k3, double b,
695  double min_normlen, double delta)
696  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
697  param_min_normlen(min_normlen), param_delta(delta)
698  {
699  if (param_k1 < 0) param_k1 = 0;
700  if (param_k2 < 0) param_k2 = 0;
701  if (param_k3 < 0) param_k3 = 0;
702  if (param_delta < 0) param_delta = 0;
703  if (param_b < 0) {
704  param_b = 0;
705  } else if (param_b > 1) {
706  param_b = 1;
707  }
// These six statistics are requested whatever the parameter values are.
708  need_stat(COLLECTION_SIZE);
709  need_stat(RSET_SIZE);
710  need_stat(TERMFREQ);
711  need_stat(RELTERMFREQ);
712  need_stat(WDF);
713  need_stat(WDF_MAX);
// The remaining statistics are requested only when the (clamped) parameters
// that gate them are non-zero; param_delta does not affect which are requested.
714  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
715  need_stat(DOC_LENGTH_MIN);
716  need_stat(AVERAGE_LENGTH);
717  }
718  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
719  if (param_k2 != 0) need_stat(QUERY_LENGTH);
720  if (param_k3 != 0) need_stat(WQF);
721  }
722 
724  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
725  param_min_normlen(0.5), param_delta(1)
726  {
727  need_stat(COLLECTION_SIZE);
728  need_stat(RSET_SIZE);
729  need_stat(TERMFREQ);
730  need_stat(RELTERMFREQ);
731  need_stat(WDF);
732  need_stat(WDF_MAX);
733  need_stat(DOC_LENGTH_MIN);
734  need_stat(AVERAGE_LENGTH);
735  need_stat(DOC_LENGTH);
736  need_stat(WQF);
737  }
738 
739  std::string name() const;
740 
741  std::string serialise() const;
742  BM25PlusWeight * unserialise(const std::string & serialised) const;
743 
744  double get_sumpart(Xapian::termcount wdf,
745  Xapian::termcount doclen,
746  Xapian::termcount uniqterm) const;
747  double get_maxpart() const;
748 
749  double get_sumextra(Xapian::termcount doclen,
750  Xapian::termcount uniqterms) const;
751  double get_maxextra() const;
752 };
753 
766 
768  mutable double termweight;
769 
771  double param_k;
772 
773  TradWeight * clone() const;
774 
775  void init(double factor);
776 
777  public:
/// Construct a TradWeight.
///
/// @param k  The parameter in the formula (default 1.0). Values below 0 are
///           clamped to 0.
785  explicit TradWeight(double k = 1.0) : param_k(k) {
786  if (param_k < 0) param_k = 0;
787  if (param_k != 0.0) { // these two statistics are only requested for a non-zero k
788  need_stat(AVERAGE_LENGTH);
789  need_stat(DOC_LENGTH);
790  }
// The statistics below are requested regardless of k.
791  need_stat(COLLECTION_SIZE);
792  need_stat(RSET_SIZE);
793  need_stat(TERMFREQ);
794  need_stat(RELTERMFREQ);
795  need_stat(DOC_LENGTH_MIN);
796  need_stat(WDF);
797  need_stat(WDF_MAX);
798  }
799 
800  std::string name() const;
801 
802  std::string serialise() const;
803  TradWeight * unserialise(const std::string & serialised) const;
804 
805  double get_sumpart(Xapian::termcount wdf,
806  Xapian::termcount doclen,
807  Xapian::termcount uniqueterms) const;
808  double get_maxpart() const;
809 
810  double get_sumextra(Xapian::termcount doclen,
811  Xapian::termcount uniqterms) const;
812  double get_maxextra() const;
813 };
814 
835  double param_c;
836 
838  double upper_bound;
839 
843 
844  InL2Weight * clone() const;
845 
846  void init(double factor);
847 
848  public:
857  explicit InL2Weight(double c);
858 
860  : param_c(1.0)
861  {
862  need_stat(AVERAGE_LENGTH);
863  need_stat(DOC_LENGTH);
864  need_stat(DOC_LENGTH_MIN);
865  need_stat(DOC_LENGTH_MAX);
866  need_stat(COLLECTION_SIZE);
867  need_stat(WDF);
868  need_stat(WDF_MAX);
869  need_stat(WQF);
870  need_stat(TERMFREQ);
871  }
872 
873  std::string name() const;
874 
875  std::string serialise() const;
876  InL2Weight * unserialise(const std::string & serialised) const;
877 
878  double get_sumpart(Xapian::termcount wdf,
879  Xapian::termcount doclen,
880  Xapian::termcount uniqterms) const;
881  double get_maxpart() const;
882 
883  double get_sumextra(Xapian::termcount doclen,
884  Xapian::termcount uniqterms) const;
885  double get_maxextra() const;
886 };
887 
906  double param_c;
907 
909  double upper_bound;
910 
914  double B_constant;
915 
916  IfB2Weight * clone() const;
917 
918  void init(double factor);
919 
920  public:
931  explicit IfB2Weight(double c);
932 
/// Default constructor: sets the wdf normalization parameter param_c to 1.0
/// and requests the statistics listed below.
///
/// Only param_c is initialised here; the other data members are left for
/// later (presumably init() - not visible in this listing).
933  IfB2Weight() : param_c(1.0) {
934  need_stat(AVERAGE_LENGTH);
935  need_stat(DOC_LENGTH);
936  need_stat(DOC_LENGTH_MIN);
937  need_stat(DOC_LENGTH_MAX);
938  need_stat(COLLECTION_SIZE);
939  need_stat(COLLECTION_FREQ);
940  need_stat(WDF);
941  need_stat(WDF_MAX);
942  need_stat(WQF);
943  need_stat(TERMFREQ);
944  }
945 
946  std::string name() const;
947 
948  std::string serialise() const;
949  IfB2Weight * unserialise(const std::string & serialised) const;
950 
951  double get_sumpart(Xapian::termcount wdf,
952  Xapian::termcount doclen,
953  Xapian::termcount uniqterm) const;
954  double get_maxpart() const;
955 
956  double get_sumextra(Xapian::termcount doclen,
957  Xapian::termcount uniqterms) const;
958  double get_maxextra() const;
959 };
960 
979  double param_c;
980 
982  double upper_bound;
983 
987  double B_constant;
988 
989  IneB2Weight * clone() const;
990 
991  void init(double factor);
992 
993  public:
1002  explicit IneB2Weight(double c);
1003 
/// Default constructor: sets the wdf normalization parameter param_c to 1.0
/// and requests the statistics listed below.
///
/// Only param_c is initialised here; the other data members are left for
/// later (presumably init() - not visible in this listing).
1004  IneB2Weight() : param_c(1.0) {
1005  need_stat(AVERAGE_LENGTH);
1006  need_stat(DOC_LENGTH);
1007  need_stat(DOC_LENGTH_MIN);
1008  need_stat(DOC_LENGTH_MAX);
1009  need_stat(COLLECTION_SIZE);
1010  need_stat(WDF);
1011  need_stat(WDF_MAX);
1012  need_stat(WQF);
1013  need_stat(COLLECTION_FREQ);
1014  need_stat(TERMFREQ);
1015  }
1016 
1017  std::string name() const;
1018 
1019  std::string serialise() const;
1020  IneB2Weight * unserialise(const std::string & serialised) const;
1021 
1022  double get_sumpart(Xapian::termcount wdf,
1023  Xapian::termcount doclen,
1024  Xapian::termcount uniqterms) const;
1025  double get_maxpart() const;
1026 
1027  double get_sumextra(Xapian::termcount doclen,
1028  Xapian::termcount uniqterms) const;
1029  double get_maxextra() const;
1030 };
1031 
1051  double param_c;
1052 
1054  double upper_bound;
1055 
1058  double B_constant;
1059  double wt;
1062 
1063  BB2Weight * clone() const;
1064 
1065  void init(double factor);
1066 
1067  public:
1078  explicit BB2Weight(double c);
1079 
/// Default constructor: sets the wdf normalization parameter param_c to 1.0
/// and requests the statistics listed below.
///
/// Only param_c is initialised here; the other data members are left for
/// later (presumably init() - not visible in this listing).
1080  BB2Weight() : param_c(1.0) {
1081  need_stat(AVERAGE_LENGTH);
1082  need_stat(DOC_LENGTH);
1083  need_stat(DOC_LENGTH_MIN);
1084  need_stat(DOC_LENGTH_MAX);
1085  need_stat(COLLECTION_SIZE);
1086  need_stat(COLLECTION_FREQ);
1087  need_stat(WDF);
1088  need_stat(WDF_MAX);
1089  need_stat(WQF);
1090  need_stat(TERMFREQ);
1091  }
1092 
1093  std::string name() const;
1094 
1095  std::string serialise() const;
1096  BB2Weight * unserialise(const std::string & serialised) const;
1097 
1098  double get_sumpart(Xapian::termcount wdf,
1099  Xapian::termcount doclen,
1100  Xapian::termcount uniqterms) const;
1101  double get_maxpart() const;
1102 
1103  double get_sumextra(Xapian::termcount doclen,
1104  Xapian::termcount uniqterms) const;
1105  double get_maxextra() const;
1106 };
1107 
1127  double lower_bound;
1128 
1130  double upper_bound;
1131 
1135 
1136  DLHWeight * clone() const;
1137 
1138  void init(double factor);
1139 
1140  public:
1142  need_stat(DOC_LENGTH);
1143  need_stat(COLLECTION_FREQ);
1144  need_stat(WDF);
1145  need_stat(WQF);
1146  need_stat(WDF_MAX);
1147  need_stat(DOC_LENGTH_MIN);
1148  need_stat(DOC_LENGTH_MAX);
1149  need_stat(TOTAL_LENGTH);
1150  }
1151 
1152  std::string name() const;
1153 
1154  std::string serialise() const;
1155  DLHWeight * unserialise(const std::string & serialised) const;
1156 
1157  double get_sumpart(Xapian::termcount wdf,
1158  Xapian::termcount doclen,
1159  Xapian::termcount uniqterms) const;
1160  double get_maxpart() const;
1161 
1162  double get_sumextra(Xapian::termcount doclen,
1163  Xapian::termcount uniqterms) const;
1164  double get_maxextra() const;
1165 };
1166 
1187  double param_c;
1188 
1196  double lower_bound;
1197 
1199  double upper_bound;
1200 
1202  double P1, P2;
1203 
1205  double cl;
1206 
1207  PL2Weight * clone() const;
1208 
1209  void init(double factor);
1210 
1211  public:
1222  explicit PL2Weight(double c);
1223 
/// Default constructor: sets the wdf normalization parameter param_c to 1.0
/// and requests the statistics listed below.
///
/// Unlike the IfB2/IneB2/BB2 default constructors in this file, TERMFREQ is
/// not requested here.
1224  PL2Weight() : param_c(1.0) {
1225  need_stat(AVERAGE_LENGTH);
1226  need_stat(DOC_LENGTH);
1227  need_stat(DOC_LENGTH_MIN);
1228  need_stat(DOC_LENGTH_MAX);
1229  need_stat(COLLECTION_SIZE);
1230  need_stat(COLLECTION_FREQ);
1231  need_stat(WDF);
1232  need_stat(WDF_MAX);
1233  need_stat(WQF);
1234  }
1235 
1236  std::string name() const;
1237 
1238  std::string serialise() const;
1239  PL2Weight * unserialise(const std::string & serialised) const;
1240 
1241  double get_sumpart(Xapian::termcount wdf,
1242  Xapian::termcount doclen,
1243  Xapian::termcount uniqterms) const;
1244  double get_maxpart() const;
1245 
1246  double get_sumextra(Xapian::termcount doclen,
1247  Xapian::termcount uniqterms) const;
1248  double get_maxextra() const;
1249 };
1250 
1254  double factor;
1255 
1257  double param_c;
1258 
1260  double param_delta;
1261 
1263  double upper_bound;
1264 
1266  double P1, P2;
1267 
1269  double cl;
1270 
1272  double mean;
1273 
1275  double dw;
1276 
1277  PL2PlusWeight * clone() const;
1278 
1279  void init(double factor_);
1280 
1281  public:
1299  PL2PlusWeight(double c, double delta);
1300 
1302  : param_c(1.0), param_delta(0.8) {
1303  need_stat(AVERAGE_LENGTH);
1304  need_stat(DOC_LENGTH);
1305  need_stat(DOC_LENGTH_MIN);
1306  need_stat(DOC_LENGTH_MAX);
1307  need_stat(COLLECTION_SIZE);
1308  need_stat(COLLECTION_FREQ);
1309  need_stat(WDF);
1310  need_stat(WDF_MAX);
1311  need_stat(WQF);
1312  }
1313 
1314  std::string name() const;
1315 
1316  std::string serialise() const;
1317  PL2PlusWeight * unserialise(const std::string & serialised) const;
1318 
1319  double get_sumpart(Xapian::termcount wdf,
1320  Xapian::termcount doclen,
1321  Xapian::termcount uniqterms) const;
1322  double get_maxpart() const;
1323 
1324  double get_sumextra(Xapian::termcount doclen,
1325  Xapian::termcount uniqterms) const;
1326  double get_maxextra() const;
1327 };
1328 
1350  double upper_bound;
1351 
1353  double lower_bound;
1354 
1358 
1359  DPHWeight * clone() const;
1360 
1361  void init(double factor);
1362 
1363  public:
1366  need_stat(DOC_LENGTH);
1367  need_stat(COLLECTION_FREQ);
1368  need_stat(WDF);
1369  need_stat(WQF);
1370  need_stat(WDF_MAX);
1371  need_stat(DOC_LENGTH_MIN);
1372  need_stat(DOC_LENGTH_MAX);
1373  need_stat(TOTAL_LENGTH);
1374  }
1375 
1376  std::string name() const;
1377 
1378  std::string serialise() const;
1379  DPHWeight * unserialise(const std::string & serialised) const;
1380 
1381  double get_sumpart(Xapian::termcount wdf,
1382  Xapian::termcount doclen,
1383  Xapian::termcount uniqterms) const;
1384  double get_maxpart() const;
1385 
1386  double get_sumextra(Xapian::termcount doclen,
1387  Xapian::termcount uniqterms) const;
1388  double get_maxextra() const;
1389 };
1390 
1391 
1403  type_smoothing select_smoothing;
1404 
1405  // Parameters for handling negative value of log, and for smoothing.
1406  double param_log, param_smoothing1, param_smoothing2;
1407 
1418 
1419  LMWeight * clone() const;
1420 
1421  void init(double factor);
1422 
1423  public:
1455  // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing.
/// Construct a LMWeight.
///
/// A negative smoothing parameter means "use the built-in default"; the
/// -1.0 default arguments therefore select those defaults.
///
/// @param param_log_          Parameter for handling negative values of the
///                            log; stored as given (default 0.0).
/// @param select_smoothing_   The type of smoothing to use (default
///                            TWO_STAGE_SMOOTHING).
/// @param param_smoothing1_   First smoothing parameter; if negative it is
///                            replaced by 0.7.
/// @param param_smoothing2_   Second smoothing parameter; if negative it is
///                            replaced by 2000.0 for TWO_STAGE_SMOOTHING and
///                            by 0.05 for every other smoothing type.
1456  explicit LMWeight(double param_log_ = 0.0,
1457  type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1458  double param_smoothing1_ = -1.0,
1459  double param_smoothing2_ = -1.0)
1460  : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1461  param_smoothing2(param_smoothing2_)
1462  {
1463  if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1464  if (param_smoothing2 < 0) {
1465  if (select_smoothing == TWO_STAGE_SMOOTHING)
1466  param_smoothing2 = 2000.0;
1467  else
1468  param_smoothing2 = 0.05;
1469  }
// Statistics requested for every smoothing type.
1470  need_stat(DOC_LENGTH);
1471  need_stat(RSET_SIZE);
1472  need_stat(TERMFREQ);
1473  need_stat(RELTERMFREQ);
1474  need_stat(DOC_LENGTH_MAX);
1475  need_stat(WDF);
1476  need_stat(WDF_MAX);
1477  need_stat(COLLECTION_FREQ);
1478  need_stat(TOTAL_LENGTH);
// Extra statistics requested only for the smoothing types that are checked for.
1479  if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1480  need_stat(UNIQUE_TERMS);
1481  if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1482  need_stat(DOC_LENGTH_MIN);
1483  }
1484 
1485  std::string name() const;
1486 
1487  std::string serialise() const;
1488  LMWeight * unserialise(const std::string & serialised) const;
1489 
1490  double get_sumpart(Xapian::termcount wdf,
1491  Xapian::termcount doclen,
1492  Xapian::termcount uniqterm) const;
1493  double get_maxpart() const;
1494 
1495  double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
1496  double get_maxextra() const;
1497 };
1498 
1506  double factor;
1507 
1508  public:
1509  CoordWeight * clone() const;
1510 
1511  void init(double factor_);
1512 
1515 
1516  std::string name() const;
1517 
1518  std::string serialise() const;
1519  CoordWeight * unserialise(const std::string & serialised) const;
1520 
1521  double get_sumpart(Xapian::termcount wdf,
1522  Xapian::termcount doclen,
1523  Xapian::termcount uniqterm) const;
1524  double get_maxpart() const;
1525 
1526  double get_sumextra(Xapian::termcount, Xapian::termcount) const;
1527  double get_maxextra() const;
1528 };
1529 
1530 }
1531 
1532 #endif // XAPIAN_INCLUDED_WEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
type_smoothing select_smoothing
The type of smoothing to use.
Definition: weight.h:1403
double factor
The factor to multiply weights by.
Definition: weight.h:1254
Xapian::doccount termfreq_
The number of documents which this term indexes.
Definition: weight.h:115
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Definition: weight.h:1260
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:363
std::string normalizations
Definition: weight.h:450
typedefs for Xapian
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the database.
Definition: weight.h:130
double factor
The factor to multiply with the weight.
Definition: weight.h:453
double upper_bound
The upper bound on the weight.
Definition: weight.h:1263
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:378
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1187
double B_constant
Definition: weight.h:1058
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1257
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:906
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the database.
Definition: weight.h:133
double stirling_constant_1
Definition: weight.h:1060
double upper_bound
The upper bound on the weight.
Definition: weight.h:1199
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:546
bool get_sumpart_needs_wdf_() const
Definition: weight.h:331
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1051
This class implements the InL2 weighting scheme.
Definition: weight.h:833
double param_k
The parameter in the formula.
Definition: weight.h:771
double dw
Weight contribution of delta term in the PL2+ function.
Definition: weight.h:1275
bool get_sumpart_needs_uniqueterms_() const
Definition: weight.h:341
bool is_bool_weight_() const
Definition: weight.h:346
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1252
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
Definition: weight.h:841
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Definition: weight.h:103
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
Definition: weight.h:413
bool get_sumpart_needs_doclength_() const
Definition: weight.h:322
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
Definition: weight.h:1127
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28
double upper_bound
The upper bound on the weight.
Definition: weight.h:1350
double lower_bound
The factor to multiply weights by.
Definition: weight.h:1196
DPHWeight()
Construct a DPHWeight.
Definition: weight.h:1365
Xapian::doccount collection_size_
The number of documents in the collection.
Definition: weight.h:106
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term.
Definition: weight.h:136
CoordWeight()
Construct a CoordWeight.
Definition: weight.h:1514
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
Definition: weight.h:509
double param_k3
Definition: weight.h:543
This class implements the BB2 weighting scheme.
Definition: weight.h:1049
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
Definition: weight.h:147
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::termcount wqf_
The within-query-frequency of this term.
Definition: weight.h:127
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1504
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:979
double c_product_avlen
Definition: weight.h:913
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
Definition: weight.h:400
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Definition: weight.h:912
double c_product_avlen
Definition: weight.h:842
Class implementing a "boolean" weighting scheme.
Definition: weight.h:422
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:650
Define XAPIAN_VISIBILITY_* macros.
double doclength
A normalised document length.
Definition: types.h:59
double stirling_constant_2
Definition: weight.h:1061
Xapian::termcount collectionfreq_
Definition: weight.h:118
Class to hold statistics for a given collection.
Xapian::doclength average_length_
The average length of a document in the collection.
Definition: weight.h:112
double termweight
Factor combining all the document independent factors.
Definition: weight.h:644
double mean
Set by init() to (get_collection_freq() / get_collection_size())
Definition: weight.h:1272
double termweight
Factor combining all the document independent factors.
Definition: weight.h:768
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Definition: weight.h:38
Xapian::termcount get_query_length() const
The length of the query.
Definition: weight.h:381
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:765
BoolWeight()
Construct a BoolWeight.
Definition: weight.h:429
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:763
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence from Randomness framework.
Definition: weight.h:1125
This class implements the PL2 weighting scheme.
Definition: weight.h:1185
Xapian::doccount rset_size_
The number of documents marked as relevant.
Definition: weight.h:109
LMWeight(double param_log_=0.0, type_smoothing select_smoothing_=TWO_STAGE_SMOOTHING, double param_smoothing1_=-1.0, double param_smoothing2_=-1.0)
Construct a LMWeight.
Definition: weight.h:1456
This class implements the IneB2 weighting scheme.
Definition: weight.h:977
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:384
double upper_bound
The upper bound on the weight a term can give to a document.
Definition: weight.h:838
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Definition: weight.h:366
Xapian::termcount query_length_
The length of the query.
Definition: weight.h:124
double upper_bound
The upper bound on the weight.
Definition: weight.h:1054
double log_constant
The constant value to be used in get_sumpart().
Definition: weight.h:1133
double wqf_product_factor
Definition: weight.h:1357
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the database.
Definition: weight.h:390
double B_constant
Definition: weight.h:914
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
Definition: weight.h:580
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
Definition: weight.h:1353
TradWeight(double k=1.0)
Construct a TradWeight.
Definition: weight.h:785
This class implements the IfB2 weighting scheme.
Definition: weight.h:904
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:372
double wqf_product_factor
Definition: weight.h:1134
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:369
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
double termweight
Factor combining all the document independent factors.
Definition: weight.h:540
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1205
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:641
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:835
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1269
This class implements the DPH weighting scheme.
Definition: weight.h:1348
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Definition: weight.h:375
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:537
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:83
double weight_collection
The factor to multiply weights by.
Definition: weight.h:1417
double upper_bound
The upper bound on the weight.
Definition: weight.h:1130
double log_constant
The constant value used in get_sumpart().
Definition: weight.h:1356
double c_product_avlen
The constant values to be used in get_sumpart().
Definition: weight.h:1057
double param_delta
Additional parameter delta in the BM25+ formula.
Definition: weight.h:653
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Definition: weight.h:694
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1401
Weight()
Default constructor, needed by subclass constructors.
Definition: weight.h:141
double c_product_avlen
Definition: weight.h:986
double upper_bound
The upper bound on the weight.
Definition: weight.h:909
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term.
Definition: weight.h:408
double wqf_product_idf
Constant values used in get_sumpart().
Definition: weight.h:985
double upper_bound
The upper bound of the weight.
Definition: weight.h:982
double factor
The factor to multiply weights by.
Definition: weight.h:1506
double B_constant
Definition: weight.h:987
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:639
double param_smoothing2
Definition: weight.h:1406
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:535
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Definition: weight.h:121
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:447
Abstract base class for weighting schemes.
Definition: weight.h:35