xapian-core  1.4.20
weight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016,2019 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  * Copyright (C) 2013,2014 Aarsh Shah
7  * Copyright (C) 2016 Vivek Pal
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #ifndef XAPIAN_INCLUDED_WEIGHT_H
25 #define XAPIAN_INCLUDED_WEIGHT_H
26 
27 #include <string>
28 
29 #include <xapian/types.h>
30 #include <xapian/visibility.h>
31 
32 namespace Xapian {
33 
36  protected:
// Stats which the weighting scheme can use (see need_stat()).
//
// Every individual flag is a distinct power of two, so several flags can be
// OR-ed together into one bitmask (this is how stats_needed is built up).
38  typedef enum {
40  COLLECTION_SIZE = 1,
42  RSET_SIZE = 2,
44  AVERAGE_LENGTH = 4,
46  TERMFREQ = 8,
48  RELTERMFREQ = 16,
50  QUERY_LENGTH = 32,
52  WQF = 64,
54  WDF = 128,
56  DOC_LENGTH = 256,
58  DOC_LENGTH_MIN = 512,
60  DOC_LENGTH_MAX = 1024,
62  WDF_MAX = 2048,
64  COLLECTION_FREQ = 4096,
66  UNIQUE_TERMS = 8192,
// Composite flag: sets both the COLLECTION_SIZE and AVERAGE_LENGTH bits,
// the two values get_total_length() multiplies together.
71  TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
72  } stat_flags;
73 
// Tell Xapian that your subclass will want a particular statistic.
//
// The flag is OR-ed into stats_needed, so repeated calls accumulate.  The
// explicit stat_flags(...) conversion is required because OR-ing two
// unscoped enum values produces an integer rather than a stat_flags.
83  void need_stat(stat_flags flag) {
84  stats_needed = stat_flags(stats_needed | flag);
85  }
86 
96  virtual void init(double factor) = 0;
97 
98  private:
100  void operator=(const Weight &);
101 
103  stat_flags stats_needed;
104 
107 
110 
113 
116 
117  // The collection frequency of the term.
119 
122 
125 
128 
131 
134 
137 
138  public:
139 
// Default constructor, needed by subclass constructors.  stats_needed is
// value-initialised, i.e. no statistic bits are set until need_stat() is
// called.
141  Weight() : stats_needed() { }
142 
// Type of smoothing to use with the Language Model Weighting scheme.
//
// The values are consecutive identifiers (1..5), not bit flags; exactly one
// is passed as the select_smoothing_ argument of the LMWeight constructor.
147  typedef enum {
148  TWO_STAGE_SMOOTHING = 1,
149  DIRICHLET_SMOOTHING = 2,
150  ABSOLUTE_DISCOUNT_SMOOTHING = 3,
151  JELINEK_MERCER_SMOOTHING = 4,
152  DIRICHLET_PLUS_SMOOTHING = 5
153  } type_smoothing;
154 
155  class Internal;
156 
158  virtual ~Weight();
159 
176  virtual Weight * clone() const = 0;
177 
191  virtual std::string name() const;
192 
198  virtual std::string serialise() const;
199 
217  virtual Weight * unserialise(const std::string & serialised) const;
218 
229  virtual double get_sumpart(Xapian::termcount wdf,
230  Xapian::termcount doclen,
231  Xapian::termcount uniqterms) const = 0;
232 
238  virtual double get_maxpart() const = 0;
239 
248  virtual double get_sumextra(Xapian::termcount doclen,
249  Xapian::termcount uniqterms) const = 0;
250 
257  virtual double get_maxextra() const = 0;
258 
273  void init_(const Internal & stats, Xapian::termcount query_len_,
274  const std::string & term, Xapian::termcount wqf_,
275  double factor);
276 
290  void init_(const Internal & stats, Xapian::termcount query_len_,
291  const std::string & term, Xapian::termcount wqf_,
292  double factor, void* postlist);
293 
304  void init_(const Internal & stats, Xapian::termcount query_len_,
305  double factor, Xapian::doccount termfreq,
306  Xapian::doccount reltermfreq, Xapian::termcount collection_freq);
307 
314  void init_(const Internal & stats, Xapian::termcount query_len_);
315 
323  return stats_needed & DOC_LENGTH;
324  }
325 
// Returns true iff the WDF bit is set in stats_needed, i.e. the WDF
// statistic has been requested through need_stat().
331  bool get_sumpart_needs_wdf_() const {
332  return stats_needed & WDF;
333  }
334 
342  return stats_needed & UNIQUE_TERMS;
343  }
344 
// Returns true only when no statistics have been requested AND name()
// reports "Xapian::BoolWeight".  The stats_needed test comes first so that
// && short-circuits and name() is not called for objects which requested
// at least one statistic.
346  bool is_bool_weight_() const {
347  // Checking the name isn't ideal, but (get_maxpart() == 0.0) isn't
348  // required to work without init() having been called. We can at
349  // least avoid the virtual method call in most non-BoolWeight cases
350  // as most other classes will need at least some stats.
351  return stats_needed == 0 && name() == "Xapian::BoolWeight";
352  }
353 
354  protected:
360  Weight(const Weight &);
361 
// The number of documents in the collection (returns collection_size_).
363  Xapian::doccount get_collection_size() const { return collection_size_; }
364 
// The number of documents marked as relevant (returns rset_size_).
366  Xapian::doccount get_rset_size() const { return rset_size_; }
367 
// The average length of a document in the collection (returns
// average_length_).
369  Xapian::doclength get_average_length() const { return average_length_; }
370 
// The number of documents which this term indexes (returns termfreq_).
372  Xapian::doccount get_termfreq() const { return termfreq_; }
373 
// The number of relevant documents which this term indexes (returns
// reltermfreq_).
375  Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
376 
// The collection frequency of the term (returns collectionfreq_).
378  Xapian::termcount get_collection_freq() const { return collectionfreq_; }
379 
// The length of the query (returns query_length_).
381  Xapian::termcount get_query_length() const { return query_length_; }
382 
// The within-query-frequency of this term (returns wqf_).
384  Xapian::termcount get_wqf() const { return wqf_; }
385 
391  return doclength_upper_bound_;
392  }
393 
401  return doclength_lower_bound_;
402  }
403 
409  return wdf_upper_bound_;
410  }
411 
414  return Xapian::totallength(average_length_ * collection_size_ + 0.5);
415  }
416 };
417 
423  BoolWeight * clone() const;
424 
425  void init(double factor);
426 
427  public:
430 
431  std::string name() const;
432 
433  std::string serialise() const;
434  BoolWeight * unserialise(const std::string & serialised) const;
435 
436  double get_sumpart(Xapian::termcount wdf,
437  Xapian::termcount doclen,
438  Xapian::termcount uniqterms) const;
439  double get_maxpart() const;
440 
441  double get_sumextra(Xapian::termcount doclen,
442  Xapian::termcount uniqterms) const;
443  double get_maxextra() const;
444 };
445 
448  /* Three character string indicating the normalizations for tf(wdf), idf and
449  tfidf weight. */
450  std::string normalizations;
451 
453  double factor;
454 
455  TfIdfWeight * clone() const;
456 
457  void init(double factor);
458 
459  /* When additional normalizations are implemented in the future, the additional statistics for them
460  should be accessed by these functions. */
461  double get_wdfn(Xapian::termcount wdf, char c) const;
462  double get_idfn(Xapian::doccount termfreq, char c) const;
463  double get_wtn(double wt, char c) const;
464 
465  public:
506  explicit TfIdfWeight(const std::string &normalizations);
507 
510  : normalizations("ntn")
511  {
512  need_stat(TERMFREQ);
513  need_stat(WDF);
514  need_stat(WDF_MAX);
515  need_stat(COLLECTION_SIZE);
516  }
517 
518  std::string name() const;
519 
520  std::string serialise() const;
521  TfIdfWeight * unserialise(const std::string & serialised) const;
522 
523  double get_sumpart(Xapian::termcount wdf,
524  Xapian::termcount doclen,
525  Xapian::termcount uniqterm) const;
526  double get_maxpart() const;
527 
528  double get_sumextra(Xapian::termcount doclen,
529  Xapian::termcount uniqterms) const;
530  double get_maxextra() const;
531 };
532 
533 
538 
540  mutable double termweight;
541 
543  double param_k1, param_k2, param_k3, param_b;
544 
547 
548  BM25Weight * clone() const;
549 
550  void init(double factor);
551 
552  public:
// Construct a BM25Weight from explicit parameters.
//
// k1, k2, k3 and b are sanitised after being stored: negative k1/k2/k3
// become 0 and b is clamped to the range [0, 1].  min_normlen is stored
// unmodified.  The statistics requested afterwards depend on the sanitised
// values, so the need_stat() calls must stay after the clamping.
580  BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
581  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
582  param_min_normlen(min_normlen)
583  {
584  if (param_k1 < 0) param_k1 = 0;
585  if (param_k2 < 0) param_k2 = 0;
586  if (param_k3 < 0) param_k3 = 0;
587  if (param_b < 0) {
588  param_b = 0;
589  } else if (param_b > 1) {
590  param_b = 1;
591  }
// These six statistics are requested unconditionally.
592  need_stat(COLLECTION_SIZE);
593  need_stat(RSET_SIZE);
594  need_stat(TERMFREQ);
595  need_stat(RELTERMFREQ);
596  need_stat(WDF);
597  need_stat(WDF_MAX);
// The remaining statistics are only requested when the parameters tested
// in each condition are non-zero.
598  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
599  need_stat(DOC_LENGTH_MIN);
600  need_stat(AVERAGE_LENGTH);
601  }
602  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
603  if (param_k2 != 0) need_stat(QUERY_LENGTH);
604  if (param_k3 != 0) need_stat(WQF);
605  }
606 
608  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
609  param_min_normlen(0.5)
610  {
611  need_stat(COLLECTION_SIZE);
612  need_stat(RSET_SIZE);
613  need_stat(TERMFREQ);
614  need_stat(RELTERMFREQ);
615  need_stat(WDF);
616  need_stat(WDF_MAX);
617  need_stat(DOC_LENGTH_MIN);
618  need_stat(AVERAGE_LENGTH);
619  need_stat(DOC_LENGTH);
620  need_stat(WQF);
621  }
622 
623  std::string name() const;
624 
625  std::string serialise() const;
626  BM25Weight * unserialise(const std::string & serialised) const;
627 
628  double get_sumpart(Xapian::termcount wdf,
629  Xapian::termcount doclen,
630  Xapian::termcount uniqterm) const;
631  double get_maxpart() const;
632 
633  double get_sumextra(Xapian::termcount doclen,
634  Xapian::termcount uniqterms) const;
635  double get_maxextra() const;
636 };
637 
642 
644  mutable double termweight;
645 
647  double param_k1, param_k2, param_k3, param_b;
648 
651 
653  double param_delta;
654 
655  BM25PlusWeight * clone() const;
656 
657  void init(double factor);
658 
659  public:
// Construct a BM25PlusWeight from explicit parameters.
//
// k1, k2, k3, delta and b are sanitised after being stored: negative
// k1/k2/k3/delta become 0 and b is clamped to the range [0, 1].
// min_normlen is stored unmodified.  The statistics requested afterwards
// depend on the sanitised values, so the need_stat() calls must stay after
// the clamping.
694  BM25PlusWeight(double k1, double k2, double k3, double b,
695  double min_normlen, double delta)
696  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
697  param_min_normlen(min_normlen), param_delta(delta)
698  {
699  if (param_k1 < 0) param_k1 = 0;
700  if (param_k2 < 0) param_k2 = 0;
701  if (param_k3 < 0) param_k3 = 0;
702  if (param_delta < 0) param_delta = 0;
703  if (param_b < 0) {
704  param_b = 0;
705  } else if (param_b > 1) {
706  param_b = 1;
707  }
// These six statistics are requested unconditionally.
708  need_stat(COLLECTION_SIZE);
709  need_stat(RSET_SIZE);
710  need_stat(TERMFREQ);
711  need_stat(RELTERMFREQ);
712  need_stat(WDF);
713  need_stat(WDF_MAX);
// The remaining statistics are only requested when the parameters tested
// in each condition are non-zero.
714  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
715  need_stat(DOC_LENGTH_MIN);
716  need_stat(AVERAGE_LENGTH);
717  }
718  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
719  if (param_k2 != 0) need_stat(QUERY_LENGTH);
720  if (param_k3 != 0) need_stat(WQF);
// A non-zero delta additionally requests these three statistics; a repeat
// request is harmless because need_stat() only ORs bits together.
721  if (param_delta != 0) {
722  need_stat(AVERAGE_LENGTH);
723  need_stat(DOC_LENGTH);
724  need_stat(WQF);
725  }
726  }
727 
729  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
730  param_min_normlen(0.5), param_delta(1)
731  {
732  need_stat(COLLECTION_SIZE);
733  need_stat(RSET_SIZE);
734  need_stat(TERMFREQ);
735  need_stat(RELTERMFREQ);
736  need_stat(WDF);
737  need_stat(WDF_MAX);
738  need_stat(DOC_LENGTH_MIN);
739  need_stat(AVERAGE_LENGTH);
740  need_stat(DOC_LENGTH);
741  need_stat(WQF);
742  }
743 
744  std::string name() const;
745 
746  std::string serialise() const;
747  BM25PlusWeight * unserialise(const std::string & serialised) const;
748 
749  double get_sumpart(Xapian::termcount wdf,
750  Xapian::termcount doclen,
751  Xapian::termcount uniqterm) const;
752  double get_maxpart() const;
753 
754  double get_sumextra(Xapian::termcount doclen,
755  Xapian::termcount uniqterms) const;
756  double get_maxextra() const;
757 };
758 
771 
773  mutable double termweight;
774 
776  double param_k;
777 
778  TradWeight * clone() const;
779 
780  void init(double factor);
781 
782  public:
// Construct a TradWeight.
//
// k is the parameter in the formula and defaults to 1.0; a negative value
// is replaced by 0.  AVERAGE_LENGTH and DOC_LENGTH are only requested when
// the (sanitised) k is non-zero; the other statistics are always requested.
790  explicit TradWeight(double k = 1.0) : param_k(k) {
791  if (param_k < 0) param_k = 0;
792  if (param_k != 0.0) {
793  need_stat(AVERAGE_LENGTH);
794  need_stat(DOC_LENGTH);
795  }
796  need_stat(COLLECTION_SIZE);
797  need_stat(RSET_SIZE);
798  need_stat(TERMFREQ);
799  need_stat(RELTERMFREQ);
800  need_stat(DOC_LENGTH_MIN);
801  need_stat(WDF);
802  need_stat(WDF_MAX);
803  }
804 
805  std::string name() const;
806 
807  std::string serialise() const;
808  TradWeight * unserialise(const std::string & serialised) const;
809 
810  double get_sumpart(Xapian::termcount wdf,
811  Xapian::termcount doclen,
812  Xapian::termcount uniqueterms) const;
813  double get_maxpart() const;
814 
815  double get_sumextra(Xapian::termcount doclen,
816  Xapian::termcount uniqterms) const;
817  double get_maxextra() const;
818 };
819 
840  double param_c;
841 
843  double upper_bound;
844 
848 
849  InL2Weight * clone() const;
850 
851  void init(double factor);
852 
853  public:
862  explicit InL2Weight(double c);
863 
865  : param_c(1.0)
866  {
867  need_stat(AVERAGE_LENGTH);
868  need_stat(DOC_LENGTH);
869  need_stat(DOC_LENGTH_MIN);
870  need_stat(DOC_LENGTH_MAX);
871  need_stat(COLLECTION_SIZE);
872  need_stat(WDF);
873  need_stat(WDF_MAX);
874  need_stat(WQF);
875  need_stat(TERMFREQ);
876  }
877 
878  std::string name() const;
879 
880  std::string serialise() const;
881  InL2Weight * unserialise(const std::string & serialised) const;
882 
883  double get_sumpart(Xapian::termcount wdf,
884  Xapian::termcount doclen,
885  Xapian::termcount uniqterms) const;
886  double get_maxpart() const;
887 
888  double get_sumextra(Xapian::termcount doclen,
889  Xapian::termcount uniqterms) const;
890  double get_maxextra() const;
891 };
892 
911  double param_c;
912 
914  double upper_bound;
915 
919  double B_constant;
920 
921  IfB2Weight * clone() const;
922 
923  void init(double factor);
924 
925  public:
936  explicit IfB2Weight(double c);
937 
// Default constructor: sets param_c (the wdf normalization parameter in the
// formula) to 1.0 and unconditionally requests every statistic listed
// below.  No other data member is initialised here.
938  IfB2Weight() : param_c(1.0) {
939  need_stat(AVERAGE_LENGTH);
940  need_stat(DOC_LENGTH);
941  need_stat(DOC_LENGTH_MIN);
942  need_stat(DOC_LENGTH_MAX);
943  need_stat(COLLECTION_SIZE);
944  need_stat(COLLECTION_FREQ);
945  need_stat(WDF);
946  need_stat(WDF_MAX);
947  need_stat(WQF);
948  need_stat(TERMFREQ);
949  }
950 
951  std::string name() const;
952 
953  std::string serialise() const;
954  IfB2Weight * unserialise(const std::string & serialised) const;
955 
956  double get_sumpart(Xapian::termcount wdf,
957  Xapian::termcount doclen,
958  Xapian::termcount uniqterm) const;
959  double get_maxpart() const;
960 
961  double get_sumextra(Xapian::termcount doclen,
962  Xapian::termcount uniqterms) const;
963  double get_maxextra() const;
964 };
965 
984  double param_c;
985 
987  double upper_bound;
988 
992  double B_constant;
993 
994  IneB2Weight * clone() const;
995 
996  void init(double factor);
997 
998  public:
1007  explicit IneB2Weight(double c);
1008 
// Default constructor: sets param_c (the wdf normalization parameter in the
// formula) to 1.0 and unconditionally requests every statistic listed
// below.  No other data member is initialised here.
1009  IneB2Weight() : param_c(1.0) {
1010  need_stat(AVERAGE_LENGTH);
1011  need_stat(DOC_LENGTH);
1012  need_stat(DOC_LENGTH_MIN);
1013  need_stat(DOC_LENGTH_MAX);
1014  need_stat(COLLECTION_SIZE);
1015  need_stat(WDF);
1016  need_stat(WDF_MAX);
1017  need_stat(WQF);
1018  need_stat(COLLECTION_FREQ);
1019  need_stat(TERMFREQ);
1020  }
1021 
1022  std::string name() const;
1023 
1024  std::string serialise() const;
1025  IneB2Weight * unserialise(const std::string & serialised) const;
1026 
1027  double get_sumpart(Xapian::termcount wdf,
1028  Xapian::termcount doclen,
1029  Xapian::termcount uniqterms) const;
1030  double get_maxpart() const;
1031 
1032  double get_sumextra(Xapian::termcount doclen,
1033  Xapian::termcount uniqterms) const;
1034  double get_maxextra() const;
1035 };
1036 
1056  double param_c;
1057 
1059  double upper_bound;
1060 
1063  double B_constant;
1064  double wt;
1067 
1068  BB2Weight * clone() const;
1069 
1070  void init(double factor);
1071 
1072  public:
1083  explicit BB2Weight(double c);
1084 
// Default constructor: sets param_c (the wdf normalization parameter in the
// formula) to 1.0 and unconditionally requests every statistic listed
// below.  No other data member is initialised here.
1085  BB2Weight() : param_c(1.0) {
1086  need_stat(AVERAGE_LENGTH);
1087  need_stat(DOC_LENGTH);
1088  need_stat(DOC_LENGTH_MIN);
1089  need_stat(DOC_LENGTH_MAX);
1090  need_stat(COLLECTION_SIZE);
1091  need_stat(COLLECTION_FREQ);
1092  need_stat(WDF);
1093  need_stat(WDF_MAX);
1094  need_stat(WQF);
1095  need_stat(TERMFREQ);
1096  }
1097 
1098  std::string name() const;
1099 
1100  std::string serialise() const;
1101  BB2Weight * unserialise(const std::string & serialised) const;
1102 
1103  double get_sumpart(Xapian::termcount wdf,
1104  Xapian::termcount doclen,
1105  Xapian::termcount uniqterms) const;
1106  double get_maxpart() const;
1107 
1108  double get_sumextra(Xapian::termcount doclen,
1109  Xapian::termcount uniqterms) const;
1110  double get_maxextra() const;
1111 };
1112 
1132  double lower_bound;
1133 
1135  double upper_bound;
1136 
1140 
1141  DLHWeight * clone() const;
1142 
1143  void init(double factor);
1144 
1145  public:
1147  need_stat(DOC_LENGTH);
1148  need_stat(COLLECTION_FREQ);
1149  need_stat(WDF);
1150  need_stat(WQF);
1151  need_stat(WDF_MAX);
1152  need_stat(DOC_LENGTH_MIN);
1153  need_stat(DOC_LENGTH_MAX);
1154  need_stat(TOTAL_LENGTH);
1155  }
1156 
1157  std::string name() const;
1158 
1159  std::string serialise() const;
1160  DLHWeight * unserialise(const std::string & serialised) const;
1161 
1162  double get_sumpart(Xapian::termcount wdf,
1163  Xapian::termcount doclen,
1164  Xapian::termcount uniqterms) const;
1165  double get_maxpart() const;
1166 
1167  double get_sumextra(Xapian::termcount doclen,
1168  Xapian::termcount uniqterms) const;
1169  double get_maxextra() const;
1170 };
1171 
1192  double param_c;
1193 
1201  double lower_bound;
1202 
1204  double upper_bound;
1205 
1207  double P1, P2;
1208 
1210  double cl;
1211 
1212  PL2Weight * clone() const;
1213 
1214  void init(double factor);
1215 
1216  public:
1227  explicit PL2Weight(double c);
1228 
// Default constructor: sets param_c (the wdf normalization parameter in the
// formula) to 1.0 and unconditionally requests every statistic listed
// below.  TERMFREQ is not among them.  No other data member is initialised
// here.
1229  PL2Weight() : param_c(1.0) {
1230  need_stat(AVERAGE_LENGTH);
1231  need_stat(DOC_LENGTH);
1232  need_stat(DOC_LENGTH_MIN);
1233  need_stat(DOC_LENGTH_MAX);
1234  need_stat(COLLECTION_SIZE);
1235  need_stat(COLLECTION_FREQ);
1236  need_stat(WDF);
1237  need_stat(WDF_MAX);
1238  need_stat(WQF);
1239  }
1240 
1241  std::string name() const;
1242 
1243  std::string serialise() const;
1244  PL2Weight * unserialise(const std::string & serialised) const;
1245 
1246  double get_sumpart(Xapian::termcount wdf,
1247  Xapian::termcount doclen,
1248  Xapian::termcount uniqterms) const;
1249  double get_maxpart() const;
1250 
1251  double get_sumextra(Xapian::termcount doclen,
1252  Xapian::termcount uniqterms) const;
1253  double get_maxextra() const;
1254 };
1255 
1259  double factor;
1260 
1262  double param_c;
1263 
1265  double param_delta;
1266 
1268  double upper_bound;
1269 
1271  double P1, P2;
1272 
1274  double cl;
1275 
1277  double mean;
1278 
1280  double dw;
1281 
1282  PL2PlusWeight * clone() const;
1283 
1284  void init(double factor_);
1285 
1286  public:
1304  PL2PlusWeight(double c, double delta);
1305 
1307  : param_c(1.0), param_delta(0.8) {
1308  need_stat(AVERAGE_LENGTH);
1309  need_stat(DOC_LENGTH);
1310  need_stat(DOC_LENGTH_MIN);
1311  need_stat(DOC_LENGTH_MAX);
1312  need_stat(COLLECTION_SIZE);
1313  need_stat(COLLECTION_FREQ);
1314  need_stat(WDF);
1315  need_stat(WDF_MAX);
1316  need_stat(WQF);
1317  }
1318 
1319  std::string name() const;
1320 
1321  std::string serialise() const;
1322  PL2PlusWeight * unserialise(const std::string & serialised) const;
1323 
1324  double get_sumpart(Xapian::termcount wdf,
1325  Xapian::termcount doclen,
1326  Xapian::termcount uniqterms) const;
1327  double get_maxpart() const;
1328 
1329  double get_sumextra(Xapian::termcount doclen,
1330  Xapian::termcount uniqterms) const;
1331  double get_maxextra() const;
1332 };
1333 
1355  double upper_bound;
1356 
1358  double lower_bound;
1359 
1363 
1364  DPHWeight * clone() const;
1365 
1366  void init(double factor);
1367 
1368  public:
1371  need_stat(DOC_LENGTH);
1372  need_stat(COLLECTION_FREQ);
1373  need_stat(WDF);
1374  need_stat(WQF);
1375  need_stat(WDF_MAX);
1376  need_stat(DOC_LENGTH_MIN);
1377  need_stat(DOC_LENGTH_MAX);
1378  need_stat(TOTAL_LENGTH);
1379  }
1380 
1381  std::string name() const;
1382 
1383  std::string serialise() const;
1384  DPHWeight * unserialise(const std::string & serialised) const;
1385 
1386  double get_sumpart(Xapian::termcount wdf,
1387  Xapian::termcount doclen,
1388  Xapian::termcount uniqterms) const;
1389  double get_maxpart() const;
1390 
1391  double get_sumextra(Xapian::termcount doclen,
1392  Xapian::termcount uniqterms) const;
1393  double get_maxextra() const;
1394 };
1395 
1396 
1408  type_smoothing select_smoothing;
1409 
1410  // Parameters for handling negative value of log, and for smoothing.
1411  double param_log, param_smoothing1, param_smoothing2;
1412 
1423 
1424  LMWeight * clone() const;
1425 
1426  void init(double factor);
1427 
1428  public:
1460  // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing.
// Construct a LMWeight.
//
// param_log_        stored unmodified in param_log.
// select_smoothing_ the type of smoothing to use; defaults to
//                   TWO_STAGE_SMOOTHING.
// param_smoothing1_ / param_smoothing2_
//                   smoothing parameters.  Any negative value (including
//                   the -1.0 defaults) means "use the built-in default",
//                   which is filled in below.
1461  explicit LMWeight(double param_log_ = 0.0,
1462  type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1463  double param_smoothing1_ = -1.0,
1464  double param_smoothing2_ = -1.0)
1465  : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1466  param_smoothing2(param_smoothing2_)
1467  {
// Replace negative placeholders with defaults: 0.7 for the first parameter;
// for the second, 2000.0 under two-stage smoothing and 0.05 otherwise.
1468  if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1469  if (param_smoothing2 < 0) {
1470  if (select_smoothing == TWO_STAGE_SMOOTHING)
1471  param_smoothing2 = 2000.0;
1472  else
1473  param_smoothing2 = 0.05;
1474  }
// Statistics requested for every smoothing type.
1475  need_stat(DOC_LENGTH);
1476  need_stat(RSET_SIZE);
1477  need_stat(TERMFREQ);
1478  need_stat(RELTERMFREQ);
1479  need_stat(DOC_LENGTH_MAX);
1480  need_stat(WDF);
1481  need_stat(WDF_MAX);
1482  need_stat(COLLECTION_FREQ);
1483  need_stat(TOTAL_LENGTH);
// Statistics requested only for one specific smoothing type each.
1484  if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1485  need_stat(UNIQUE_TERMS);
1486  if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1487  need_stat(DOC_LENGTH_MIN);
1488  }
1489 
1490  std::string name() const;
1491 
1492  std::string serialise() const;
1493  LMWeight * unserialise(const std::string & serialised) const;
1494 
1495  double get_sumpart(Xapian::termcount wdf,
1496  Xapian::termcount doclen,
1497  Xapian::termcount uniqterm) const;
1498  double get_maxpart() const;
1499 
1500  double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
1501  double get_maxextra() const;
1502 };
1503 
1511  double factor;
1512 
1513  public:
1514  CoordWeight * clone() const;
1515 
1516  void init(double factor_);
1517 
1520 
1521  std::string name() const;
1522 
1523  std::string serialise() const;
1524  CoordWeight * unserialise(const std::string & serialised) const;
1525 
1526  double get_sumpart(Xapian::termcount wdf,
1527  Xapian::termcount doclen,
1528  Xapian::termcount uniqterm) const;
1529  double get_maxpart() const;
1530 
1531  double get_sumextra(Xapian::termcount, Xapian::termcount) const;
1532  double get_maxextra() const;
1533 };
1534 
1535 }
1536 
1537 #endif // XAPIAN_INCLUDED_WEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
type_smoothing select_smoothing
The type of smoothing to use.
Definition: weight.h:1408
double factor
The factor to multiply weights by.
Definition: weight.h:1259
Xapian::doccount termfreq_
The number of documents which this term indexes.
Definition: weight.h:115
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Definition: weight.h:1265
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:363
std::string normalizations
Definition: weight.h:450
typedefs for Xapian
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the database.
Definition: weight.h:130
double factor
The factor to multiply with the weight.
Definition: weight.h:453
double upper_bound
The upper bound on the weight.
Definition: weight.h:1268
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:378
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1192
double B_constant
Definition: weight.h:1063
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1262
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:911
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the database.
Definition: weight.h:133
double stirling_constant_1
Definition: weight.h:1065
double upper_bound
The upper bound on the weight.
Definition: weight.h:1204
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:546
bool get_sumpart_needs_wdf_() const
Definition: weight.h:331
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1056
This class implements the InL2 weighting scheme.
Definition: weight.h:838
double param_k
The parameter in the formula.
Definition: weight.h:776
double dw
Weight contribution of delta term in the PL2+ function.
Definition: weight.h:1280
bool get_sumpart_needs_uniqueterms_() const
Definition: weight.h:341
bool is_bool_weight_() const
Definition: weight.h:346
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1257
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
Definition: weight.h:846
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Definition: weight.h:103
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
Definition: weight.h:413
bool get_sumpart_needs_doclength_() const
Definition: weight.h:322
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
Definition: weight.h:1132
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28
double upper_bound
The upper bound on the weight.
Definition: weight.h:1355
double lower_bound
The factor to multiply weights by.
Definition: weight.h:1201
DPHWeight()
Construct a DPHWeight.
Definition: weight.h:1370
Xapian::doccount collection_size_
The number of documents in the collection.
Definition: weight.h:106
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term.
Definition: weight.h:136
CoordWeight()
Construct a CoordWeight.
Definition: weight.h:1519
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
Definition: weight.h:509
double param_k3
Definition: weight.h:543
This class implements the BB2 weighting scheme.
Definition: weight.h:1054
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
Definition: weight.h:147
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::termcount wqf_
The within-query-frequency of this term.
Definition: weight.h:127
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1509
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:984
double c_product_avlen
Definition: weight.h:918
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the database.
Definition: weight.h:400
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Definition: weight.h:917
double c_product_avlen
Definition: weight.h:847
Class implementing a "boolean" weighting scheme.
Definition: weight.h:422
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:650
Define XAPIAN_VISIBILITY_* macros.
double doclength
A normalised document length.
Definition: types.h:59
double stirling_constant_2
Definition: weight.h:1066
Xapian::termcount collectionfreq_
Definition: weight.h:118
Class to hold statistics for a given collection.
Xapian::doclength average_length_
The average length of a document in the collection.
Definition: weight.h:112
double termweight
Factor combining all the document independent factors.
Definition: weight.h:644
double mean
Set by init() to (get_collection_freq() / get_collection_size())
Definition: weight.h:1277
double termweight
Factor combining all the document independent factors.
Definition: weight.h:773
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Definition: weight.h:38
Xapian::termcount get_query_length() const
The length of the query.
Definition: weight.h:381
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:770
BoolWeight()
Construct a BoolWeight.
Definition: weight.h:429
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:768
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence from Randomness Framework.
Definition: weight.h:1130
This class implements the PL2 weighting scheme.
Definition: weight.h:1190
Xapian::doccount rset_size_
The number of documents marked as relevant.
Definition: weight.h:109
LMWeight(double param_log_=0.0, type_smoothing select_smoothing_=TWO_STAGE_SMOOTHING, double param_smoothing1_=-1.0, double param_smoothing2_=-1.0)
Construct a LMWeight.
Definition: weight.h:1461
This class implements the IneB2 weighting scheme.
Definition: weight.h:982
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:384
double upper_bound
The upper bound on the weight a term can give to a document.
Definition: weight.h:843
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Definition: weight.h:366
Xapian::termcount query_length_
The length of the query.
Definition: weight.h:124
double upper_bound
The upper bound on the weight.
Definition: weight.h:1059
double log_constant
The constant value to be used in get_sumpart().
Definition: weight.h:1138
double wqf_product_factor
Definition: weight.h:1362
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the database.
Definition: weight.h:390
double B_constant
Definition: weight.h:919
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
Definition: weight.h:580
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
Definition: weight.h:1358
TradWeight(double k=1.0)
Construct a TradWeight.
Definition: weight.h:790
This class implements the IfB2 weighting scheme.
Definition: weight.h:909
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:372
double wqf_product_factor
Definition: weight.h:1139
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:369
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
double termweight
Factor combining all the document independent factors.
Definition: weight.h:540
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1210
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:641
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:840
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1274
This class implements the DPH weighting scheme.
Definition: weight.h:1353
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Definition: weight.h:375
Definition: quest.cc:110
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:537
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:83
double weight_collection
The factor to multiply weights by.
Definition: weight.h:1422
double upper_bound
The upper bound on the weight.
Definition: weight.h:1135
double log_constant
The constant value used in get_sumpart().
Definition: weight.h:1361
double c_product_avlen
The constant values to be used in get_sumpart().
Definition: weight.h:1062
double param_delta
Additional parameter delta in the BM25+ formula.
Definition: weight.h:653
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Definition: weight.h:694
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1406
Weight()
Default constructor, needed by subclass constructors.
Definition: weight.h:141
double c_product_avlen
Definition: weight.h:991
double upper_bound
The upper bound on the weight.
Definition: weight.h:914
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term.
Definition: weight.h:408
double wqf_product_idf
Constant values used in get_sumpart().
Definition: weight.h:990
double upper_bound
The upper bound of the weight.
Definition: weight.h:987
double factor
The factor to multiply weights by.
Definition: weight.h:1511
double B_constant
Definition: weight.h:992
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:639
double param_smoothing2
Definition: weight.h:1411
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:535
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Definition: weight.h:121
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:447
Abstract base class for weighting schemes.
Definition: weight.h:35