xapian-core  1.4.27
weight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016,2019 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  * Copyright (C) 2013,2014 Aarsh Shah
7  * Copyright (C) 2016 Vivek Pal
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #ifndef XAPIAN_INCLUDED_WEIGHT_H
25 #define XAPIAN_INCLUDED_WEIGHT_H
26 
27 #include <string>
28 
29 #include <xapian/types.h>
30 #include <xapian/visibility.h>
31 
32 namespace Xapian {
33 
36  protected:
// Stats which the weighting scheme can use (see need_stat()).
//
// Every flag except TOTAL_LENGTH is a distinct power of two, so flags can be
// OR-ed together into the stats_needed bitmask.
38  typedef enum {
40  COLLECTION_SIZE = 1,  // number of documents in the collection
42  RSET_SIZE = 2,  // number of documents marked as relevant
44  AVERAGE_LENGTH = 4,  // average length of a document in the collection
46  TERMFREQ = 8,  // number of documents which this term indexes
48  RELTERMFREQ = 16,  // number of relevant documents which this term indexes
50  QUERY_LENGTH = 32,  // length of the query
52  WQF = 64,  // within-query-frequency of this term
54  WDF = 128,  // tested by get_sumpart_needs_wdf_()
56  DOC_LENGTH = 256,  // tested by get_sumpart_needs_doclength_()
62  DOC_LENGTH_MIN = 512,  // lower bound on the minimum document length in the shard
68  DOC_LENGTH_MAX = 1024,  // upper bound on the maximum document length in the shard
74  WDF_MAX = 2048,  // upper bound on the wdf of this term in the shard
76  COLLECTION_FREQ = 4096,  // collection frequency of the term
78  UNIQUE_TERMS = 8192,  // tested by get_sumpart_needs_uniqueterms_()
// Composite flag: get_total_length() is computed from the average length and
// the collection size, so requesting it sets both of those bits.
82  TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
83  } stat_flags;
84 
// Tell Xapian that your subclass will want a particular statistic.
//
// The flag is OR-ed into the stats_needed bitmask, so calls accumulate and
// requesting the same statistic twice is harmless.
94  void need_stat(stat_flags flag) {
95  stats_needed = stat_flags(stats_needed | flag);
96  }
97 
107  virtual void init(double factor) = 0;
108 
109  private:
111  void operator=(const Weight &);
112 
114  stat_flags stats_needed;
115 
118 
121 
124 
127 
128  // The collection frequency of the term.
130 
133 
136 
139 
142 
145 
148 
149  public:
150 
// Default constructor, needed by subclass constructors.  stats_needed is
// value-initialised, so no statistics are requested until need_stat() is called.
152  Weight() : stats_needed() { }
153 
// Type of smoothing to use with the Language Model Weighting scheme.
// (The LMWeight constructor in this file takes one of these values and
// defaults to TWO_STAGE_SMOOTHING.)
158  typedef enum {
159  TWO_STAGE_SMOOTHING = 1,
160  DIRICHLET_SMOOTHING = 2,
161  ABSOLUTE_DISCOUNT_SMOOTHING = 3,
162  JELINEK_MERCER_SMOOTHING = 4,
163  DIRICHLET_PLUS_SMOOTHING = 5
164  } type_smoothing;
165 
166  class Internal;
167 
169  virtual ~Weight();
170 
187  virtual Weight * clone() const = 0;
188 
202  virtual std::string name() const;
203 
209  virtual std::string serialise() const;
210 
228  virtual Weight * unserialise(const std::string & serialised) const;
229 
240  virtual double get_sumpart(Xapian::termcount wdf,
241  Xapian::termcount doclen,
242  Xapian::termcount uniqterms) const = 0;
243 
249  virtual double get_maxpart() const = 0;
250 
259  virtual double get_sumextra(Xapian::termcount doclen,
260  Xapian::termcount uniqterms) const = 0;
261 
268  virtual double get_maxextra() const = 0;
269 
284  void init_(const Internal & stats, Xapian::termcount query_len_,
285  const std::string & term, Xapian::termcount wqf_,
286  double factor);
287 
301  void init_(const Internal & stats, Xapian::termcount query_len_,
302  const std::string & term, Xapian::termcount wqf_,
303  double factor, void* postlist);
304 
315  void init_(const Internal & stats, Xapian::termcount query_len_,
316  double factor, Xapian::doccount termfreq,
317  Xapian::doccount reltermfreq, Xapian::termcount collection_freq);
318 
325  void init_(const Internal & stats, Xapian::termcount query_len_);
326 
334  return stats_needed & DOC_LENGTH;
335  }
336 
// Returns true if the WDF bit is set in stats_needed, i.e. the subclass
// called need_stat(WDF).
342  bool get_sumpart_needs_wdf_() const {
343  return stats_needed & WDF;
344  }
345 
353  return stats_needed & UNIQUE_TERMS;
354  }
355 
// Returns true only when no statistics have been requested and name()
// reports "Xapian::BoolWeight".
357  bool is_bool_weight_() const {
358  // Checking the name isn't ideal, but (get_maxpart() == 0.0) isn't
359  // required to work without init() having been called. We can at
360  // least avoid the virtual method call in most non-BoolWeight cases
361  // as most other classes will need at least some stats.
// The stats_needed test comes first so that && short-circuits before name().
362  return stats_needed == 0 && name() == "Xapian::BoolWeight";
363  }
364 
365  protected:
371  Weight(const Weight &);
372 
// The number of documents in the collection.
374  Xapian::doccount get_collection_size() const { return collection_size_; }
375 
// The number of documents marked as relevant.
377  Xapian::doccount get_rset_size() const { return rset_size_; }
378 
// The average length of a document in the collection.
380  Xapian::doclength get_average_length() const { return average_length_; }
381 
// The number of documents which this term indexes.
383  Xapian::doccount get_termfreq() const { return termfreq_; }
384 
// The number of relevant documents which this term indexes.
386  Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
387 
// The collection frequency of the term.
389  Xapian::termcount get_collection_freq() const { return collectionfreq_; }
390 
// The length of the query.
392  Xapian::termcount get_query_length() const { return query_length_; }
393 
// The within-query-frequency of this term.
395  Xapian::termcount get_wqf() const { return wqf_; }
396 
402  return doclength_upper_bound_;
403  }
404 
412  return doclength_lower_bound_;
413  }
414 
420  return wdf_upper_bound_;
421  }
422 
425  return Xapian::totallength(average_length_ * collection_size_ + 0.5);
426  }
427 };
428 
434  BoolWeight * clone() const;
435 
436  void init(double factor);
437 
438  public:
441 
442  std::string name() const;
443 
444  std::string serialise() const;
445  BoolWeight * unserialise(const std::string & serialised) const;
446 
447  double get_sumpart(Xapian::termcount wdf,
448  Xapian::termcount doclen,
449  Xapian::termcount uniqterms) const;
450  double get_maxpart() const;
451 
452  double get_sumextra(Xapian::termcount doclen,
453  Xapian::termcount uniqterms) const;
454  double get_maxextra() const;
455 };
456 
459  /* Three character string indicating the normalizations for tf(wdf), idf and
460  tfidf weight. */
461  std::string normalizations;
462 
464  double factor;
465 
466  TfIdfWeight * clone() const;
467 
468  void init(double factor);
469 
470  /* When additional normalizations are implemented in the future, the additional statistics for them
471  should be accessed by these functions. */
472  double get_wdfn(Xapian::termcount wdf, char c) const;
473  double get_idfn(Xapian::doccount termfreq, char c) const;
474  double get_wtn(double wt, char c) const;
475 
476  public:
517  explicit TfIdfWeight(const std::string &normalizations);
518 
521  : normalizations("ntn")
522  {
523  need_stat(TERMFREQ);
524  need_stat(WDF);
525  need_stat(WDF_MAX);
526  need_stat(COLLECTION_SIZE);
527  }
528 
529  std::string name() const;
530 
531  std::string serialise() const;
532  TfIdfWeight * unserialise(const std::string & serialised) const;
533 
534  double get_sumpart(Xapian::termcount wdf,
535  Xapian::termcount doclen,
536  Xapian::termcount uniqterm) const;
537  double get_maxpart() const;
538 
539  double get_sumextra(Xapian::termcount doclen,
540  Xapian::termcount uniqterms) const;
541  double get_maxextra() const;
542 };
543 
544 
549 
551  mutable double termweight;
552 
554  double param_k1, param_k2, param_k3, param_b;
555 
558 
559  BM25Weight * clone() const;
560 
561  void init(double factor);
562 
563  public:
// Construct a BM25Weight.
//
// k1, k2, k3: stored in param_k1/param_k2/param_k3; negative values are
//             replaced by 0.
// b:          stored in param_b; clamped to the range [0, 1].
// min_normlen: the minimum normalised document length value; stored as given.
//
// The constructor also registers the statistics the scheme will read; the
// length-related ones are only requested for parameter combinations tested
// below.
591  BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
592  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
593  param_min_normlen(min_normlen)
594  {
// Clamp out-of-range parameters rather than rejecting them.
595  if (param_k1 < 0) param_k1 = 0;
596  if (param_k2 < 0) param_k2 = 0;
597  if (param_k3 < 0) param_k3 = 0;
598  if (param_b < 0) {
599  param_b = 0;
600  } else if (param_b > 1) {
601  param_b = 1;
602  }
// Statistics requested unconditionally.
603  need_stat(COLLECTION_SIZE);
604  need_stat(RSET_SIZE);
605  need_stat(TERMFREQ);
606  need_stat(RELTERMFREQ);
607  need_stat(WDF);
608  need_stat(WDF_MAX);
// Minimum and average document length: only when k2 is non-zero, or when
// both k1 and b are non-zero.
609  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
610  need_stat(DOC_LENGTH_MIN);
611  need_stat(AVERAGE_LENGTH);
612  }
// Per-document length only when both k1 and b are non-zero; query length
// only when k2 is non-zero; wqf only when k3 is non-zero.
613  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
614  if (param_k2 != 0) need_stat(QUERY_LENGTH);
615  if (param_k3 != 0) need_stat(WQF);
616  }
617 
619  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
620  param_min_normlen(0.5)
621  {
622  need_stat(COLLECTION_SIZE);
623  need_stat(RSET_SIZE);
624  need_stat(TERMFREQ);
625  need_stat(RELTERMFREQ);
626  need_stat(WDF);
627  need_stat(WDF_MAX);
628  need_stat(DOC_LENGTH_MIN);
629  need_stat(AVERAGE_LENGTH);
630  need_stat(DOC_LENGTH);
631  need_stat(WQF);
632  }
633 
634  std::string name() const;
635 
636  std::string serialise() const;
637  BM25Weight * unserialise(const std::string & serialised) const;
638 
639  double get_sumpart(Xapian::termcount wdf,
640  Xapian::termcount doclen,
641  Xapian::termcount uniqterm) const;
642  double get_maxpart() const;
643 
644  double get_sumextra(Xapian::termcount doclen,
645  Xapian::termcount uniqterms) const;
646  double get_maxextra() const;
647 };
648 
653 
655  mutable double termweight;
656 
658  double param_k1, param_k2, param_k3, param_b;
659 
662 
664  double param_delta;
665 
666  BM25PlusWeight * clone() const;
667 
668  void init(double factor);
669 
670  public:
// Construct a BM25PlusWeight.
//
// k1, k2, k3: stored in param_k1/param_k2/param_k3; negative values are
//             replaced by 0.
// b:          stored in param_b; clamped to the range [0, 1].
// min_normlen: the minimum normalised document length value; stored as given.
// delta:      additional parameter delta in the BM25+ formula; negative
//             values are replaced by 0.
//
// The statistics requested are the same set, under the same conditions, as in
// the five-argument BM25Weight constructor.
705  BM25PlusWeight(double k1, double k2, double k3, double b,
706  double min_normlen, double delta)
707  : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
708  param_min_normlen(min_normlen), param_delta(delta)
709  {
// Clamp out-of-range parameters rather than rejecting them.
710  if (param_k1 < 0) param_k1 = 0;
711  if (param_k2 < 0) param_k2 = 0;
712  if (param_k3 < 0) param_k3 = 0;
713  if (param_delta < 0) param_delta = 0;
714  if (param_b < 0) {
715  param_b = 0;
716  } else if (param_b > 1) {
717  param_b = 1;
718  }
// Statistics requested unconditionally.
719  need_stat(COLLECTION_SIZE);
720  need_stat(RSET_SIZE);
721  need_stat(TERMFREQ);
722  need_stat(RELTERMFREQ);
723  need_stat(WDF);
724  need_stat(WDF_MAX);
// Minimum and average document length: only when k2 is non-zero, or when
// both k1 and b are non-zero.
725  if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
726  need_stat(DOC_LENGTH_MIN);
727  need_stat(AVERAGE_LENGTH);
728  }
// Per-document length only when both k1 and b are non-zero; query length
// only when k2 is non-zero; wqf only when k3 is non-zero.
729  if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
730  if (param_k2 != 0) need_stat(QUERY_LENGTH);
731  if (param_k3 != 0) need_stat(WQF);
732  }
733 
735  : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
736  param_min_normlen(0.5), param_delta(1)
737  {
738  need_stat(COLLECTION_SIZE);
739  need_stat(RSET_SIZE);
740  need_stat(TERMFREQ);
741  need_stat(RELTERMFREQ);
742  need_stat(WDF);
743  need_stat(WDF_MAX);
744  need_stat(DOC_LENGTH_MIN);
745  need_stat(AVERAGE_LENGTH);
746  need_stat(DOC_LENGTH);
747  need_stat(WQF);
748  }
749 
750  std::string name() const;
751 
752  std::string serialise() const;
753  BM25PlusWeight * unserialise(const std::string & serialised) const;
754 
755  double get_sumpart(Xapian::termcount wdf,
756  Xapian::termcount doclen,
757  Xapian::termcount uniqterm) const;
758  double get_maxpart() const;
759 
760  double get_sumextra(Xapian::termcount doclen,
761  Xapian::termcount uniqterms) const;
762  double get_maxextra() const;
763 };
764 
777 
779  mutable double termweight;
780 
782  double param_k;
783 
784  TradWeight * clone() const;
785 
786  void init(double factor);
787 
788  public:
// Construct a TradWeight.
//
// k: the parameter in the formula (default 1.0); a negative value is
//    replaced by 0.  Declared explicit so a double does not convert
//    implicitly to a TradWeight.
796  explicit TradWeight(double k = 1.0) : param_k(k) {
797  if (param_k < 0) param_k = 0;
// Average and per-document length are only requested when k is non-zero.
798  if (param_k != 0.0) {
799  need_stat(AVERAGE_LENGTH);
800  need_stat(DOC_LENGTH);
801  }
// Statistics requested regardless of k.
802  need_stat(COLLECTION_SIZE);
803  need_stat(RSET_SIZE);
804  need_stat(TERMFREQ);
805  need_stat(RELTERMFREQ);
806  need_stat(DOC_LENGTH_MIN);
807  need_stat(WDF);
808  need_stat(WDF_MAX);
809  }
810 
811  std::string name() const;
812 
813  std::string serialise() const;
814  TradWeight * unserialise(const std::string & serialised) const;
815 
816  double get_sumpart(Xapian::termcount wdf,
817  Xapian::termcount doclen,
818  Xapian::termcount uniqueterms) const;
819  double get_maxpart() const;
820 
821  double get_sumextra(Xapian::termcount doclen,
822  Xapian::termcount uniqterms) const;
823  double get_maxextra() const;
824 };
825 
846  double param_c;
847 
849  double upper_bound;
850 
854 
855  InL2Weight * clone() const;
856 
857  void init(double factor);
858 
859  public:
868  explicit InL2Weight(double c);
869 
871  : param_c(1.0)
872  {
873  need_stat(AVERAGE_LENGTH);
874  need_stat(DOC_LENGTH);
875  need_stat(DOC_LENGTH_MIN);
876  need_stat(DOC_LENGTH_MAX);
877  need_stat(COLLECTION_SIZE);
878  need_stat(WDF);
879  need_stat(WDF_MAX);
880  need_stat(WQF);
881  need_stat(TERMFREQ);
882  }
883 
884  std::string name() const;
885 
886  std::string serialise() const;
887  InL2Weight * unserialise(const std::string & serialised) const;
888 
889  double get_sumpart(Xapian::termcount wdf,
890  Xapian::termcount doclen,
891  Xapian::termcount uniqterms) const;
892  double get_maxpart() const;
893 
894  double get_sumextra(Xapian::termcount doclen,
895  Xapian::termcount uniqterms) const;
896  double get_maxextra() const;
897 };
898 
917  double param_c;
918 
920  double upper_bound;
921 
925  double B_constant;
926 
927  IfB2Weight * clone() const;
928 
929  void init(double factor);
930 
931  public:
942  explicit IfB2Weight(double c);
943 
// Default constructor: sets the wdf normalization parameter param_c to 1.0
// and registers the statistics listed below.  Other data members visible in
// this listing (upper_bound, B_constant) are not initialised here.
944  IfB2Weight() : param_c(1.0) {
945  need_stat(AVERAGE_LENGTH);
946  need_stat(DOC_LENGTH);
947  need_stat(DOC_LENGTH_MIN);
948  need_stat(DOC_LENGTH_MAX);
949  need_stat(COLLECTION_SIZE);
950  need_stat(COLLECTION_FREQ);
951  need_stat(WDF);
952  need_stat(WDF_MAX);
953  need_stat(WQF);
954  need_stat(TERMFREQ);
955  }
956 
957  std::string name() const;
958 
959  std::string serialise() const;
960  IfB2Weight * unserialise(const std::string & serialised) const;
961 
962  double get_sumpart(Xapian::termcount wdf,
963  Xapian::termcount doclen,
964  Xapian::termcount uniqterm) const;
965  double get_maxpart() const;
966 
967  double get_sumextra(Xapian::termcount doclen,
968  Xapian::termcount uniqterms) const;
969  double get_maxextra() const;
970 };
971 
990  double param_c;
991 
993  double upper_bound;
994 
998  double B_constant;
999 
1000  IneB2Weight * clone() const;
1001 
1002  void init(double factor);
1003 
1004  public:
1013  explicit IneB2Weight(double c);
1014 
// Default constructor: sets the wdf normalization parameter param_c to 1.0
// and registers the statistics listed below.  Other data members visible in
// this listing (upper_bound, B_constant) are not initialised here.
1015  IneB2Weight() : param_c(1.0) {
1016  need_stat(AVERAGE_LENGTH);
1017  need_stat(DOC_LENGTH);
1018  need_stat(DOC_LENGTH_MIN);
1019  need_stat(DOC_LENGTH_MAX);
1020  need_stat(COLLECTION_SIZE);
1021  need_stat(WDF);
1022  need_stat(WDF_MAX);
1023  need_stat(WQF);
1024  need_stat(COLLECTION_FREQ);
1025  need_stat(TERMFREQ);
1026  }
1027 
1028  std::string name() const;
1029 
1030  std::string serialise() const;
1031  IneB2Weight * unserialise(const std::string & serialised) const;
1032 
1033  double get_sumpart(Xapian::termcount wdf,
1034  Xapian::termcount doclen,
1035  Xapian::termcount uniqterms) const;
1036  double get_maxpart() const;
1037 
1038  double get_sumextra(Xapian::termcount doclen,
1039  Xapian::termcount uniqterms) const;
1040  double get_maxextra() const;
1041 };
1042 
1062  double param_c;
1063 
1065  double upper_bound;
1066 
1069  double B_constant;
1070  double wt;
1073 
1074  BB2Weight * clone() const;
1075 
1076  void init(double factor);
1077 
1078  public:
1089  explicit BB2Weight(double c);
1090 
// Default constructor: sets the wdf normalization parameter param_c to 1.0
// and registers the statistics listed below.  Other data members visible in
// this listing (upper_bound, B_constant, wt) are not initialised here.
1091  BB2Weight() : param_c(1.0) {
1092  need_stat(AVERAGE_LENGTH);
1093  need_stat(DOC_LENGTH);
1094  need_stat(DOC_LENGTH_MIN);
1095  need_stat(DOC_LENGTH_MAX);
1096  need_stat(COLLECTION_SIZE);
1097  need_stat(COLLECTION_FREQ);
1098  need_stat(WDF);
1099  need_stat(WDF_MAX);
1100  need_stat(WQF);
1101  need_stat(TERMFREQ);
1102  }
1103 
1104  std::string name() const;
1105 
1106  std::string serialise() const;
1107  BB2Weight * unserialise(const std::string & serialised) const;
1108 
1109  double get_sumpart(Xapian::termcount wdf,
1110  Xapian::termcount doclen,
1111  Xapian::termcount uniqterms) const;
1112  double get_maxpart() const;
1113 
1114  double get_sumextra(Xapian::termcount doclen,
1115  Xapian::termcount uniqterms) const;
1116  double get_maxextra() const;
1117 };
1118 
1138  double lower_bound;
1139 
1141  double upper_bound;
1142 
1146 
1147  DLHWeight * clone() const;
1148 
1149  void init(double factor);
1150 
1151  public:
1153  need_stat(DOC_LENGTH);
1154  need_stat(COLLECTION_FREQ);
1155  need_stat(WDF);
1156  need_stat(WQF);
1157  need_stat(WDF_MAX);
1158  need_stat(DOC_LENGTH_MIN);
1159  need_stat(DOC_LENGTH_MAX);
1160  need_stat(TOTAL_LENGTH);
1161  }
1162 
1163  std::string name() const;
1164 
1165  std::string serialise() const;
1166  DLHWeight * unserialise(const std::string & serialised) const;
1167 
1168  double get_sumpart(Xapian::termcount wdf,
1169  Xapian::termcount doclen,
1170  Xapian::termcount uniqterms) const;
1171  double get_maxpart() const;
1172 
1173  double get_sumextra(Xapian::termcount doclen,
1174  Xapian::termcount uniqterms) const;
1175  double get_maxextra() const;
1176 };
1177 
1198  double param_c;
1199 
1207  double lower_bound;
1208 
1210  double upper_bound;
1211 
1213  double P1, P2;
1214 
1216  double cl;
1217 
1218  PL2Weight * clone() const;
1219 
1220  void init(double factor);
1221 
1222  public:
1233  explicit PL2Weight(double c);
1234 
// Default constructor: sets the wdf normalization parameter param_c to 1.0
// and registers the statistics listed below.  Unlike the IfB2/IneB2/BB2
// default constructors in this file, TERMFREQ is not requested.
1235  PL2Weight() : param_c(1.0) {
1236  need_stat(AVERAGE_LENGTH);
1237  need_stat(DOC_LENGTH);
1238  need_stat(DOC_LENGTH_MIN);
1239  need_stat(DOC_LENGTH_MAX);
1240  need_stat(COLLECTION_SIZE);
1241  need_stat(COLLECTION_FREQ);
1242  need_stat(WDF);
1243  need_stat(WDF_MAX);
1244  need_stat(WQF);
1245  }
1246 
1247  std::string name() const;
1248 
1249  std::string serialise() const;
1250  PL2Weight * unserialise(const std::string & serialised) const;
1251 
1252  double get_sumpart(Xapian::termcount wdf,
1253  Xapian::termcount doclen,
1254  Xapian::termcount uniqterms) const;
1255  double get_maxpart() const;
1256 
1257  double get_sumextra(Xapian::termcount doclen,
1258  Xapian::termcount uniqterms) const;
1259  double get_maxextra() const;
1260 };
1261 
1265  double factor;
1266 
1268  double param_c;
1269 
1271  double param_delta;
1272 
1274  double upper_bound;
1275 
1277  double P1, P2;
1278 
1280  double cl;
1281 
1283  double mean;
1284 
1286  double dw;
1287 
1288  PL2PlusWeight * clone() const;
1289 
1290  void init(double factor_);
1291 
1292  public:
1310  PL2PlusWeight(double c, double delta);
1311 
1313  : param_c(1.0), param_delta(0.8) {
1314  need_stat(AVERAGE_LENGTH);
1315  need_stat(DOC_LENGTH);
1316  need_stat(DOC_LENGTH_MIN);
1317  need_stat(DOC_LENGTH_MAX);
1318  need_stat(COLLECTION_SIZE);
1319  need_stat(COLLECTION_FREQ);
1320  need_stat(WDF);
1321  need_stat(WDF_MAX);
1322  need_stat(WQF);
1323  }
1324 
1325  std::string name() const;
1326 
1327  std::string serialise() const;
1328  PL2PlusWeight * unserialise(const std::string & serialised) const;
1329 
1330  double get_sumpart(Xapian::termcount wdf,
1331  Xapian::termcount doclen,
1332  Xapian::termcount uniqterms) const;
1333  double get_maxpart() const;
1334 
1335  double get_sumextra(Xapian::termcount doclen,
1336  Xapian::termcount uniqterms) const;
1337  double get_maxextra() const;
1338 };
1339 
1361  double upper_bound;
1362 
1364  double lower_bound;
1365 
1369 
1370  DPHWeight * clone() const;
1371 
1372  void init(double factor);
1373 
1374  public:
1377  need_stat(DOC_LENGTH);
1378  need_stat(COLLECTION_FREQ);
1379  need_stat(WDF);
1380  need_stat(WQF);
1381  need_stat(WDF_MAX);
1382  need_stat(DOC_LENGTH_MIN);
1383  need_stat(DOC_LENGTH_MAX);
1384  need_stat(TOTAL_LENGTH);
1385  }
1386 
1387  std::string name() const;
1388 
1389  std::string serialise() const;
1390  DPHWeight * unserialise(const std::string & serialised) const;
1391 
1392  double get_sumpart(Xapian::termcount wdf,
1393  Xapian::termcount doclen,
1394  Xapian::termcount uniqterms) const;
1395  double get_maxpart() const;
1396 
1397  double get_sumextra(Xapian::termcount doclen,
1398  Xapian::termcount uniqterms) const;
1399  double get_maxextra() const;
1400 };
1401 
1402 
1415  type_smoothing select_smoothing;
1416 
1417  // Parameters for handling negative value of log, and for smoothing.
1418  double param_log, param_smoothing1, param_smoothing2;
1419 
1430 
1431  LMWeight * clone() const;
1432 
1433  void init(double factor);
1434 
1435  public:
// Construct a LMWeight.
//
// param_log_:        stored unchanged in param_log (default 0.0).
// select_smoothing_: the type of smoothing to use (default TWO_STAGE_SMOOTHING).
// param_smoothing1_: a negative value (including the default -1.0) selects 0.7.
// param_smoothing2_: a negative value (including the default -1.0) selects
//                    2000.0 for TWO_STAGE_SMOOTHING and 0.05 for every other
//                    smoothing type.
1467  // Unigram LM constructor taking explicit parameters for negative-log handling and smoothing.
1468  explicit LMWeight(double param_log_ = 0.0,
1469  type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
1470  double param_smoothing1_ = -1.0,
1471  double param_smoothing2_ = -1.0)
1472  : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
1473  param_smoothing2(param_smoothing2_)
1474  {
// Negative smoothing parameters act as "use the default" sentinels.
1475  if (param_smoothing1 < 0) param_smoothing1 = 0.7;
1476  if (param_smoothing2 < 0) {
1477  if (select_smoothing == TWO_STAGE_SMOOTHING)
1478  param_smoothing2 = 2000.0;
1479  else
1480  param_smoothing2 = 0.05;
1481  }
// Statistics requested for every smoothing type.
1482  need_stat(DOC_LENGTH);
1483  need_stat(RSET_SIZE);
1484  need_stat(TERMFREQ);
1485  need_stat(RELTERMFREQ);
1486  need_stat(DOC_LENGTH_MAX);
1487  need_stat(WDF);
1488  need_stat(WDF_MAX);
1489  need_stat(COLLECTION_FREQ);
1490  need_stat(TOTAL_LENGTH);
// Extra statistics requested only for specific smoothing types.
1491  if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
1492  need_stat(UNIQUE_TERMS);
1493  if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
1494  need_stat(DOC_LENGTH_MIN);
1495  }
1496 
1497  std::string name() const;
1498 
1499  std::string serialise() const;
1500  LMWeight * unserialise(const std::string & serialised) const;
1501 
1502  double get_sumpart(Xapian::termcount wdf,
1503  Xapian::termcount doclen,
1504  Xapian::termcount uniqterm) const;
1505  double get_maxpart() const;
1506 
1507  double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
1508  double get_maxextra() const;
1509 };
1510 
1518  double factor;
1519 
1520  public:
1521  CoordWeight * clone() const;
1522 
1523  void init(double factor_);
1524 
1527 
1528  std::string name() const;
1529 
1530  std::string serialise() const;
1531  CoordWeight * unserialise(const std::string & serialised) const;
1532 
1533  double get_sumpart(Xapian::termcount wdf,
1534  Xapian::termcount doclen,
1535  Xapian::termcount uniqterm) const;
1536  double get_maxpart() const;
1537 
1538  double get_sumextra(Xapian::termcount, Xapian::termcount) const;
1539  double get_maxextra() const;
1540 };
1541 
1542 }
1543 
1544 #endif // XAPIAN_INCLUDED_WEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
type_smoothing select_smoothing
The type of smoothing to use.
Definition: weight.h:1415
double factor
The factor to multiply weights by.
Definition: weight.h:1265
Xapian::doccount termfreq_
The number of documents which this term indexes.
Definition: weight.h:126
double param_delta
Additional parameter delta in the PL2+ weighting formula.
Definition: weight.h:1271
Xapian::doccount get_collection_size() const
The number of documents in the collection.
Definition: weight.h:374
std::string normalizations
Definition: weight.h:461
typedefs for Xapian
Xapian::termcount doclength_lower_bound_
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:141
double factor
The factor to multiply with the weight.
Definition: weight.h:464
double upper_bound
The upper bound on the weight.
Definition: weight.h:1274
Xapian::termcount get_collection_freq() const
The collection frequency of the term.
Definition: weight.h:389
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1198
double B_constant
Definition: weight.h:1069
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1268
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:917
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
Xapian::termcount doclength_upper_bound_
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:144
double stirling_constant_1
Definition: weight.h:1071
double upper_bound
The upper bound on the weight.
Definition: weight.h:1210
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:557
bool get_sumpart_needs_wdf_() const
Definition: weight.h:342
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:1062
This class implements the InL2 weighting scheme.
Definition: weight.h:844
double param_k
The parameter in the formula.
Definition: weight.h:782
double dw
Weight contribution of delta term in the PL2+ function.
Definition: weight.h:1286
bool get_sumpart_needs_uniqueterms_() const
Definition: weight.h:352
bool is_bool_weight_() const
Definition: weight.h:357
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1263
double wqf_product_idf
The constant values which are used on every call to get_sumpart().
Definition: weight.h:852
stat_flags stats_needed
A bitmask of the statistics this weighting scheme needs.
Definition: weight.h:114
Xapian::totallength get_total_length() const
Total length of all documents in the collection.
Definition: weight.h:424
bool get_sumpart_needs_doclength_() const
Definition: weight.h:333
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
Definition: weight.h:1138
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28
double upper_bound
The upper bound on the weight.
Definition: weight.h:1361
double lower_bound
The factor to multiply weights by.
Definition: weight.h:1207
DPHWeight()
Construct a DPHWeight.
Definition: weight.h:1376
Xapian::doccount collection_size_
The number of documents in the collection.
Definition: weight.h:117
Xapian::termcount wdf_upper_bound_
An upper bound on the wdf of this term in the shard.
Definition: weight.h:147
CoordWeight()
Construct a CoordWeight.
Definition: weight.h:1526
TfIdfWeight()
Construct a TfIdfWeight using the default normalizations ("ntn").
Definition: weight.h:520
double param_k3
Definition: weight.h:554
This class implements the BB2 weighting scheme.
Definition: weight.h:1060
type_smoothing
Type of smoothing to use with the Language Model Weighting scheme.
Definition: weight.h:158
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::termcount wqf_
The within-query-frequency of this term.
Definition: weight.h:138
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1516
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:990
double c_product_avlen
Definition: weight.h:924
Xapian::termcount get_doclength_lower_bound() const
A lower bound on the minimum length of any document in the shard.
Definition: weight.h:411
double wqf_product_idf
The constant values which are used for calculations in get_sumpart().
Definition: weight.h:923
double c_product_avlen
Definition: weight.h:853
Class implementing a "boolean" weighting scheme.
Definition: weight.h:433
Xapian::doclength param_min_normlen
The minimum normalised document length value.
Definition: weight.h:661
Define XAPIAN_VISIBILITY_* macros.
double doclength
A normalised document length.
Definition: types.h:59
double stirling_constant_2
Definition: weight.h:1072
Xapian::termcount collectionfreq_
Definition: weight.h:129
Class to hold statistics for a given collection.
Xapian::doclength average_length_
The average length of a document in the collection.
Definition: weight.h:123
double termweight
Factor combining all the document independent factors.
Definition: weight.h:655
double mean
Set by init() to get_collection_freq() / get_collection_size()
Definition: weight.h:1283
double termweight
Factor combining all the document independent factors.
Definition: weight.h:779
stat_flags
Stats which the weighting scheme can use (see need_stat()).
Definition: weight.h:38
Xapian::termcount get_query_length() const
The length of the query.
Definition: weight.h:392
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:776
BoolWeight()
Construct a BoolWeight.
Definition: weight.h:440
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:774
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1136
This class implements the PL2 weighting scheme.
Definition: weight.h:1196
Xapian::doccount rset_size_
The number of documents marked as relevant.
Definition: weight.h:120
LMWeight(double param_log_=0.0, type_smoothing select_smoothing_=TWO_STAGE_SMOOTHING, double param_smoothing1_=-1.0, double param_smoothing2_=-1.0)
Construct a LMWeight.
Definition: weight.h:1468
This class implements the IneB2 weighting scheme.
Definition: weight.h:988
Xapian::termcount get_wqf() const
The within-query-frequency of this term.
Definition: weight.h:395
double upper_bound
The upper bound on the weight a term can give to a document.
Definition: weight.h:849
Xapian::doccount get_rset_size() const
The number of documents marked as relevant.
Definition: weight.h:377
Xapian::termcount query_length_
The length of the query.
Definition: weight.h:135
double upper_bound
The upper bound on the weight.
Definition: weight.h:1065
double log_constant
The constant value to be used in get_sumpart().
Definition: weight.h:1144
double wqf_product_factor
Definition: weight.h:1368
Xapian::termcount get_doclength_upper_bound() const
An upper bound on the maximum length of any document in the shard.
Definition: weight.h:401
double B_constant
Definition: weight.h:925
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
Construct a BM25Weight.
Definition: weight.h:591
double lower_bound
Now unused but left in place in 1.4.x for ABI compatibility.
Definition: weight.h:1364
TradWeight(double k=1.0)
Construct a TradWeight.
Definition: weight.h:796
This class implements the IfB2 weighting scheme.
Definition: weight.h:915
Xapian::doccount get_termfreq() const
The number of documents which this term indexes.
Definition: weight.h:383
double wqf_product_factor
Definition: weight.h:1145
Xapian::doclength get_average_length() const
The average length of a document in the collection.
Definition: weight.h:380
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
double termweight
Factor combining all the document independent factors.
Definition: weight.h:551
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1216
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:652
double param_c
The wdf normalization parameter in the formula.
Definition: weight.h:846
double cl
Set by init() to (param_c * get_average_length())
Definition: weight.h:1280
This class implements the DPH weighting scheme.
Definition: weight.h:1359
Xapian::doccount get_reltermfreq() const
The number of relevant documents which this term indexes.
Definition: weight.h:386
Xapian::doclength len_factor
Factor to multiply the document length by.
Definition: weight.h:548
void need_stat(stat_flags flag)
Tell Xapian that your subclass will want a particular statistic.
Definition: weight.h:94
double weight_collection
The factor to multiply weights by.
Definition: weight.h:1429
double upper_bound
The upper bound on the weight.
Definition: weight.h:1141
double log_constant
The constant value used in get_sumpart().
Definition: weight.h:1367
double c_product_avlen
The constant values to be used in get_sumpart().
Definition: weight.h:1068
double param_delta
Additional parameter delta in the BM25+ formula.
Definition: weight.h:664
BM25PlusWeight(double k1, double k2, double k3, double b, double min_normlen, double delta)
Construct a BM25PlusWeight.
Definition: weight.h:705
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1413
Weight()
Default constructor, needed by subclass constructors.
Definition: weight.h:152
double c_product_avlen
Definition: weight.h:997
double upper_bound
The upper bound on the weight.
Definition: weight.h:920
Xapian::termcount get_wdf_upper_bound() const
An upper bound on the wdf of this term in the shard.
Definition: weight.h:419
double wqf_product_idf
Constant values used in get_sumpart().
Definition: weight.h:996
double upper_bound
The upper bound of the weight.
Definition: weight.h:993
double factor
The factor to multiply weights by.
Definition: weight.h:1518
double B_constant
Definition: weight.h:998
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:650
double param_smoothing2
Definition: weight.h:1418
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:546
Xapian::doccount reltermfreq_
The number of relevant documents which this term indexes.
Definition: weight.h:132
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:458
Abstract base class for weighting schemes.
Definition: weight.h:35