00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "xapian/weight.h"
00024
00025 #include "debuglog.h"
00026 #include "omassert.h"
00027 #include "serialise-double.h"
00028
00029 #include "xapian/error.h"
00030
00031 #include <algorithm>
00032 #include <cmath>
00033
00034 using namespace std;
00035
00036 namespace Xapian {
00037
00038 BM25Weight *
00039 BM25Weight::clone() const
00040 {
00041 return new BM25Weight(param_k1, param_k2, param_k3, param_b,
00042 param_min_normlen);
00043 }
00044
00045 void
00046 BM25Weight::init(double factor)
00047 {
00048 Xapian::doccount tf = get_termfreq();
00049
00050 Xapian::weight tw = 0;
00051 if (get_rset_size() != 0) {
00052 Xapian::doccount reltermfreq = get_reltermfreq();
00053
00054
00055
00056 AssertRel(reltermfreq,<=,tf);
00057
00058
00059
00060 AssertRel(reltermfreq,<=,get_rset_size());
00061
00062 Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
00063
00064
00065
00066 AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
00067
00068 Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
00069
00070 Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
00071 double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
00072 double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
00073 tw = numerator / denom;
00074 } else {
00075 tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
00076 }
00077
00078 AssertRel(tw,>,0);
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095 #if 0
00096 if (rare(tw <= 1.0)) {
00097 termweight = 0;
00098 } else {
00099 termweight = log(tw) * factor;
00100 if (param_k3 != 0) {
00101 double wqf_double = get_wqf();
00102 termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
00103 }
00104 }
00105 #else
00106 if (tw < 2) tw = tw * 0.5 + 1;
00107 termweight = log(tw) * factor;
00108 if (param_k3 != 0) {
00109 double wqf_double = get_wqf();
00110 termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
00111 }
00112 #endif
00113
00114 LOGVALUE(WTCALC, termweight);
00115
00116 if (param_b == 0 || param_k1 == 0) {
00117
00118
00119 len_factor = 0;
00120 } else {
00121 len_factor = get_average_length();
00122
00123
00124 if (len_factor != 0) len_factor = 1 / len_factor;
00125 }
00126
00127 LOGVALUE(WTCALC, len_factor);
00128 }
00129
00130 string
00131 BM25Weight::name() const
00132 {
00133 return "Xapian::BM25Weight";
00134 }
00135
00136 string
00137 BM25Weight::serialise() const
00138 {
00139 string result = serialise_double(param_k1);
00140 result += serialise_double(param_k2);
00141 result += serialise_double(param_k3);
00142 result += serialise_double(param_b);
00143 result += serialise_double(param_min_normlen);
00144 return result;
00145 }
00146
00147 BM25Weight *
00148 BM25Weight::unserialise(const string & s) const
00149 {
00150 const char *ptr = s.data();
00151 const char *end = ptr + s.size();
00152 double k1 = unserialise_double(&ptr, end);
00153 double k2 = unserialise_double(&ptr, end);
00154 double k3 = unserialise_double(&ptr, end);
00155 double b = unserialise_double(&ptr, end);
00156 double min_normlen = unserialise_double(&ptr, end);
00157 if (rare(ptr != end))
00158 throw Xapian::NetworkError("Extra data in BM25Weight::unserialise()");
00159 return new BM25Weight(k1, k2, k3, b, min_normlen);
00160 }
00161
00162 Xapian::weight
00163 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len) const
00164 {
00165 LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_sumpart", wdf | len);
00166 Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
00167
00168 double wdf_double(wdf);
00169 double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
00170 AssertRel(denom,>,0);
00171 RETURN(termweight * (param_k1 + 1) * (wdf_double / denom));
00172 }
00173
00174 Xapian::weight
00175 BM25Weight::get_maxpart() const
00176 {
00177 LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_maxpart", NO_ARGS);
00178 Xapian::doclength normlen_lb = max(get_doclength_lower_bound() * len_factor,
00179 param_min_normlen);
00180 double wdf_max(get_wdf_upper_bound());
00181 double denom = param_k1 * (normlen_lb * param_b + (1 - param_b)) + wdf_max;
00182 AssertRel(denom,>,0);
00183 RETURN(termweight * (param_k1 + 1) * (wdf_max / denom));
00184 }
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195 Xapian::weight
00196 BM25Weight::get_sumextra(Xapian::termcount len) const
00197 {
00198 LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_sumextra", len);
00199 Xapian::weight num = (2.0 * param_k2 * get_query_length());
00200 RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
00201 }
00202
00203 Xapian::weight
00204 BM25Weight::get_maxextra() const
00205 {
00206 LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_maxextra", NO_ARGS);
00207 Xapian::weight num = (2.0 * param_k2 * get_query_length());
00208 RETURN(num / (1.0 + max(double(get_doclength_lower_bound()),
00209 param_min_normlen)));
00210 }
00211
00212 }