xapian-core  1.4.20
tradweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "xapian/weight.h"
24 
25 #include "debuglog.h"
26 #include "omassert.h"
27 #include "serialise-double.h"
28 
29 #include "xapian/error.h"
30 
31 #include <algorithm>
32 #include <cmath>
33 
34 using namespace std;
35 
36 namespace Xapian {
37 
38 TradWeight *
39 TradWeight::clone() const
40 {
41  return new TradWeight(param_k);
42 }
43 
44 void
45 TradWeight::init(double factor)
46 {
47  if (factor == 0.0) {
48  // This object is for the term-independent contribution, and that's
49  // always zero for this scheme.
50  return;
51  }
52 
53  Xapian::doccount tf = get_termfreq();
54 
55  double tw = 0;
56  if (get_rset_size() != 0) {
57  Xapian::doccount reltermfreq = get_reltermfreq();
58 
59  // There can't be more relevant documents indexed by a term than there
60  // are documents indexed by that term.
61  AssertRel(reltermfreq,<=,tf);
62 
63  // There can't be more relevant documents indexed by a term than there
64  // are relevant documents.
65  AssertRel(reltermfreq,<=,get_rset_size());
66 
67  Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
68 
69  // There can't be more relevant documents not indexed by a term than
70  // there are documents not indexed by that term.
71  AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
72 
73  Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
74 
75  Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
76  double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
77  double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
78  tw = numerator / denom;
79  } else {
80  tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
81  }
82 
83  AssertRel(tw,>,0);
84 
85  // The "official" formula can give a negative termweight in unusual cases
86  // (without an RSet, when a term indexes more than half the documents in
87  // the database). These negative weights aren't actually helpful, and it
88  // is common for implementations to replace them with a small positive
89  // weight or similar.
90  //
91  // Truncating to zero doesn't seem a great approach in practice as it
92  // means that some terms in the query can have no effect at all on the
93  // ranking, and that some results can have zero weight, both of which
94  // are seem surprising.
95  //
96  // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
97  // more than a third of documents, which seems rather "intrusive". That's
98  // what the code currently enabled does, but perhaps it would be better to
99  // do something else. (FIXME)
100 #if 0
101  if (rare(tw <= 1.0)) {
102  termweight = 0;
103  } else {
104  termweight = log(tw) * factor;
105  }
106 #else
107  if (tw < 2) tw = tw * 0.5 + 1;
108  termweight = log(tw) * factor;
109 #endif
110 
111  LOGVALUE(WTCALC, termweight);
112 
113  if (param_k == 0) {
114  // If param_k is 0 then the document length doesn't affect the weight.
115  len_factor = 0;
116  } else {
117  len_factor = get_average_length();
118  // len_factor can be zero if all documents are empty (or the database is
119  // empty!)
120  if (len_factor != 0) len_factor = param_k / len_factor;
121  }
122 
123  LOGVALUE(WTCALC, len_factor);
124 }
125 
126 string
128 {
129  return "Xapian::TradWeight";
130 }
131 
132 string
133 TradWeight::serialise() const
134 {
135  return serialise_double(param_k);
136 }
137 
138 TradWeight *
139 TradWeight::unserialise(const string & s) const
140 {
141  const char *ptr = s.data();
142  const char *end = ptr + s.size();
143  double k = unserialise_double(&ptr, end);
144  if (rare(ptr != end))
145  throw Xapian::SerialisationError("Extra data in TradWeight::unserialise()");
146  return new TradWeight(k);
147 }
148 
149 double
150 TradWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
151  Xapian::termcount) const
152 {
153  double wdf_double = wdf;
154  return termweight * (wdf_double / (len * len_factor + wdf_double));
155 }
156 
157 double
158 TradWeight::get_maxpart() const
159 {
160  // FIXME: need to force non-zero wdf_max to stop percentages breaking...
161  double wdf_max = max(get_wdf_upper_bound(), Xapian::termcount(1));
162  Xapian::termcount doclen_lb = get_doclength_lower_bound();
163  return termweight * (wdf_max / (doclen_lb * len_factor + wdf_max));
164 }
165 
166 double
167 TradWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
168 {
169  return 0;
170 }
171 
172 double
173 TradWeight::get_maxextra() const
174 {
175  return 0;
176 }
177 
178 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define AssertRel(A, REL, B)
Definition: omassert.h:123
STL namespace.
#define rare(COND)
Definition: config.h:562
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
functions to serialise and unserialise a double
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
#define LOGVALUE(a, b)
Definition: debuglog.h:484
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:768
Weighting scheme API.
std::string serialise_double(double v)
Serialise a double to a string.
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Various assertion macros.
Debug logging macros.