xapian-core  1.4.21
expandweight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2011,2016 Olly Betts
5  * Copyright (C) 2013 Aarsh Shah
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
24 
25 #include <xapian/database.h>
26 
27 #include "api/termlist.h"
28 #include "internaltypes.h"
29 
30 #include <string>
31 #include <vector>
32 
33 namespace Xapian {
34 namespace Internal {
35 
37 class ExpandStats {
39  std::vector<bool> dbs_seen;
40 
43 
45  double expand_k;
46 
47  public:
50 
53 
56 
59 
61  double multiplier;
62 
65  explicit ExpandStats(Xapian::doclength avlen_)
66  : avlen(avlen_), expand_k(0), dbsize(0), termfreq(0),
67  rcollection_freq(0), rtermfreq(0), multiplier(0) {
68  }
69 
71  ExpandStats(Xapian::doclength avlen_, double expand_k_)
72  : avlen(avlen_), expand_k(expand_k_), dbsize(0), termfreq(0),
73  rcollection_freq(0), rtermfreq(0), multiplier(0) {
74  }
75 
76  void accumulate(size_t shard_index,
78  Xapian::doccount subtf, Xapian::doccount subdbsize)
79  {
80  // Boolean terms may have wdf == 0, but treat that as 1 so such terms
81  // get a non-zero weight.
82  if (wdf == 0) wdf = 1;
83  ++rtermfreq;
84  rcollection_freq += wdf;
85 
86  multiplier += (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf);
87 
88  // If we've not seen this sub-database before, then update dbsize and
89  // termfreq and note that we have seen it.
90  if (shard_index >= dbs_seen.size() || !dbs_seen[shard_index]) {
91  if (shard_index >= dbs_seen.size()) {
92  dbs_seen.resize(shard_index + 1);
93  }
94  dbs_seen[shard_index] = true;
95  dbsize += subdbsize;
96  termfreq += subtf;
97  }
98  }
99 
100  /* Clear the statistics collected in the ExpandStats object before using it
101  * for a new term. */
102  void clear_stats()
103  {
104  dbs_seen.clear();
105  dbsize = 0;
106  termfreq = 0;
107  rcollection_freq = 0;
108  rtermfreq = 0;
109  multiplier = 0;
110  }
111 };
112 
117 
120 
123 
126 
129 
132 
144 
145  public:
155  Xapian::doccount rsize_,
156  bool use_exact_termfreq_)
157  : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
158  rsize(rsize_), collection_freq(0),
159  collection_len(avlen * dbsize + .5),
160  use_exact_termfreq(use_exact_termfreq_), stats(avlen) {}
161 
172  Xapian::doccount rsize_,
173  bool use_exact_termfreq_,
174  double expand_k_)
175  : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
176  rsize(rsize_), collection_freq(0),
177  collection_len(avlen * dbsize + .5),
178  use_exact_termfreq(use_exact_termfreq_), stats(avlen, expand_k_) {}
179 
184  void collect_stats(TermList * merger, const std::string & term);
185 
187  virtual double get_weight() const = 0;
188 
189  protected:
192 
194  double get_avlen() const { return avlen; }
195 
197  Xapian::doccount get_rsize() const { return rsize; }
198 
200  Xapian::termcount get_collection_freq() const { return collection_freq; }
201 
203  Xapian::totallength get_collection_len() const { return collection_len; }
204 
206  Xapian::doccount get_dbsize() const { return dbsize; }
207 };
208 
213 class TradEWeight : public ExpandWeight {
214  public:
227  Xapian::doccount rsize_,
228  bool use_exact_termfreq_,
229  double expand_k_)
230  : ExpandWeight(db_, rsize_, use_exact_termfreq_, expand_k_) { }
231 
232  double get_weight() const;
233 };
234 
246 class Bo1EWeight : public ExpandWeight {
247  public:
259  Xapian::doccount rsize_,
260  bool use_exact_termfreq_)
261  : ExpandWeight(db_, rsize_, use_exact_termfreq_) {}
262 
263  double get_weight() const;
264 };
265 
266 }
267 }
268 
269 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
This class is used to access a database, or a group of databases.
Definition: database.h:68
Xapian::doccount rtermfreq
The number of documents from the RSet indexed by the current term (r).
Definition: expandweight.h:58
Bo1EWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_)
Constructor.
Definition: expandweight.h:258
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
Xapian::doclength avlen
Average document length in the whole database.
Definition: expandweight.h:122
Abstract base class for termlists.
Definition: termlist.h:39
double expand_k
The parameter k to be used for TradWeight query expansion.
Definition: expandweight.h:45
ExpandStats(Xapian::doclength avlen_)
Constructor for expansion schemes which do not require the "expand_k" parameter.
Definition: expandweight.h:65
double get_avlen() const
Return the average length of the database.
Definition: expandweight.h:194
Xapian::doccount get_rsize() const
Return the number of documents in the RSet.
Definition: expandweight.h:197
Class for calculating ESet term weights.
Definition: expandweight.h:114
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::doccount rsize
The number of documents in the RSet.
Definition: expandweight.h:125
Xapian::totallength collection_len
The total length of the database.
Definition: expandweight.h:131
Xapian::totallength get_collection_len() const
Return the length of the collection.
Definition: expandweight.h:203
void accumulate(size_t shard_index, Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:76
bool use_exact_termfreq
Should we calculate the exact term frequency when generating an ESet?
Definition: expandweight.h:143
ExpandWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, double expand_k_)
Constructor.
Definition: expandweight.h:171
ExpandStats(Xapian::doclength avlen_, double expand_k_)
Constructor for expansion schemes which require the "expand_k" parameter.
Definition: expandweight.h:71
TradEWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, double expand_k_)
Constructor.
Definition: expandweight.h:226
std::vector< bool > dbs_seen
Which databases in a multidb are included in termfreq.
Definition: expandweight.h:39
double doclength
A normalised document length.
Definition: types.h:59
Xapian::termcount rcollection_freq
The number of times the term occurs in the rset.
Definition: expandweight.h:55
double multiplier
The multiplier to be used in TradWeight query expansion.
Definition: expandweight.h:61
API for working with Xapian databases.
const Xapian::Database db
The combined database.
Definition: expandweight.h:116
Xapian::doclength avlen
Average document length in the whole database.
Definition: expandweight.h:42
Xapian::doccount dbsize
The number of documents in the whole database.
Definition: expandweight.h:119
Xapian::termcount get_collection_freq() const
Return the collection frequency of the term.
Definition: expandweight.h:200
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
ExpandStats stats
An ExpandStats object to accumulate statistics.
Definition: expandweight.h:191
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
Xapian::doccount termfreq
Term frequency (for a multidb, may be for a subset of the databases).
Definition: expandweight.h:52
This class implements the TradWeight scheme for query expansion.
Definition: expandweight.h:213
Abstract base class for termlists.
Xapian::termcount collection_freq
The collection frequency of the term.
Definition: expandweight.h:128
ExpandWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_)
Constructor.
Definition: expandweight.h:154
Xapian::doccount get_dbsize() const
Return the size of the database.
Definition: expandweight.h:206
Types used internally.
Xapian::doccount dbsize
Size of the subset of a multidb to which the value in termfreq applies.
Definition: expandweight.h:49
This class implements the Bo1 scheme for query expansion.
Definition: expandweight.h:246