xapian-core  1.4.27
expandweight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2011,2016,2023 Olly Betts
5  * Copyright (C) 2013 Aarsh Shah
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
24 
25 #include <xapian/database.h>
26 
27 #include "api/termlist.h"
28 #include "internaltypes.h"
29 
30 #include <string>
31 #include <vector>
32 
33 namespace Xapian {
34 namespace Internal {
35 
37 class ExpandStats {
39  std::vector<bool> dbs_seen;
40 
43 
45  double expand_k;
46 
47  public:
50 
53 
56 
59 
61  double multiplier;
62 
65  explicit ExpandStats(Xapian::doclength avlen_)
66  : avlen(avlen_), expand_k(0), dbsize(0), termfreq(0),
67  rcollection_freq(0), rtermfreq(0), multiplier(0) {
68  }
69 
71  ExpandStats(Xapian::doclength avlen_, double expand_k_)
72  : avlen(avlen_), expand_k(expand_k_), dbsize(0), termfreq(0),
73  rcollection_freq(0), rtermfreq(0), multiplier(0) {
74  }
75 
76  void accumulate(size_t shard_index,
78  Xapian::doccount subtf, Xapian::doccount subdbsize)
79  {
80  // Boolean terms may have wdf == 0, but treat that as 1 so such terms
81  // get a non-zero weight.
82  if (wdf == 0) wdf = 1;
83  ++rtermfreq;
84  rcollection_freq += wdf;
85 
86  multiplier += (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf);
87 
88  // If we've not seen this sub-database before, then update dbsize and
89  // termfreq and note that we have seen it.
90  if (shard_index >= dbs_seen.size() || !dbs_seen[shard_index]) {
91  if (shard_index >= dbs_seen.size()) {
92  dbs_seen.resize(shard_index + 1);
93  }
94  dbs_seen[shard_index] = true;
95  dbsize += subdbsize;
96  termfreq += subtf;
97  }
98  }
99 
100  /* Clear the statistics collected in the ExpandStats object before using it
101  * for a new term. */
102  void clear_stats()
103  {
104  dbs_seen.clear();
105  dbsize = 0;
106  termfreq = 0;
107  rcollection_freq = 0;
108  rtermfreq = 0;
109  multiplier = 0;
110  }
111 };
112 
117 
120 
123 
126 
129 
132 
144 
147 
148  public:
158  Xapian::doccount rsize_,
159  bool use_exact_termfreq_,
160  bool want_collection_freq_)
161  : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
162  rsize(rsize_), collection_freq(0),
163  collection_len(avlen * dbsize + .5),
164  use_exact_termfreq(use_exact_termfreq_),
165  want_collection_freq(want_collection_freq_),
166  stats(avlen) {}
167 
178  Xapian::doccount rsize_,
179  bool use_exact_termfreq_,
180  bool want_collection_freq_,
181  double expand_k_)
182  : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
183  rsize(rsize_), collection_freq(0),
184  collection_len(avlen * dbsize + .5),
185  use_exact_termfreq(use_exact_termfreq_),
186  want_collection_freq(want_collection_freq_),
187  stats(avlen, expand_k_) {}
188 
193  void collect_stats(TermList * merger, const std::string & term);
194 
196  virtual double get_weight() const = 0;
197 
198  protected:
201 
203  double get_avlen() const { return avlen; }
204 
206  Xapian::doccount get_rsize() const { return rsize; }
207 
209  Xapian::termcount get_collection_freq() const { return collection_freq; }
210 
212  Xapian::totallength get_collection_len() const { return collection_len; }
213 
215  Xapian::doccount get_dbsize() const { return dbsize; }
216 };
217 
222 class TradEWeight : public ExpandWeight {
223  public:
236  Xapian::doccount rsize_,
237  bool use_exact_termfreq_,
238  double expand_k_)
239  : ExpandWeight(db_, rsize_, use_exact_termfreq_, false, expand_k_) { }
240 
241  double get_weight() const;
242 };
243 
255 class Bo1EWeight : public ExpandWeight {
256  public:
268  Xapian::doccount rsize_,
269  bool use_exact_termfreq_)
270  : ExpandWeight(db_, rsize_, use_exact_termfreq_, true) {}
271 
272  double get_weight() const;
273 };
274 
275 }
276 }
277 
278 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
ExpandWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, bool want_collection_freq_, double expand_k_)
Constructor.
Definition: expandweight.h:177
This class is used to access a database, or a group of databases.
Definition: database.h:68
#define true
Definition: header.h:8
Xapian::doccount rtermfreq
The number of documents from the RSet indexed by the current term (r).
Definition: expandweight.h:58
Bo1EWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_)
Constructor.
Definition: expandweight.h:267
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
Xapian::doclength avlen
Average document length in the whole database.
Definition: expandweight.h:122
Abstract base class for termlists.
Definition: termlist.h:39
double expand_k
The parameter k to be used for TradWeight query expansion.
Definition: expandweight.h:45
ExpandStats(Xapian::doclength avlen_)
Constructor for expansion schemes which do not require the "expand_k" parameter.
Definition: expandweight.h:65
double get_avlen() const
Return the average length of the database.
Definition: expandweight.h:203
Xapian::doccount get_rsize() const
Return the number of documents in the RSet.
Definition: expandweight.h:206
#define false
Definition: header.h:9
ExpandWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, bool want_collection_freq_)
Constructor.
Definition: expandweight.h:157
Class for calculating ESet term weights.
Definition: expandweight.h:114
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::doccount rsize
The number of documents in the RSet.
Definition: expandweight.h:125
Xapian::totallength collection_len
The total length of the database.
Definition: expandweight.h:131
Xapian::totallength get_collection_len() const
Return the length of the collection.
Definition: expandweight.h:212
void accumulate(size_t shard_index, Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:76
bool use_exact_termfreq
Should we calculate the exact term frequency when generating an ESet?
Definition: expandweight.h:143
ExpandStats(Xapian::doclength avlen_, double expand_k_)
Constructor for expansion schemes which require the "expand_k" parameter.
Definition: expandweight.h:71
TradEWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, double expand_k_)
Constructor.
Definition: expandweight.h:235
std::vector< bool > dbs_seen
Which databases in a multidb are included in termfreq.
Definition: expandweight.h:39
double doclength
A normalised document length.
Definition: types.h:59
Xapian::termcount rcollection_freq
The number of times the term occurs in the rset.
Definition: expandweight.h:55
double multiplier
The multiplier to be used in TradWeight query expansion.
Definition: expandweight.h:61
API for working with Xapian databases.
const Xapian::Database db
The combined database.
Definition: expandweight.h:116
Xapian::doclength avlen
Average document length in the whole database.
Definition: expandweight.h:42
Xapian::doccount dbsize
The number of documents in the whole database.
Definition: expandweight.h:119
Xapian::termcount get_collection_freq() const
Return the collection frequency of the term.
Definition: expandweight.h:209
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
ExpandStats stats
An ExpandStats object to accumulate statistics.
Definition: expandweight.h:200
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
Xapian::doccount termfreq
Term frequency (for a multidb, may be for a subset of the databases).
Definition: expandweight.h:52
This class implements the TradWeight scheme for query expansion.
Definition: expandweight.h:222
bool want_collection_freq
Does the expansion scheme use collection frequency?
Definition: expandweight.h:146
Abstract base class for termlists.
Xapian::termcount collection_freq
The collection frequency of the term.
Definition: expandweight.h:128
Xapian::doccount get_dbsize() const
Return the size of the database.
Definition: expandweight.h:215
Types used internally.
Xapian::doccount dbsize
Size of the subset of a multidb to which the value in termfreq applies.
Definition: expandweight.h:49
This class implements the Bo1 scheme for query expansion.
Definition: expandweight.h:255