xapian-core  2.0.0
expandweight.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2011,2016,2019,2023,2024 Olly Betts
5  * Copyright (C) 2013 Aarsh Shah
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
24 
25 #include <xapian/database.h>
26 
27 #include "api/termlist.h"
28 #include "internaltypes.h"
29 
30 #include <string>
31 #include <vector>
32 
33 namespace Xapian {
34 namespace Internal {
35 
/// Collates statistics while calculating term weight in an ESet.
//
// NOTE(review): this listing omits several declarations which the code below
// reads and writes (avlen, dbsize, termfreq, rcollection_freq, rtermfreq, and
// the wdf/doclen parameters of accumulate()) - check the real header before
// editing.
37 class ExpandStats {
    /// Which databases in a multidb are included in termfreq.
39  std::vector<bool> dbs_seen;
40 
43 
    /// The parameter k to be used for probabilistic query expansion.
45  double expand_k;
46 
47  public:
50 
53 
56 
59 
    /// The multiplier to be used in probabilistic query expansion.
61  double multiplier;
62 
    /** Constructor.
     *
     *  @param avlen_     Stored in avlen.
     *  @param expand_k_  Stored in expand_k (defaults to 0.0).
     *
     *  NOTE(review): the counters zeroed by clear_stats() are not initialised
     *  here; presumably the declarations missing from this listing carry
     *  in-class initialisers - confirm.
     */
68  ExpandStats(Xapian::doclength avlen_, double expand_k_ = 0.0)
69  : avlen(avlen_), expand_k(expand_k_) { }
70 
    /** Add one document's contribution for the current term.
     *
     *  Each call increments rtermfreq by one, adds wdf to rcollection_freq
     *  and adds a length-normalised term to multiplier.  The first time a
     *  given shard_index is seen, subdbsize is added to dbsize and subtf is
     *  added to termfreq.
     *
     *  @param shard_index  Index into dbs_seen identifying the sub-database;
     *                      dbs_seen is grown on demand to hold it.
     *  @param subtf        Added to termfreq the first time shard_index is
     *                      seen.
     *  @param subdbsize    Added to dbsize the first time shard_index is
     *                      seen.
     */
71  void accumulate(size_t shard_index,
73  Xapian::doccount subtf, Xapian::doccount subdbsize)
74  {
75  // Boolean terms may have wdf == 0, but treat that as 1 so such terms
76  // get a non-zero weight.
77  if (wdf == 0) wdf = 1;
78  ++rtermfreq;
79  rcollection_freq += wdf;
80 
    // With expand_k == 0 the added value reduces to wdf / wdf, i.e. 1.
    // NOTE(review): if avlen is 0 and expand_k * doclen is also 0, the
    // sub-expression below is 0.0 / 0.0 and multiplier becomes NaN - confirm
    // that avlen > 0 is guaranteed by callers.
81  multiplier += (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf);
82 
83  // If we've not seen this sub-database before, then update dbsize and
84  // termfreq and note that we have seen it.
85  if (shard_index >= dbs_seen.size() || !dbs_seen[shard_index]) {
86  if (shard_index >= dbs_seen.size()) {
    // resize() value-initialises the new entries to false.
87  dbs_seen.resize(shard_index + 1);
88  }
89  dbs_seen[shard_index] = true;
90  dbsize += subdbsize;
91  termfreq += subtf;
92  }
93  }
94 
    /// Return the average document length in the database.
96  double get_average_length() const { return avlen; }
97 
    /** Reset for the next term.
     *
     *  Empties dbs_seen and zeroes dbsize, termfreq, rcollection_freq,
     *  rtermfreq and multiplier; avlen and expand_k are left unchanged.
     */
99  void clear_stats() {
100  dbs_seen.clear();
101  dbsize = 0;
102  termfreq = 0;
103  rcollection_freq = 0;
104  rtermfreq = 0;
105  multiplier = 0;
106  }
107 };
108 
113 
116 
119 
122 
125 
137 
140 
141  public:
155  Xapian::doccount rsize_,
156  bool use_exact_termfreq_,
157  bool want_collection_freq_,
158  double expand_k_ = 0.0)
    // Initialiser list of the ExpandWeight constructor (its opening line is
    // not reproduced in this listing).  The database handle is copied into
    // db and then queried for the document count (dbsize), the total length
    // (collection_len) and the average length, which is handed to stats
    // together with expand_k_.  The remaining arguments are stored as given.
159  : db(db_), dbsize(db.get_doccount()),
160  rsize(rsize_),
161  collection_len(db.get_total_length()),
162  use_exact_termfreq(use_exact_termfreq_),
163  want_collection_freq(want_collection_freq_),
164  stats(db.get_average_length(), expand_k_) {}
165 
    /** Get the term statistics.
     *
     *  Declared here only; the body is not part of this header.
     *
     *  @param merger  TermList pointer passed in by the caller.
     *  @param term    The term, passed by const reference.
     */
171  void collect_stats(TermList* merger, const std::string& term);
172 
    /// Calculate the weight.  Pure virtual, so each expansion scheme must
    /// supply its own implementation.
174  virtual double get_weight() const = 0;
175 
176  protected:
179 
    /// Return the average length of the database, as held by stats.
181  double get_average_length() const { return stats.get_average_length(); }
182 
    /// Return the number of documents in the RSet.
184  Xapian::doccount get_rsize() const { return rsize; }
185 
188 
191 
    /// Return the size of the database (the number of documents in the whole
    /// database).
193  Xapian::doccount get_dbsize() const { return dbsize; }
194 };
195 
/// This class implements the probabilistic scheme for query expansion.
200 class ProbEWeight : public ExpandWeight {
201  public:
    // Constructor (its opening line, which takes the database as db_, is not
    // reproduced in this listing).  Forwards every argument to ExpandWeight
    // and passes false for want_collection_freq_; expand_k_ has no default
    // here.
214  Xapian::doccount rsize_,
215  bool use_exact_termfreq_,
216  double expand_k_)
217  : ExpandWeight(db_, rsize_, use_exact_termfreq_, false, expand_k_) { }
218 
    /// Calculate the weight (implements ExpandWeight::get_weight()).
219  double get_weight() const;
220 };
221 
/// This class implements the Bo1 scheme for query expansion.
233 class Bo1EWeight : public ExpandWeight {
234  public:
    // Constructor (its opening line, which takes the database as db_, is not
    // reproduced in this listing).  Forwards to ExpandWeight with
    // want_collection_freq_ set to true; expand_k_ is left at the base
    // class default of 0.0.
246  Xapian::doccount rsize_,
247  bool use_exact_termfreq_)
248  : ExpandWeight(db_, rsize_, use_exact_termfreq_, true) {}
249 
    /// Calculate the weight (implements ExpandWeight::get_weight()).
250  double get_weight() const;
251 };
252 
253 }
254 }
255 
256 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H
An indexed database of documents.
Definition: database.h:75
This class implements the Bo1 scheme for query expansion.
Definition: expandweight.h:233
double get_weight() const
Calculate the weight.
Definition: bo1eweight.cc:33
Bo1EWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_)
Constructor.
Definition: expandweight.h:245
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
Xapian::doclength avlen
Average document length in the whole database.
Definition: expandweight.h:42
Xapian::doccount termfreq
Term frequency (for a multidb, may be for a subset of the databases).
Definition: expandweight.h:52
ExpandStats(Xapian::doclength avlen_, double expand_k_=0.0)
Constructor.
Definition: expandweight.h:68
void accumulate(size_t shard_index, Xapian::termcount wdf, Xapian::termcount doclen, Xapian::doccount subtf, Xapian::doccount subdbsize)
Definition: expandweight.h:71
double get_average_length() const
Return the average document length in the database.
Definition: expandweight.h:96
void clear_stats()
Reset for the next term.
Definition: expandweight.h:99
Xapian::doccount rtermfreq
The number of documents from the RSet indexed by the current term (r).
Definition: expandweight.h:58
double multiplier
The multiplier to be used in probabilistic query expansion.
Definition: expandweight.h:61
Xapian::doccount dbsize
Size of the subset of a multidb to which the value in termfreq applies.
Definition: expandweight.h:49
Xapian::termcount rcollection_freq
The number of times the term occurs in the rset.
Definition: expandweight.h:55
double expand_k
The parameter k to be used for probabilistic query expansion.
Definition: expandweight.h:45
std::vector< bool > dbs_seen
Which databases in a multidb are included in termfreq.
Definition: expandweight.h:39
Class for calculating ESet term weights.
Definition: expandweight.h:110
void collect_stats(TermList *merger, const std::string &term)
Get the term statistics.
Definition: expandweight.cc:37
Xapian::doccount dbsize
The number of documents in the whole database.
Definition: expandweight.h:115
Xapian::totallength get_collection_len() const
Return the length of the collection.
Definition: expandweight.h:190
bool use_exact_termfreq
Should we calculate the exact term frequency when generating an ESet?
Definition: expandweight.h:136
Xapian::doccount get_rsize() const
Return the number of documents in the RSet.
Definition: expandweight.h:184
double get_average_length() const
Return the average length of the database.
Definition: expandweight.h:181
virtual double get_weight() const =0
Calculate the weight.
ExpandWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, bool want_collection_freq_, double expand_k_=0.0)
Constructor.
Definition: expandweight.h:154
const Xapian::Database db
The combined database.
Definition: expandweight.h:112
bool want_collection_freq
Does the expansion scheme use collection frequency?
Definition: expandweight.h:139
Xapian::doccount get_dbsize() const
Return the size of the database.
Definition: expandweight.h:193
Xapian::totallength collection_len
The total length of the database.
Definition: expandweight.h:124
Xapian::termcount collection_freq
The collection frequency of the term.
Definition: expandweight.h:121
Xapian::termcount get_collection_freq() const
Return the collection frequency of the term.
Definition: expandweight.h:187
ExpandStats stats
ExpandStats object to accumulate statistics.
Definition: expandweight.h:178
Xapian::doccount rsize
The number of documents in the RSet.
Definition: expandweight.h:118
This class implements the probabilistic scheme for query expansion.
Definition: expandweight.h:200
double get_weight() const
Calculate the weight.
Definition: probeweight.cc:32
ProbEWeight(const Xapian::Database &db_, Xapian::doccount rsize_, bool use_exact_termfreq_, double expand_k_)
Constructor.
Definition: expandweight.h:213
Abstract base class for termlists.
Definition: termlist.h:42
An indexed database of documents.
string term
#define true
Definition: header.h:8
#define false
Definition: header.h:9
Types used internally.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
double doclength
A normalised document length.
Definition: types.h:58
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
Abstract base class for termlists.