xapian-core  1.4.25
weightinternal.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007 Lemur Consulting Ltd
5  * Copyright (C) 2009,2010,2011,2013,2014,2015,2020 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
24 
25 #include "xapian/weight.h"
26 
27 #include "xapian/database.h"
28 #include "xapian/query.h"
29 
30 #include "backends/database.h"
31 #include "internaltypes.h"
32 #include "omassert.h"
33 
34 #include <map>
35 #include <string>
36 
38 struct TermFreqs {
42  double max_part;
43 
44  TermFreqs() : termfreq(0), reltermfreq(0), collfreq(0), max_part(0.0) {}
46  Xapian::doccount reltermfreq_,
47  Xapian::termcount collfreq_,
48  double max_part_ = 0.0)
49  : termfreq(termfreq_),
50  reltermfreq(reltermfreq_),
51  collfreq(collfreq_),
52  max_part(max_part_) {}
53 
54  void operator+=(const TermFreqs & other) {
55  termfreq += other.termfreq;
56  reltermfreq += other.reltermfreq;
57  collfreq += other.collfreq;
58  max_part += other.max_part;
59  }
60 
62  std::string get_description() const;
63 };
64 
65 namespace Xapian {
66 
67 class RSet;
68 
71 #ifdef XAPIAN_ASSERTIONS
72 
73  size_t subdbs = 0;
74 
79  mutable bool finalised = false;
80 #endif
81 
82  public:
84  Xapian::totallength total_length = 0;
85 
87  Xapian::doccount collection_size = 0;
88 
90  Xapian::doccount rset_size = 0;
91 
96  bool have_max_part = false;
97 
100 
103 
106  std::map<std::string, TermFreqs> termfreqs;
107 
108  Internal() { }
109 
115  Internal & operator+=(const Internal & inc);
116 
117  void merge(const Weight::Internal& o);
118 
119  void set_query(const Xapian::Query &query_) {
120  AssertEq(subdbs, 0);
121  query = query_;
122  }
123 
125  void accumulate_stats(const Xapian::Database::Internal &sub_db,
126  const Xapian::RSet &rset);
127 
139  bool get_stats(const std::string & term,
142  Xapian::termcount & collfreq) const {
143 #ifdef XAPIAN_ASSERTIONS
144  finalised = true;
145 #endif
146  // We pass an empty std::string for term when calculating the extra
147  // weight.
148  if (term.empty()) {
149  termfreq = collection_size;
150  collfreq = collection_size;
151  reltermfreq = rset_size;
152  return true;
153  }
154 
155  auto i = termfreqs.find(term);
156  if (i == termfreqs.end()) {
157  termfreq = reltermfreq = collfreq = 0;
158  return false;
159  }
160 
161  termfreq = i->second.termfreq;
162  reltermfreq = i->second.reltermfreq;
163  collfreq = i->second.collfreq;
164  return true;
165  }
166 
168  bool get_stats(const std::string & term,
169  Xapian::doccount & termfreq) const {
170  Xapian::doccount dummy1;
171  Xapian::termcount dummy2;
172  return get_stats(term, termfreq, dummy1, dummy2);
173  }
174 
176  bool get_termweight(const std::string & term, double & termweight) const {
177 #ifdef XAPIAN_ASSERTIONS
178  finalised = true;
179 #endif
180  termweight = 0.0;
181  if (term.empty()) {
182  return false;
183  }
184 
185  auto i = termfreqs.find(term);
186  if (i == termfreqs.end()) {
187  return false;
188  }
189 
190  termweight = i->second.max_part;
191  return true;
192  }
193 
198  void get_max_termweight(double & min_tw, double & max_tw) {
199  auto i = termfreqs.begin();
200  while (i != termfreqs.end() && i->second.max_part == 0.0) ++i;
201  if (rare(i == termfreqs.end())) {
202  min_tw = max_tw = 0.0;
203  return;
204  }
205  min_tw = max_tw = i->second.max_part;
206  while (++i != termfreqs.end()) {
207  double max_part = i->second.max_part;
208  if (max_part > max_tw) {
209  max_tw = max_part;
210  } else if (max_part < min_tw && max_part != 0.0) {
211  min_tw = max_part;
212  }
213  }
214  }
215 
217  void set_max_part(const std::string & term, double max_part) {
218  have_max_part = true;
219  Assert(!term.empty());
220  auto i = termfreqs.find(term);
221  if (i != termfreqs.end())
222  i->second.max_part += max_part;
223  }
224 
226 #ifdef XAPIAN_ASSERTIONS
227  finalised = true;
228 #endif
229  if (rare(collection_size == 0)) return 0;
230  return Xapian::doclength(total_length) / collection_size;
231  }
232 
235  Assert(!finalised);
236  db = db_;
237  }
238 
240  std::string get_description() const;
241 };
242 
243 }
244 
245 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define Assert(COND)
Definition: omassert.h:122
size_t sub_db(Xapian::docid did, size_t n_dbs)
Definition: omdatabase.cc:76
#define AssertEq(A, B)
Definition: omassert.h:124
double max_part
This class is used to access a database, or a group of databases.
Definition: database.h:68
Xapian::Database db
Database to get the bounds on doclength and wdf from.
void operator+=(const TermFreqs &other)
std::string get_description() const
Return a std::string describing this object.
Base class for databases.
Definition: database.h:57
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
void set_query(const Xapian::Query &query_)
bool get_stats(const std::string &term, Xapian::doccount &termfreq, Xapian::doccount &reltermfreq, Xapian::termcount &collfreq) const
Get the frequencies for the given term.
#define rare(COND)
Definition: config.h:565
Xapian::doccount termfreq
std::map< std::string, TermFreqs > termfreqs
Map of term frequencies and relevant term frequencies for the collection.
Xapian::Query API class.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::Query query
The query.
double doclength
A normalised document length.
Definition: types.h:59
Class to hold statistics for a given collection.
TermFreqs(Xapian::doccount termfreq_, Xapian::doccount reltermfreq_, Xapian::termcount collfreq_, double max_part_=0.0)
API for working with Xapian databases.
void get_max_termweight(double &min_tw, double &max_tw)
Get the minimum and maximum termweights.
Xapian::termcount collfreq
bool get_termweight(const std::string &term, double &termweight) const
Get the termweight.
void set_bounds_from_db(const Xapian::Database &db_)
Set the "bounds" stats from Database db.
Weighting scheme API.
Xapian::doclength get_average_length() const
The frequencies for a term.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
bool get_stats(const std::string &term, Xapian::doccount &termfreq) const
Get just the termfreq.
void set_max_part(const std::string &term, double max_part)
Set max_part for a term.
Various assertion macros.
Class representing a query.
Definition: query.h:46
Xapian::doccount reltermfreq
Types used internally.
A relevance set (R-Set).
Definition: enquire.h:60