xapian-core  1.4.26
weightinternal.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007 Lemur Consulting Ltd
5  * Copyright (C) 2009,2010,2011,2013,2014,2015,2020 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
24 
25 #include "xapian/weight.h"
26 
27 #include "xapian/database.h"
28 #include "xapian/query.h"
29 
30 #include "backends/database.h"
31 #include "internaltypes.h"
32 #include "omassert.h"
33 
34 #include <algorithm>
35 #include <map>
36 #include <string>
37 
39 struct TermFreqs {
43 
44  double max_part = 0.0;
45 
46  TermFreqs() {}
48  Xapian::doccount reltermfreq_,
49  Xapian::termcount collfreq_,
50  double max_part_ = 0.0)
51  : termfreq(termfreq_),
52  reltermfreq(reltermfreq_),
53  collfreq(collfreq_),
54  max_part(max_part_) {}
55 
56  void operator+=(const TermFreqs & other) {
57  termfreq += other.termfreq;
58  reltermfreq += other.reltermfreq;
59  collfreq += other.collfreq;
60  // max_part shouldn't be set yet.
61  Assert(max_part == 0.0);
62  Assert(other.max_part == 0.0);
63  }
64 
66  std::string get_description() const;
67 };
68 
69 namespace Xapian {
70 
71 class RSet;
72 
75 #ifdef XAPIAN_ASSERTIONS
76 
77  size_t subdbs = 0;
78 
83  mutable bool finalised = false;
84 #endif
85 
86  public:
88  Xapian::totallength total_length = 0;
89 
91  Xapian::doccount collection_size = 0;
92 
94  Xapian::doccount rset_size = 0;
95 
100  bool have_max_part = false;
101 
104 
107 
110  std::map<std::string, TermFreqs> termfreqs;
111 
112  Internal() { }
113 
119  Internal & operator+=(const Internal & inc);
120 
121  void merge(const Weight::Internal& o);
122 
123  void set_query(const Xapian::Query &query_) {
124  AssertEq(subdbs, 0);
125  query = query_;
126  }
127 
129  void accumulate_stats(const Xapian::Database::Internal &sub_db,
130  const Xapian::RSet &rset);
131 
143  bool get_stats(const std::string & term,
146  Xapian::termcount & collfreq) const {
147 #ifdef XAPIAN_ASSERTIONS
148  finalised = true;
149 #endif
150  // We pass an empty std::string for term when calculating the extra
151  // weight.
152  if (term.empty()) {
153  termfreq = collection_size;
154  collfreq = collection_size;
155  reltermfreq = rset_size;
156  return true;
157  }
158 
159  auto i = termfreqs.find(term);
160  if (i == termfreqs.end()) {
161  termfreq = reltermfreq = collfreq = 0;
162  return false;
163  }
164 
165  termfreq = i->second.termfreq;
166  reltermfreq = i->second.reltermfreq;
167  collfreq = i->second.collfreq;
168  return true;
169  }
170 
172  bool get_stats(const std::string & term,
173  Xapian::doccount & termfreq) const {
174  Xapian::doccount dummy1;
175  Xapian::termcount dummy2;
176  return get_stats(term, termfreq, dummy1, dummy2);
177  }
178 
180  bool get_termweight(const std::string & term, double & termweight) const {
181 #ifdef XAPIAN_ASSERTIONS
182  finalised = true;
183 #endif
184  termweight = 0.0;
185  if (term.empty()) {
186  return false;
187  }
188 
189  auto i = termfreqs.find(term);
190  if (i == termfreqs.end()) {
191  return false;
192  }
193 
194  termweight = i->second.max_part;
195  return true;
196  }
197 
202  void get_max_termweight(double & min_tw, double & max_tw) {
203  auto i = termfreqs.begin();
204  while (i != termfreqs.end() && i->second.max_part == 0.0) ++i;
205  if (rare(i == termfreqs.end())) {
206  min_tw = max_tw = 0.0;
207  return;
208  }
209  min_tw = max_tw = i->second.max_part;
210  while (++i != termfreqs.end()) {
211  double max_part = i->second.max_part;
212  if (max_part > max_tw) {
213  max_tw = max_part;
214  } else if (max_part < min_tw && max_part != 0.0) {
215  min_tw = max_part;
216  }
217  }
218  }
219 
221  void set_max_part(const std::string & term, double max_part) {
222  Assert(!term.empty());
223  auto i = termfreqs.find(term);
224  if (i != termfreqs.end()) {
225  have_max_part = true;
226  double& val = i->second.max_part;
227  val = std::max(val, max_part);
228  }
229  }
230 
232 #ifdef XAPIAN_ASSERTIONS
233  finalised = true;
234 #endif
235  if (rare(collection_size == 0)) return 0;
236  return Xapian::doclength(total_length) / collection_size;
237  }
238 
241  Assert(!finalised);
242  db = db_;
243  }
244 
246  std::string get_description() const;
247 };
248 
249 }
250 
251 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define Assert(COND)
Definition: omassert.h:122
size_t sub_db(Xapian::docid did, size_t n_dbs)
Definition: omdatabase.cc:76
#define AssertEq(A, B)
Definition: omassert.h:124
double max_part
This class is used to access a database, or a group of databases.
Definition: database.h:68
Xapian::Database db
Database to get the bounds on doclength and wdf from.
void operator+=(const TermFreqs &other)
std::string get_description() const
Return a std::string describing this object.
Base class for databases.
Definition: database.h:57
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
void set_query(const Xapian::Query &query_)
bool get_stats(const std::string &term, Xapian::doccount &termfreq, Xapian::doccount &reltermfreq, Xapian::termcount &collfreq) const
Get the frequencies for the given term.
#define rare(COND)
Definition: config.h:575
Xapian::doccount termfreq
std::map< std::string, TermFreqs > termfreqs
Map of term frequencies and relevant term frequencies for the collection.
Xapian::Query API class.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Xapian::Query query
The query.
double doclength
A normalised document length.
Definition: types.h:59
Class to hold statistics for a given collection.
TermFreqs(Xapian::doccount termfreq_, Xapian::doccount reltermfreq_, Xapian::termcount collfreq_, double max_part_=0.0)
API for working with Xapian databases.
void get_max_termweight(double &min_tw, double &max_tw)
Get the minimum and maximum termweights.
Xapian::termcount collfreq
bool get_termweight(const std::string &term, double &termweight) const
Get the termweight.
void set_bounds_from_db(const Xapian::Database &db_)
Set the "bounds" stats from Database db.
Weighting scheme API.
Xapian::doclength get_average_length() const
The frequencies for a term.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
bool get_stats(const std::string &term, Xapian::doccount &termfreq) const
Get just the termfreq.
void set_max_part(const std::string &term, double max_part)
Set max_part for a term.
Various assertion macros.
Class representing a query.
Definition: query.h:46
Xapian::doccount reltermfreq
Types used internally.
A relevance set (R-Set).
Definition: enquire.h:60