xapian-core  2.0.0
weightinternal.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007 Lemur Consulting Ltd
5  * Copyright (C) 2009,2010,2011,2012,2013,2014,2015,2017,2020,2024 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "weightinternal.h"
25 
26 #include "xapian/enquire.h"
27 
28 #include "min_non_zero.h"
29 #include "omassert.h"
30 #include "api/rsetinternal.h"
31 #include "str.h"
32 #include "api/termlist.h"
33 
34 #include <memory>
35 
36 using namespace std;
37 
38 string
39 TermFreqs::get_description() const {
40  string desc("TermFreqs(termfreq=");
41  desc += str(termfreq);
42  desc += ", reltermfreq=";
43  desc += str(reltermfreq);
44  desc += ", collfreq=";
45  desc += str(collfreq);
46  desc += ", max_part=";
47  desc += str(max_part);
48  desc += ")";
49  return desc;
50 }
51 
52 namespace Xapian {
53 
55 Weight::Internal::operator+=(const Weight::Internal & inc)
56 {
57 #ifdef XAPIAN_ASSERTIONS
58  Assert(!finalised);
59  subdbs += inc.subdbs;
60 #endif
61  total_length += inc.total_length;
62  collection_size += inc.collection_size;
63  rset_size += inc.rset_size;
64 
65  db_doclength_lower_bound = min_non_zero(db_doclength_lower_bound,
67  db_doclength_upper_bound = std::max(db_doclength_upper_bound,
69 
70  db_unique_terms_lower_bound = min_non_zero(db_unique_terms_lower_bound,
72  db_unique_terms_upper_bound = std::max(db_unique_terms_upper_bound,
74 
75  // Add termfreqs and reltermfreqs
76  for (auto&& i : inc.termfreqs) {
77  termfreqs[i.first] += i.second;
78  }
79  return *this;
80 }
81 
82 void
83 Weight::Internal::accumulate_stats(const Xapian::Database::Internal &subdb,
84  const Xapian::RSet &rset)
85 {
86 #ifdef XAPIAN_ASSERTIONS
87  Assert(!finalised);
88  ++subdbs;
89 #endif
90  total_length += subdb.get_total_length();
91  collection_size += subdb.get_doccount();
92  rset_size += rset.size();
93 
94  db_doclength_lower_bound = min_non_zero(db_doclength_lower_bound,
96  db_doclength_upper_bound = std::max(db_doclength_upper_bound,
98  db_unique_terms_lower_bound =
99  min_non_zero(db_unique_terms_lower_bound,
101  db_unique_terms_upper_bound =
102  std::max(db_unique_terms_upper_bound,
104 
106  for (t = query.get_unique_terms_begin(); t != Xapian::TermIterator(); ++t) {
107  const string & term = *t;
108 
109  Xapian::doccount sub_tf;
110  Xapian::termcount sub_cf;
111  subdb.get_freqs(term, &sub_tf, &sub_cf);
112  TermFreqs & tf = termfreqs[term];
113  tf.termfreq += sub_tf;
114  tf.collfreq += sub_cf;
115  }
116 
117  if (!rset.internal)
118  return;
119 
120  for (Xapian::docid did : rset.internal->docs) {
121  Assert(did);
122  // The query is likely to contain far fewer terms than the documents,
123  // and we can skip the document's termlist, so look for each query term
124  // in the document.
125  unique_ptr<TermList> tl(subdb.open_term_list(did));
126  for (auto&& i : termfreqs) {
127  const string& term = i.first;
128  TermList * ret = tl->skip_to(term);
129  if (ret != NULL) {
130  // No more entries prune shouldn't happen).
131  Assert(ret == tl.get());
132  break;
133  }
134  if (term == tl->get_termname())
135  ++i.second.reltermfreq;
136  }
137  }
138 }
139 
140 void
141 Weight::Internal::merge(const Weight::Internal& o)
142 {
143  if (!o.have_max_part) return;
144  for (auto i : o.termfreqs) {
145  double& max_part = termfreqs[i.first].max_part;
146  max_part = max(max_part, i.second.max_part);
147  }
148 }
149 
150 string
151 Weight::Internal::get_description() const
152 {
153  string desc = "Weight::Internal(totlen=";
154  desc += str(total_length);
155  desc += ", collection_size=";
156  desc += str(collection_size);
157  desc += ", rset_size=";
158  desc += str(rset_size);
159 #ifdef XAPIAN_ASSERTIONS
160  desc += ", subdbs=";
161  desc += str(subdbs);
162  desc += ", finalised=";
163  desc += str(finalised);
164 #endif
165  desc += ", termfreqs={";
166  for (auto i = termfreqs.begin(); i != termfreqs.end(); ++i) {
167  if (i != termfreqs.begin())
168  desc += ", ";
169  desc += i->first;
170  desc += " => ";
171  desc += i->second.get_description();
172  }
173  desc += "})";
174  return desc;
175 }
176 
177 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
Virtual base class for Database internals.
virtual TermList * open_term_list(docid did) const =0
virtual termcount get_unique_terms_lower_bound() const
Get a lower bound on the unique terms size of a document in this DB.
virtual termcount get_doclength_upper_bound() const =0
Get an upper bound on the length of a document in this DB.
virtual totallength get_total_length() const =0
Return the total length of all documents in this database.
virtual void get_freqs(std::string_view term, doccount *termfreq_ptr, termcount *collfreq_ptr) const =0
Returns frequencies for a term.
virtual doccount get_doccount() const =0
virtual termcount get_unique_terms_upper_bound() const
Get an upper bound on the unique terms size of a document in this DB.
virtual termcount get_doclength_lower_bound() const =0
Get a lower bound on the length of a document in this DB.
const TermIterator get_unique_terms_begin() const
Begin iterator for unique terms in the query object.
Definition: query.cc:223
Class representing a set of documents judged as relevant.
Definition: rset.h:39
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: rset.h:42
Xapian::doccount size() const
Return number of documents in this RSet object.
Definition: rset.cc:49
Abstract base class for termlists.
Definition: termlist.h:42
virtual Internal * skip_to(std::string_view term)=0
Skip forward to the specified term.
Class for iterating over a list of terms.
Definition: termiterator.h:41
Class to hold statistics for a given collection.
Xapian::totallength total_length
Total length of all documents in the collection.
Xapian::termcount db_doclength_upper_bound
An upper bound on the maximum length of any document in the database.
bool have_max_part
Has max_part been set for any term?
Xapian::termcount db_doclength_lower_bound
A lower bound on the minimum length of any document in the database.
Xapian::termcount db_unique_terms_lower_bound
A lower bound on the number of unique terms in any document.
Xapian::doccount rset_size
Number of relevant documents in the collection.
Xapian::doccount collection_size
Number of documents in the collection.
std::map< std::string, TermFreqs, std::less<> > termfreqs
Map of term frequencies and relevant term frequencies for the collection.
Xapian::termcount db_unique_terms_upper_bound
An upper bound on the number of unique terms in any document.
string term
Querying session.
Return the smaller of two numbers which isn't zero.
constexpr std::enable_if_t< std::is_unsigned_v< T >, T > min_non_zero(const T &a, const T &b)
Return the smaller of two unsigned integers which isn't zero.
Definition: min_non_zero.h:39
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Set of documents judged as relevant.
Convert types to std::string.
The frequencies for a term.
Xapian::termcount collfreq
Abstract base class for termlists.
Xapian::Weight::Internal class, holding database and term statistics.