xapian-core  2.0.0
esetinternal.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2008,2010,2011,2013,2016,2017,2018 Olly Betts
5  * Copyright (C) 2011 Action Without Borders
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "esetinternal.h"
25 
26 #include "xapian/enquire.h"
27 #include "xapian/expanddecider.h"
29 #include "backends/multi.h"
30 #include "debuglog.h"
31 #include "api/rsetinternal.h"
32 #include "expandweight.h"
33 #include "heap.h"
34 #include "omassert.h"
35 #include "ortermlist.h"
36 #include "str.h"
37 #include "api/termlist.h"
38 #include "termlistmerger.h"
40 
41 #include <functional>
42 #include <memory>
43 #include <set>
44 #include <string>
45 #include <vector>
46 
47 using namespace std;
48 
49 namespace Xapian {
50 
51 string
52 Internal::ExpandTerm::get_description() const
53 {
54  string desc("ExpandTerm(");
55  desc += str(wt);
56  desc += ", ";
57  description_append(desc, term);
58  desc += ')';
59  return desc;
60 }
61 
65 static TermList *
66 build_termlist_tree(const Xapian::Database &db, const RSet & rset)
67 {
68  Assert(!rset.empty());
69 
70  const set<Xapian::docid> & docids = rset.internal->docs;
71 
72  vector<TermList*> termlists;
73  termlists.reserve(docids.size());
74 
75  try {
76  for (Xapian::docid did : docids) {
77  termlists.push_back(db.internal->open_term_list_direct(did));
78  }
79  Assert(!termlists.empty());
80  return make_termlist_merger(termlists);
81  } catch (...) {
82  for_each(termlists.begin(), termlists.end(),
83  [](TermList* p) { delete p; });
84  throw;
85  }
86 }
87 
88 void
89 ESet::Internal::expand(Xapian::termcount max_esize,
90  const Xapian::Database & db,
91  const RSet & rset,
92  const Xapian::ExpandDecider * edecider,
94  double min_wt)
95 {
96  LOGCALL_VOID(EXPAND, "ESet::Internal::expand", max_esize | db | rset | edecider | eweight);
97  // These two cases are handled by our caller.
98  Assert(max_esize);
99  Assert(!rset.empty());
100  // This method should only be called once for a given ESet::Internal, so
101  // check we're empty.
102  Assert(ebound == 0);
103  Assert(items.empty());
104 
105  unique_ptr<TermList> tree(build_termlist_tree(db, rset));
106  Assert(tree);
107 
108  bool is_heap = false;
109  while (true) {
110  // See if the root needs replacing.
111  TermList * new_root = tree->next();
112  if (new_root == tree.get()) {
113  // No more entries.
114  break;
115  }
116  if (new_root) {
117  LOGLINE(EXPAND, "Replacing the root of the termlist tree");
118  tree.reset(new_root);
119  }
120 
121  string term = tree->get_termname();
122 
123  // If there's an ExpandDecider, see if it accepts the term.
124  if (edecider && !(*edecider)(term)) continue;
125 
126  ++ebound;
127 
128  /* Set up the ExpandWeight by clearing the existing statistics and
129  collecting statistics for the new term. */
130  eweight.collect_stats(tree.get(), term);
131 
132  double wt = eweight.get_weight();
133 
134  // If the weights are equal, we prefer the lexically smaller term and
135  // since we process terms in ascending order we use "<=" not "<" here.
136  if (wt <= min_wt) continue;
137 
138  if (items.size() < max_esize) {
139  items.emplace_back(wt, term);
140  continue;
141  }
142 
143  // We have the desired number of items, so it's one-in one-out from
144  // now on.
145  Assert(items.size() == max_esize);
146  if (rare(!is_heap)) {
147  Heap::make(items.begin(), items.end(),
148  std::less<Xapian::Internal::ExpandTerm>());
149  min_wt = items.front().wt;
150  is_heap = true;
151  if (wt <= min_wt) continue;
152  }
153 
154  items.front() = Xapian::Internal::ExpandTerm(wt, term);
155  Heap::replace(items.begin(), items.end(),
156  std::less<Xapian::Internal::ExpandTerm>());
157  min_wt = items.front().wt;
158  }
159 
160  // Now sort the contents of the new ESet.
161  if (is_heap) {
162  Heap::sort(items.begin(), items.end(),
163  std::less<Xapian::Internal::ExpandTerm>());
164  } else {
165  sort(items.begin(), items.end());
166  }
167 }
168 
169 string
170 ESet::Internal::get_description() const
171 {
172  string desc("ESet::Internal(ebound=");
173  desc += str(ebound);
174 
175  for (auto&& i : items) {
176  desc += ", ";
177  desc += i.get_description();
178  }
179  desc += ')';
180 
181  return desc;
182 }
183 
184 ESet::ESet() : internal(new ESet::Internal) {}
185 
186 ESet::ESet(const ESet &) = default;
187 
188 ESet&
189 ESet::operator=(const ESet &) = default;
190 
191 ESet::ESet(ESet &&) = default;
192 
193 ESet&
194 ESet::operator=(ESet &&) = default;
195 
197 
199 ESet::size() const
200 {
201  return Xapian::termcount(internal->items.size());
202 }
203 
206 {
207  return internal->ebound;
208 }
209 
210 std::string
212 {
213  string desc = "ESet(";
214  desc += ')';
215  return desc;
216 }
217 
218 
219 std::string
221 {
222  Assert(off_from_end != 0);
223  AssertRel(off_from_end, <=, eset.internal->items.size());
224  return (eset.internal->items.end() - off_from_end)->get_term();
225 }
226 
227 double
229 {
230  Assert(off_from_end != 0);
231  AssertRel(off_from_end, <=, eset.internal->items.size());
232  return (eset.internal->items.end() - off_from_end)->get_weight();
233 }
234 
235 std::string
237 {
238  string desc = "ESetIterator(";
239  if (off_from_end == 0) {
240  desc += "end";
241  } else {
242  desc += str(eset.internal->items.size() - off_from_end);
243  }
244  desc += ')';
245  return desc;
246 }
247 
248 }
An indexed database of documents.
Definition: database.h:75
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: database.h:95
Xapian::ESet eset
Definition: eset.h:165
Xapian::ESet::size_type off_from_end
Definition: eset.h:172
std::string get_description() const
Return a string describing this object.
std::string operator*() const
Get the term at the current position.
double get_weight() const
Get the weight for the current position.
Class which actually implements Xapian::ESet.
Definition: esetinternal.h:80
std::vector< Xapian::Internal::ExpandTerm > items
The ExpandTerm objects which represent the items in the ESet.
Definition: esetinternal.h:92
Class representing a list of search results.
Definition: eset.h:42
~ESet()
Destructor.
std::string get_description() const
Return a string describing this object.
Xapian::termcount size() const
Return number of items in this ESet object.
Xapian::termcount get_ebound() const
Return a bound on the full size of this ESet object.
ESet & operator=(const ESet &o)
Copying is allowed.
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: eset.h:47
ESet()
Default constructor.
Virtual base class for expand decider functor.
Definition: expanddecider.h:38
Class combining a term and its expand weight.
Definition: esetinternal.h:42
Class for calculating ESet term weights.
Definition: expandweight.h:110
void collect_stats(TermList *merger, const std::string &term)
Get the term statistics.
Definition: expandweight.cc:37
virtual double get_weight() const =0
Calculate the weight.
Class representing a set of documents judged as relevant.
Definition: rset.h:39
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: rset.h:42
bool empty() const
Return true if this RSet object is empty.
Definition: rset.h:81
Abstract base class for termlists.
Definition: termlist.h:42
virtual Internal * next()=0
Advance the current position to the next term in the termlist.
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
Virtual base class for Database internals.
Debug logging macros.
#define LOGLINE(a, b)
Definition: debuglog.h:485
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Append a string to an object description, escaping invalid UTF-8.
Querying session.
Xapian::ESet::Internal class.
Allow rejection of terms during ESet generation.
Collate statistics and calculate the term weights for the ESet.
C++ STL heap implementation with extensions.
Multi-database support functions.
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
static TermList * build_termlist_tree(const Xapian::Database &db, const RSet &rset)
Build a tree of binary TermList objects like QueryOptimiser does for OrPostList objects.
Definition: esetinternal.cc:66
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Merge two TermList objects using an OR operation.
Set of documents judged as relevant.
Convert types to std::string.
Abstract base class for termlists.
Build tree to merge TermList objects.
TermList * make_termlist_merger(std::vector< TermList * > &termlists)
void description_append(std::string &desc, std::string_view s)
Definition: unittest.cc:105