xapian-core  1.4.27
esetinternal.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2008,2010,2011,2013,2016 Olly Betts
5  * Copyright (C) 2011 Action Without Borders
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "esetinternal.h"
25 
26 #include "xapian/enquire.h"
27 #include "xapian/expanddecider.h"
28 #include "backends/database.h"
29 #include "debuglog.h"
30 #include "api/omenquireinternal.h"
31 #include "expandweight.h"
32 #include "omassert.h"
33 #include "ortermlist.h"
34 #include "str.h"
35 #include "api/termlist.h"
37 
38 #include "autoptr.h"
39 #include <set>
40 #include <string>
41 #include <vector>
42 
43 using namespace std;
44 
45 namespace Xapian {
46 
47 string
48 Internal::ExpandTerm::get_description() const
49 {
50  string desc("ExpandTerm(");
51  desc += str(wt);
52  desc += ", ";
53  description_append(desc, term);
54  desc += ')';
55  return desc;
56 }
57 
58 template<class CLASS> struct delete_ptr {
59  void operator()(CLASS *p) const { delete p; }
60 };
61 
63  bool operator()(const TermList *a, const TermList *b) const {
64  return a->get_approx_size() > b->get_approx_size();
65  }
66 };
67 
71 static TermList *
72 build_termlist_tree(const Xapian::Database &db, const RSet & rset)
73 {
74  Assert(!rset.empty());
75 
76  const set<Xapian::docid> & docids = rset.internal->get_items();
77 
78  vector<TermList*> termlists;
79  termlists.reserve(docids.size());
80 
81  try {
82  const size_t multiplier = db.internal.size();
83  set<Xapian::docid>::const_iterator i;
84  for (i = docids.begin(); i != docids.end(); ++i) {
85  Xapian::docid realdid = (*i - 1) / multiplier + 1;
86  Xapian::doccount dbnumber = (*i - 1) % multiplier;
87 
88  // Push NULL first to avoid leaking the new TermList if push_back()
89  // throws.
90  termlists.push_back(0);
91  termlists.back() = db.internal[dbnumber]->open_term_list(realdid);
92  termlists.back()->shard_index = dbnumber;
93  }
94 
95  Assert(!termlists.empty());
96  if (termlists.size() == 1) return termlists[0];
97 
98  // Make termlists into a heap so that the longest termlist is at the
99  // top of the heap.
100  make_heap(termlists.begin(), termlists.end(),
102 
103  // Now build a tree of binary TermList objects. The algorithm used to
104  // build the tree is like that used to build an optimal Huffman coding
105  // tree. If we called next() repeatedly, this arrangement would
106  // minimise the number of method calls. Generally we don't actually do
107  // that, but this arrangement is still likely to be a good one, and it
108  // does minimise the work in the worst case.
109  while (true) {
110  AssertRel(termlists.size(), >=, 2);
111  // We build the tree such that at each branch:
112  //
113  // l.get_approx_size() >= r.get_approx_size()
114  //
115  // We do this so that the OrTermList class can be optimised
116  // assuming that this is the case.
117  TermList * r = termlists.front();
118  pop_heap(termlists.begin(), termlists.end(),
120  termlists.pop_back();
121  TermList * l = termlists.front();
122 
123  TermList * pl = new OrTermList(l, r);
124 
125  if (termlists.size() == 1) return pl;
126 
127  pop_heap(termlists.begin(), termlists.end(),
129  termlists.back() = pl;
130  push_heap(termlists.begin(), termlists.end(),
132  }
133  } catch (...) {
134  for_each(termlists.begin(), termlists.end(), delete_ptr<TermList>());
135  throw;
136  }
137 }
138 
139 void
140 ESet::Internal::expand(Xapian::termcount max_esize,
141  const Xapian::Database & db,
142  const RSet & rset,
143  const Xapian::ExpandDecider * edecider,
145  double min_wt)
146 {
147  LOGCALL_VOID(EXPAND, "ESet::Internal::expand", max_esize | db | rset | edecider | eweight);
148  // These two cases are handled by our caller.
149  Assert(max_esize);
150  Assert(!rset.empty());
151  // This method should only be called once for a given ESet::Internal, so
152  // check we're empty.
153  Assert(ebound == 0);
154  Assert(items.empty());
155 
156  AutoPtr<TermList> tree(build_termlist_tree(db, rset));
157  Assert(tree.get());
158 
159  bool is_heap = false;
160  while (true) {
161  // See if the root needs replacing.
162  TermList * new_root = tree->next();
163  if (new_root) {
164  LOGLINE(EXPAND, "Replacing the root of the termlist tree");
165  tree.reset(new_root);
166  }
167 
168  if (tree->at_end()) break;
169 
170  string term = tree->get_termname();
171 
172  // If there's an ExpandDecider, see if it accepts the term.
173  if (edecider && !(*edecider)(term)) continue;
174 
175  ++ebound;
176 
177  /* Set up the ExpandWeight by clearing the existing statistics and
178  collecting statistics for the new term. */
179  eweight.collect_stats(tree.get(), term);
180 
181  double wt = eweight.get_weight();
182 
183  // If the weights are equal, we prefer the lexically smaller term and
184  // so we use "<=" not "<" here.
185  if (wt <= min_wt) continue;
186 
187  items.push_back(Xapian::Internal::ExpandTerm(wt, term));
188 
189  // The candidate ESet is overflowing, so remove the worst element in it
190  // using a min-heap.
191  if (items.size() > max_esize) {
192  if (rare(!is_heap)) {
193  is_heap = true;
194  make_heap(items.begin(), items.end());
195  } else {
196  push_heap(items.begin(), items.end());
197  }
198  pop_heap(items.begin(), items.end());
199  items.pop_back();
200  min_wt = items.front().wt;
201  }
202  }
203 
204  // Now sort the contents of the new ESet.
205  if (is_heap) {
206  sort_heap(items.begin(), items.end());
207  } else {
208  sort(items.begin(), items.end());
209  }
210 }
211 
212 string
213 ESet::Internal::get_description() const
214 {
215  string desc("ESet::Internal(ebound=");
216  desc += str(ebound);
217 
218  vector<Xapian::Internal::ExpandTerm>::const_iterator i;
219  for (i = items.begin(); i != items.end(); ++i) {
220  desc += ", ";
221  desc += i->get_description();
222  }
223  desc += ')';
224 
225  return desc;
226 }
227 
228 ESet::ESet() { }
229 
230 ESet::ESet(const ESet & o) : internal(o.internal) { }
231 
232 ESet&
234 {
235  internal = o.internal;
236  return *this;
237 }
238 
239 ESet::ESet(ESet &&) = default;
240 
241 ESet&
242 ESet::operator=(ESet &&) = default;
243 
245 
247 ESet::size() const
248 {
249  if (!internal.get()) return 0;
250  return internal->items.size();
251 }
252 
255 {
256  if (!internal.get()) return 0;
257  return internal->ebound;
258 }
259 
260 std::string
262 {
263  string desc = "ESet(";
264  if (internal.get())
265  desc += internal->get_description();
266  desc += ')';
267  return desc;
268 }
269 
270 
271 std::string
273 {
274  return (eset.internal->items.end() - off_from_end)->term;
275 }
276 
277 double
279 {
280  return (eset.internal->items.end() - off_from_end)->wt;
281 }
282 
283 std::string
285 {
286  string desc = "ESetIterator(";
287  if (eset.internal.get())
288  desc += str(eset.internal->items.size() - off_from_end);
289  desc += ')';
290  return desc;
291 }
292 
293 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define Assert(COND)
Definition: omassert.h:122
void collect_stats(TermList *merger, const std::string &term)
Get the term statistics.
Definition: expandweight.cc:37
virtual Internal * next()=0
Advance the current position to the next term in the termlist.
This class is used to access a database, or a group of databases.
Definition: database.h:68
void operator()(CLASS *p) const
Definition: esetinternal.cc:59
virtual double get_weight() const =0
Calculate the weight.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
bool operator()(const TermList *a, const TermList *b) const
Definition: esetinternal.cc:63
Merge two TermList objects using an OR operation.
Class combining a term and its expand weight.
Definition: esetinternal.h:42
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:488
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Convert types to std::string.
Virtual base class for expand decider functor.
Definition: expanddecider.h:37
std::vector< Xapian::Internal::intrusive_ptr< Internal > > internal
Definition: database.h:81
#define rare(COND)
Definition: config.h:575
double get_weight() const
Get the weight for the current position.
ESet()
Default constructor.
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: enquire.h:63
virtual Xapian::termcount get_approx_size() const =0
Return approximate size of this termlist.
API for running queries.
Class for calculating ESet term weights.
Definition: expandweight.h:114
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
Xapian::doccount size() const
Return number of items in this ESet object.
void description_append(std::string &desc, const std::string &s)
Definition: unittest.cc:102
Collate statistics and calculate the term weights for the ESet.
std::string get_description() const
Return a string describing this object.
std::string operator*() const
Get the term at the current position.
string str(int value)
Convert int to std::string.
Definition: str.cc:90
Xapian::termcount get_ebound() const
Return a bound on the full size of this ESet object.
~ESet()
Destructor.
Allow rejection of terms during ESet generation.
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: eset.h:48
Append a string to an object description, escaping invalid UTF-8.
Xapian::ESet::Internal class.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
static TermList * build_termlist_tree(const Xapian::Database &db, const RSet &rset)
Build a tree of binary TermList objects like QueryOptimiser does for OrPostList objects.
Definition: esetinternal.cc:72
Abstract base class for termlists.
Class representing a list of search results.
Definition: eset.h:43
std::string get_description() const
Return a string describing this object.
Various assertion macros.
#define LOGLINE(a, b)
Definition: debuglog.h:494
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
bool empty() const
Test if this R-Set is empty.
Definition: omenquire.cc:98
ESet & operator=(const ESet &o)
Copying is allowed.
Wrapper around standard unique_ptr template.
Debug logging macros.
A relevance set (R-Set).
Definition: enquire.h:60