xapian-core  2.0.0
collapser.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2011,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 #define XAPIAN_INCLUDED_COLLAPSER_H
23 
25 #include "msetcmp.h"
26 #include "omassert.h"
27 #include "api/result.h"
28 
29 #include <unordered_map>
30 #include <vector>
31 
33 typedef enum {
35  NEW,
36  ADD,
38  REPLACE
40 
42 class CollapseData {
55  std::vector<std::pair<Xapian::doccount, Xapian::docid>> items;
56 
58  double next_best_weight = 0.0;
59 
62 
63  public:
66  : items(1, { item, did })
67  { }
68 
84  collapse_result check_item(const std::vector<Result>& results,
85  const Result& result,
86  Xapian::doccount collapse_max,
87  MSetCmp mcmp,
88  Xapian::doccount& old_item);
89 
94  void set_item(Xapian::doccount item);
95 
103  void add_item(const std::vector<Result>& results,
104  Xapian::doccount item,
105  Xapian::doccount collapse_max,
106  MSetCmp mcmp);
107 
114  // Yes, this *is* a linear search, but only through up to collapse_max
115  // items and we expect collapse_max to be very small (it's the maximum
116  // number of items to keep for each collapse key value).
117  for (auto&& item : items) {
118  if (item.first == from) {
119  item.first = to;
120  return;
121  }
122  }
123  // The entry ought to be present.
124  Assert(false);
125  }
126 
128  double get_next_best_weight() const { return next_best_weight; }
129 
132 };
133 
135 class Collapser {
137  std::unordered_map<std::string, CollapseData> table;
138 
141 
147 
154 
161 
164 
167 
168  std::vector<Result>& results;
169 
171 
173  CollapseData* ptr = NULL;
174 
177  return mcmp(results[a], results[b]);
178  }
179 
180  public:
183 
185  Xapian::doccount collapse_max_,
186  std::vector<Result>& results_,
187  MSetCmp mcmp_)
188  : slot(slot_),
189  collapse_max(collapse_max_),
190  results(results_),
191  mcmp(mcmp_) { }
192 
194  operator bool() const { return collapse_max != 0; }
195 
208  collapse_result check(Result& result,
210 
216  void process(collapse_result action, Xapian::doccount item);
217 
224  const std::string& collapse_key = results[to].get_collapse_key();
225  if (collapse_key.empty()) {
226  return;
227  }
228  auto it = table.find(collapse_key);
229  if (rare(it == table.end())) {
230  // The entry ought to be present.
231  Assert(false);
232  return;
233  }
234 
235  CollapseData& collapse_data = it->second;
236  collapse_data.result_has_moved(from, to);
237  }
238 
239  Xapian::doccount get_collapse_count(const std::string & collapse_key,
240  int percent_threshold,
241  double min_weight) const;
242 
244 
246 
248 
250 
251  void finalise(double min_weight, int percent_threshold);
252 };
253 
262  std::unordered_map<std::string, Xapian::doccount> table;
263 
266 
272 
279 
286 
289 
290  public:
292  : collapse_max(collapse_max_) {}
293 
295  operator bool() const { return collapse_max != 0; }
296 
301  bool add(const std::string& key) {
302  ++docs_considered;
303 
304  if (key.empty()) {
305  ++no_collapse_key;
306  return true;
307  }
308 
309  auto r = table.emplace(key, 1);
310  if (r.second) {
311  // New entry, set to 1.
312  } else if (r.first->second == collapse_max) {
313  // Already seen collapse_max with this key so reject.
314  ++dups_ignored;
315  return false;
316  } else {
317  // Increment count.
318  ++r.first->second;
319  }
320  ++entry_count;
321  return true;
322  }
323 
325 
327 
329 
331  return no_collapse_key + entry_count;
332  }
333 
334  void finalise(std::vector<Result>& results, int percent_threshold) {
335  if (table.empty() || results.empty())
336  return;
337 
338  // We need to fill in collapse_count values in results using the
339  // information stored in table.
341  for (Result& result : results) {
342  const std::string& key = result.get_collapse_key();
343  if (key.empty())
344  continue;
345 
346  // Adjust collapse_count.
347  if (percent_threshold) {
348  // FIXME: We can probably do better here.
349  result.set_collapse_count(1);
350  } else {
351  auto c = result.get_collapse_count() + table[key];
352  result.set_collapse_count(c);
353  }
354 
355  if (--todo == 0) {
356  // Terminate early if we've handled all non-empty entries.
357  break;
358  }
359  }
360  }
361 };
362 
363 #endif // XAPIAN_INCLUDED_COLLAPSER_H
Class tracking information for a given value of the collapse key.
Definition: collapser.h:42
Xapian::doccount collapse_count
The number of documents we've rejected.
Definition: collapser.h:61
std::vector< std::pair< Xapian::doccount, Xapian::docid > > items
Currently kept MSet entries for this value of the collapse key.
Definition: collapser.h:55
CollapseData(Xapian::doccount item, Xapian::docid did)
Construct with the given item.
Definition: collapser.h:65
void set_item(Xapian::doccount item)
Set item after constructing with a placeholder.
Definition: collapser.cc:98
Xapian::doccount get_collapse_count() const
The number of documents we've rejected.
Definition: collapser.h:131
void add_item(const std::vector< Result > &results, Xapian::doccount item, Xapian::doccount collapse_max, MSetCmp mcmp)
Complete update of new result with this collapse key value.
Definition: collapser.cc:106
collapse_result check_item(const std::vector< Result > &results, const Result &result, Xapian::doccount collapse_max, MSetCmp mcmp, Xapian::doccount &old_item)
Check a new result with this collapse key value.
Definition: collapser.cc:33
double next_best_weight
The highest weight of a document we've rejected.
Definition: collapser.h:58
double get_next_best_weight() const
The highest weight of a document we've rejected.
Definition: collapser.h:128
void result_has_moved(Xapian::doccount from, Xapian::doccount to)
Process relocation of entry in results.
Definition: collapser.h:113
Simpler version of Collapser used when merging MSet objects.
Definition: collapser.h:260
Xapian::doccount entry_count
How many items we're currently keeping in table.
Definition: collapser.h:265
std::unordered_map< std::string, Xapian::doccount > table
Map from collapse key values to collapse counts.
Definition: collapser.h:262
bool add(const std::string &key)
Try to add a new key.
Definition: collapser.h:301
Xapian::doccount get_docs_considered() const
Definition: collapser.h:324
Xapian::doccount dups_ignored
How many documents with duplicate collapse keys we have ignored.
Definition: collapser.h:278
CollapserLite(Xapian::doccount collapse_max_)
Definition: collapser.h:291
Xapian::doccount no_collapse_key
How many documents have we seen without a collapse key?
Definition: collapser.h:271
void finalise(std::vector< Result > &results, int percent_threshold)
Definition: collapser.h:334
Xapian::doccount docs_considered
How many documents we've considered for collapsing.
Definition: collapser.h:285
Xapian::doccount collapse_max
The maximum number of items to keep for each collapse key value.
Definition: collapser.h:288
Xapian::doccount get_matches_lower_bound() const
Definition: collapser.h:330
Xapian::doccount get_entries() const
Definition: collapser.h:328
Xapian::doccount get_dups_ignored() const
Definition: collapser.h:326
The Collapser class tracks collapse keys and the documents they match.
Definition: collapser.h:135
Xapian::doccount get_docs_considered() const
Definition: collapser.h:243
std::vector< Result > & results
Definition: collapser.h:168
Xapian::doccount no_collapse_key
How many documents have we seen without a collapse key?
Definition: collapser.h:146
MSetCmp mcmp
Definition: collapser.h:170
Xapian::doccount dups_ignored
How many documents with duplicate collapse keys we have ignored.
Definition: collapser.h:153
collapse_result check(Result &result, Xapian::Document::Internal &vsdoc)
Check a new result.
Definition: collapser.cc:126
void finalise(double min_weight, int percent_threshold)
Definition: collapser.cc:237
Collapser(Xapian::valueno slot_, Xapian::doccount collapse_max_, std::vector< Result > &results_, MSetCmp mcmp_)
Definition: collapser.h:184
Xapian::doccount old_item
Replaced item when REPLACE is returned by collapse().
Definition: collapser.h:182
Xapian::doccount get_dups_ignored() const
Definition: collapser.h:245
Xapian::valueno slot
The value slot we're getting collapse keys from.
Definition: collapser.h:163
void process(collapse_result action, Xapian::doccount item)
Handle a new Result.
Definition: collapser.cc:164
bool operator()(Xapian::doccount a, Xapian::doccount b) const
Adapt mcmp to be usable with min_heap.
Definition: collapser.h:176
Xapian::doccount collapse_max
The maximum number of items to keep for each collapse key value.
Definition: collapser.h:166
Xapian::doccount get_entries() const
Definition: collapser.h:247
std::unordered_map< std::string, CollapseData > table
Map from collapse key values to the items we're keeping for them.
Definition: collapser.h:137
Xapian::doccount get_matches_lower_bound() const
Definition: collapser.cc:212
Xapian::doccount docs_considered
How many documents we've considered for collapsing.
Definition: collapser.h:160
Xapian::doccount get_collapse_count(const std::string &collapse_key, int percent_threshold, double min_weight) const
Definition: collapser.cc:185
CollapseData * ptr
Pointer to CollapseData when NEW or ADD is in progress.
Definition: collapser.h:173
void result_has_moved(Xapian::doccount from, Xapian::doccount to)
Process relocation of entry in results.
Definition: collapser.h:223
Xapian::doccount entry_count
How many items we're currently keeping in table.
Definition: collapser.h:140
A result in an MSet.
Definition: result.h:30
Abstract base class for a document.
collapse_result
Enumeration reporting how a result will be handled by the Collapser.
Definition: collapser.h:33
@ EMPTY
Definition: collapser.h:34
@ REPLACE
Definition: collapser.h:38
@ REJECT
Definition: collapser.h:37
@ ADD
Definition: collapser.h:36
@ NEW
Definition: collapser.h:35
#define rare(COND)
Definition: config.h:607
Abstract base class for a document.
Result comparison functions.
bool(* MSetCmp)(const Result &, const Result &)
Definition: msetcmp.h:29
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
A result in an MSet.