xapian-core  2.0.0
collapser.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2011,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "collapser.h"
24 
25 #include "heap.h"
26 #include "omassert.h"
27 
28 #include <algorithm>
29 
30 using namespace std;
31 
33 CollapseData::check_item(const vector<Result>& results,
34  const Result& result,
35  Xapian::doccount collapse_max, MSetCmp mcmp,
36  Xapian::doccount& old_item)
37 {
38  if (items.size() < collapse_max) {
39  return ADD;
40  }
41 
42  // We already have collapse_max items better than result so we need to
43  // eliminate the lowest ranked.
44  if (collapse_count == 0 && collapse_max != 1) {
45  // Be lazy about building the heap - if we see <= collapse_max items
46  // with a particular collapse key, we never need to use the heap.
47  Heap::make(items.begin(), items.end(),
48  [&](pair<Xapian::doccount, Xapian::docid> a,
49  pair<Xapian::doccount, Xapian::docid> b) {
50  return mcmp(results[a.first], results[b.first]);
51  });
52  }
53  ++collapse_count;
54 
55  Xapian::doccount old_item_candidate = items.front().first;
56  const Result& old_result = results[old_item_candidate];
57  if (items.front().second != old_result.get_docid()) {
58  // The previous result with this collapse key we were going to replace
59  // has been pushed out of the protomset by higher ranking results.
60  //
61  // Awkwardly that means we don't know its exact weight, but we
62  // only need next_best_weight to know if we can zero collapse_count
63  // when there's a percentage cut-off, so setting it to an overestimate
64  // is OK, and it's not critically important as it makes no difference
65  // at all unless there's a percentage cut-off set.
66  //
67  // For now use the new result's weight. FIXME: The lowest weight in
68  // the current MSet would be a tighter bound if we're sorting primarily
69  // by weight (if we're not, then is next_best_weight useful at all?)
70  // We could also check other entries in items to find an upper bound
71  // weight.
72  next_best_weight = result.get_weight();
73 
74  return REPLACE;
75  }
76 
77  if (mcmp(old_result, result)) {
78  // If this is the "best runner-up", update next_best_weight.
79  if (result.get_weight() > next_best_weight)
80  next_best_weight = result.get_weight();
81  return REJECT;
82  }
83 
84  old_item = old_item_candidate;
85  next_best_weight = old_result.get_weight();
86 
87  items.front() = { old_item, result.get_docid() };
88  Heap::replace(items.begin(), items.end(),
89  [&](pair<Xapian::doccount, Xapian::docid> a,
90  pair<Xapian::doccount, Xapian::docid> b) {
91  return mcmp(results[a.first], results[b.first]);
92  });
93 
94  return REPLACE;
95 }
96 
97 void
99 {
100  AssertEq(items.size(), 1);
101  AssertEq(items[0].first, 0);
102  items[0].first = item;
103 }
104 
105 void
106 CollapseData::add_item(const vector<Result>& results,
107  Xapian::doccount item,
108  Xapian::doccount collapse_max,
109  MSetCmp mcmp)
110 {
111  const Result& result = results[item];
112  if (items.size() < collapse_max) {
113  items.emplace_back(item, result.get_docid());
114  return;
115  }
116 
117  items.front() = { item, result.get_docid() };
118  Heap::replace(items.begin(), items.end(),
119  [&](pair<Xapian::doccount, Xapian::docid> a,
120  pair<Xapian::doccount, Xapian::docid> b) {
121  return mcmp(results[a.first], results[b.first]);
122  });
123 }
124 
128 {
129  ptr = NULL;
130  ++docs_considered;
131  result.set_collapse_key(vsdoc.get_value(slot));
132 
133  if (result.get_collapse_key().empty()) {
134  // We don't collapse results with an empty collapse key.
135  ++no_collapse_key;
136  return EMPTY;
137  }
138 
139  // Use dummy value 0 for item - if process() is called, this will get
140  // updated to the appropriate value, and if it isn't then the docid won't
141  // match and we'll know the item isn't in the current proto-mset.
142  auto r = table.emplace(result.get_collapse_key(),
143  CollapseData(0, result.get_docid()));
144  ptr = &r.first->second;
145  if (r.second) {
146  // We've not seen this collapse key before.
147  ++entry_count;
148  return NEW;
149  }
150 
151  collapse_result res;
152  CollapseData& collapse_data = *ptr;
153  res = collapse_data.check_item(results, result, collapse_max, mcmp,
154  old_item);
155  if (res == ADD) {
156  ++entry_count;
157  } else if (res == REJECT || res == REPLACE) {
158  ++dups_ignored;
159  }
160  return res;
161 }
162 
163 void
165  Xapian::doccount item)
166 {
167  switch (action) {
168  case NEW:
169  // We've not seen this collapse key before.
170  Assert(ptr);
171  ptr->set_item(item);
172  return;
173  case ADD: {
174  Assert(ptr);
175  ptr->add_item(results, item, collapse_max, mcmp);
176  break;
177  }
178  default:
179  // Shouldn't be called for other actions.
180  Assert(false);
181  }
182 }
183 
185 Collapser::get_collapse_count(const string & collapse_key,
186  int percent_threshold,
187  double min_weight) const
188 {
189  auto key = table.find(collapse_key);
190  // If a collapse key is present in the MSet, it must be in our table.
191  Assert(key != table.end());
192 
193  if (!percent_threshold) {
194  // The recorded collapse_count is correct.
195  return key->second.get_collapse_count();
196  }
197 
198  if (key->second.get_next_best_weight() < min_weight) {
199  // We know for certain that all collapsed items would have failed the
200  // percentage cutoff, so collapse_count should be 0.
201  return 0;
202  }
203 
204  // We know that the highest weighted collapsed item would have survived the
205  // percentage cutoff, but it's possible all other collapsed items would
206  // have failed it. Since collapse_count is a lower bound, we must set it
207  // to 1.
208  return 1;
209 }
210 
213 {
214  // We've seen this many matches, but all other documents matching the query
215  // could be collapsed onto values already seen.
216  Xapian::doccount matches_lower_bound = no_collapse_key + entry_count;
217  return matches_lower_bound;
218  // FIXME: *Unless* we haven't achieved collapse_max occurrences of *any*
219  // collapse key value, so we can increase matches_lower_bound like the
220  // code below, but it isn't quite that simple - there may not be that
221  // many documents.
222 #if 0
223  Xapian::doccount max_kept = 0;
224  for (auto i = table.begin(); i != table.end(); ++i) {
225  if (i->second.get_collapse_count() > max_kept) {
226  max_kept = i->second.get_collapse_count();
227  if (max_kept == collapse_max) {
228  return matches_lower_bound;
229  }
230  }
231  }
232  return matches_lower_bound + (collapse_max - max_kept);
233 #endif
234 }
235 
236 void
237 Collapser::finalise(double min_weight, int percent_threshold)
238 {
239  if (table.empty() || results.empty())
240  return;
241 
242  // We need to fill in collapse_count values in results using the
243  // information stored in table.
244  Xapian::doccount todo = entry_count;
245  for (Result& result : results) {
246  const string& key = result.get_collapse_key();
247  if (key.empty())
248  continue;
249 
250  // Fill in collapse_count.
251  result.set_collapse_count(get_collapse_count(key, percent_threshold,
252  min_weight));
253  if (--todo == 0) {
254  // Terminate early if we've handled all non-empty entries.
255  break;
256  }
257  }
258 }
Class tracking information for a given value of the collapse key.
Definition: collapser.h:42
void set_item(Xapian::doccount item)
Set item after constructing with a placeholder.
Definition: collapser.cc:98
void add_item(const std::vector< Result > &results, Xapian::doccount item, Xapian::doccount collapse_max, MSetCmp mcmp)
Complete update of new result with this collapse key value.
Definition: collapser.cc:106
collapse_result check_item(const std::vector< Result > &results, const Result &result, Xapian::doccount collapse_max, MSetCmp mcmp, Xapian::doccount &old_item)
Check a new result with this collapse key value.
Definition: collapser.cc:33
collapse_result check(Result &result, Xapian::Document::Internal &vsdoc)
Check a new result.
Definition: collapser.cc:126
void finalise(double min_weight, int percent_threshold)
Definition: collapser.cc:237
void process(collapse_result action, Xapian::doccount item)
Handle a new Result.
Definition: collapser.cc:164
Xapian::doccount get_matches_lower_bound() const
Definition: collapser.cc:212
Xapian::doccount get_collapse_count(const std::string &collapse_key, int percent_threshold, double min_weight) const
Definition: collapser.cc:185
A result in an MSet.
Definition: result.h:30
double get_weight() const
Definition: result.h:70
void set_collapse_key(const std::string &k)
Definition: result.h:82
const std::string & get_collapse_key() const
Definition: result.h:74
Xapian::docid get_docid() const
Definition: result.h:68
Abstract base class for a document.
std::string get_value(Xapian::valueno slot) const
Read a value slot in this document.
Collapse documents with the same collapse key during the match.
collapse_result
Enumeration reporting how a result will be handled by the Collapser.
Definition: collapser.h:33
@ EMPTY
Definition: collapser.h:34
@ REPLACE
Definition: collapser.h:38
@ REJECT
Definition: collapser.h:37
@ ADD
Definition: collapser.h:36
@ NEW
Definition: collapser.h:35
C++ STL heap implementation with extensions.
bool(* MSetCmp)(const Result &, const Result &)
Definition: msetcmp.h:29
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
#define Assert(COND)
Definition: omassert.h:122