xapian-core  1.4.20
collapser.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2009,2011 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "collapser.h"
24 
25 #include "omassert.h"
26 
27 #include <algorithm>
28 
29 using namespace std;
30 
33  Xapian::doccount collapse_max, MSetCmp mcmp,
34  Xapian::Internal::MSetItem & old_item)
35 {
36  if (items.size() < collapse_max) {
37  items.push_back(item);
38  items.back().collapse_key = string();
39  return ADDED;
40  }
41 
42  // We already have collapse_max items better than item so we need to
43  // eliminate the lowest ranked.
44  if (collapse_count == 0 && collapse_max != 1) {
45  // Be lazy about calling make_heap - if we see <= collapse_max
46  // items with a particular collapse key, we never need to use
47  // the heap.
48  make_heap(items.begin(), items.end(), mcmp);
49  }
50  ++collapse_count;
51 
52  if (mcmp(items.front(), item)) {
53  // If this is the "best runner-up", update next_best_weight.
54  if (item.wt > next_best_weight) next_best_weight = item.wt;
55  return REJECTED;
56  }
57 
58  next_best_weight = items.front().wt;
59 
60  items.push_back(item);
61  push_heap(items.begin(), items.end(), mcmp);
62  pop_heap(items.begin(), items.end(), mcmp);
63  swap(old_item, items.back());
64  items.pop_back();
65 
66  return REPLACED;
67 }
68 
71  PostList * postlist,
73  MSetCmp mcmp)
74 {
75  ++docs_considered;
76  // The postlist will supply the collapse key for a remote match.
77  const string * key_ptr = postlist->get_collapse_key();
78  if (key_ptr) {
79  item.collapse_key = *key_ptr;
80  } else {
81  // Otherwise use the Document object to get the value.
82  item.collapse_key = vsdoc.get_value(slot);
83  }
84 
85  if (item.collapse_key.empty()) {
86  // We don't collapse items with an empty collapse key.
87  ++no_collapse_key;
88  return EMPTY;
89  }
90 
91  auto oldkey = table.find(item.collapse_key);
92  if (oldkey == table.end()) {
93  // We've not seen this collapse key before.
94  table.insert(make_pair(item.collapse_key, CollapseData(item)));
95  ++entry_count;
96  return ADDED;
97  }
98 
99  collapse_result res;
100  CollapseData & collapse_data = oldkey->second;
101  res = collapse_data.add_item(item, collapse_max, mcmp, old_item);
102  if (res == ADDED) {
103  ++entry_count;
104  } else if (res == REJECTED || res == REPLACED) {
105  ++dups_ignored;
106  }
107  return res;
108 }
109 
111 Collapser::get_collapse_count(const string & collapse_key, int percent_cutoff,
112  double min_weight) const
113 {
114  auto key = table.find(collapse_key);
115  // If a collapse key is present in the MSet, it must be in our table.
116  Assert(key != table.end());
117 
118  if (!percent_cutoff) {
119  // The recorded collapse_count is correct.
120  return key->second.get_collapse_count();
121  }
122 
123  if (key->second.get_next_best_weight() < min_weight) {
124  // We know for certain that all collapsed items would have failed the
125  // percentage cutoff, so collapse_count should be 0.
126  return 0;
127  }
128 
129  // We know that the highest weighted collapsed item would have survived the
130  // percentage cutoff, but it's possible all other collapsed items would
131  // have failed it. Since collapse_count is a lower bound, we must set it
132  // to 1.
133  return 1;
134 }
135 
138 {
139  // We've seen this many matches, but all other documents matching the query
140  // could be collapsed onto values already seen.
141  Xapian::doccount matches_lower_bound = no_collapse_key + entry_count;
142  return matches_lower_bound;
143  // FIXME: *Unless* we haven't achieved collapse_max occurrences of *any*
144  // collapse key value, so we can increase matches_lower_bound like the
145  // code below, but it isn't quite that simple - there may not be that
146  // many documents.
147 #if 0
148  Xapian::doccount max_kept = 0;
149  for (auto i = table.begin(); i != table.end(); ++i) {
150  if (i->second.get_collapse_count() > max_kept) {
151  max_kept = i->second.get_collapse_count();
152  if (max_kept == collapse_max) {
153  return matches_lower_bound;
154  }
155  }
156  }
157  return matches_lower_bound + (collapse_max - max_kept);
158 #endif
159 }
#define Assert(COND)
Definition: omassert.h:122
collapse_result process(Xapian::Internal::MSetItem &item, PostList *postlist, Xapian::Document::Internal &vsdoc, MSetCmp mcmp)
Handle a new MSetItem.
Definition: collapser.cc:70
Abstract base class for postlists.
Definition: postlist.h:37
collapse_result add_item(const Xapian::Internal::MSetItem &item, Xapian::doccount collapse_max, MSetCmp mcmp, Xapian::Internal::MSetItem &old_item)
Handle a new MSetItem with this collapse key value.
Definition: collapser.cc:32
A document in the database, possibly plus modifications.
Definition: document.h:41
Collapse documents with the same collapse key during the match.
STL namespace.
virtual const std::string * get_collapse_key() const
If the collapse key is already known, return it.
Definition: postlist.cc:56
bool(* MSetCmp)(const Xapian::Internal::MSetItem &, const Xapian::Internal::MSetItem &)
Definition: msetcmp.h:28
double wt
Weight calculated.
string collapse_key
Value which was used to collapse upon.
An item resulting from a query.
string get_value(Xapian::valueno slot) const
Get value by value number.
Definition: omdocument.cc:400
collapse_result
Enumeration reporting how a document was handled by the Collapser.
Definition: collapser.h:33
Xapian::doccount get_collapse_count(const std::string &collapse_key, int percent_cutoff, double min_weight) const
Definition: collapser.cc:111
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::doccount get_matches_lower_bound() const
Definition: collapser.cc:137
Various assertion macros.
Class tracking information for a given value of the collapse key.
Definition: collapser.h:41