00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "collapser.h"
00024
00025 #include "omassert.h"
00026
00027 #include <algorithm>
00028
00029 using namespace std;
00030
00031 collapse_result
00032 CollapseData::add_item(const Xapian::Internal::MSetItem & item,
00033 Xapian::doccount collapse_max, const MSetCmp & mcmp,
00034 Xapian::Internal::MSetItem & old_item)
00035 {
00036 if (items.size() < collapse_max) {
00037 items.push_back(item);
00038 items.back().collapse_key = string();
00039 return ADDED;
00040 }
00041
00042
00043
00044 if (collapse_count == 0 && collapse_max != 1) {
00045
00046
00047
00048 make_heap(items.begin(), items.end(), mcmp);
00049 }
00050 ++collapse_count;
00051
00052 if (mcmp(items.front(), item)) {
00053
00054 if (item.wt > next_best_weight) next_best_weight = item.wt;
00055 return REJECTED;
00056 }
00057
00058 next_best_weight = items.front().wt;
00059
00060 items.push_back(item);
00061 push_heap(items.begin(), items.end(), mcmp);
00062 pop_heap(items.begin(), items.end(), mcmp);
00063 swap(old_item, items.back());
00064 items.pop_back();
00065
00066 return REPLACED;
00067 }
00068
00069 collapse_result
00070 Collapser::process(Xapian::Internal::MSetItem & item,
00071 PostList * postlist,
00072 Xapian::Document::Internal & vsdoc,
00073 const MSetCmp & mcmp)
00074 {
00075 ++docs_considered;
00076
00077 const string * key_ptr = postlist->get_collapse_key();
00078 if (key_ptr) {
00079 item.collapse_key = *key_ptr;
00080 } else {
00081
00082 item.collapse_key = vsdoc.get_value(slot);
00083 }
00084
00085 if (item.collapse_key.empty()) {
00086
00087 ++no_collapse_key;
00088 return EMPTY;
00089 }
00090
00091 map<string, CollapseData>::iterator oldkey;
00092 oldkey = table.find(item.collapse_key);
00093 if (oldkey == table.end()) {
00094
00095 table.insert(make_pair(item.collapse_key, CollapseData(item)));
00096 ++entry_count;
00097 return ADDED;
00098 }
00099
00100 collapse_result res;
00101 CollapseData & collapse_data = oldkey->second;
00102 res = collapse_data.add_item(item, collapse_max, mcmp, old_item);
00103 if (res == ADDED) {
00104 ++entry_count;
00105 } else if (res == REJECTED || res == REPLACED) {
00106 ++dups_ignored;
00107 }
00108 return res;
00109 }
00110
00111 Xapian::doccount
00112 Collapser::get_collapse_count(const string & collapse_key, int percent_cutoff,
00113 Xapian::weight min_weight) const
00114 {
00115 map<string, CollapseData>::const_iterator key = table.find(collapse_key);
00116
00117 Assert(key != table.end());
00118
00119 if (!percent_cutoff) {
00120
00121 return key->second.get_collapse_count();
00122 }
00123
00124 if (key->second.get_next_best_weight() < min_weight) {
00125
00126
00127 return 0;
00128 }
00129
00130
00131
00132
00133
00134 return 1;
00135 }
00136
00137 Xapian::doccount
00138 Collapser::get_matches_lower_bound() const
00139 {
00140
00141
00142 Xapian::doccount matches_lower_bound = no_collapse_key + entry_count;
00143 return matches_lower_bound;
00144
00145
00146
00147
00148 #if 0
00149 Xapian::doccount max_kept = 0;
00150 map<string, CollapseData>::const_iterator i;
00151 for (i = table.begin(); i != table.end(); ++i) {
00152 if (i->second.get_collapse_count() > max_kept) {
00153 max_kept = i->second.get_collapse_count();
00154 if (max_kept == collapse_max) {
00155 return matches_lower_bound;
00156 }
00157 }
00158 }
00159 return matches_lower_bound + (collapse_max - max_kept);
00160 #endif
00161 }