00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024 #include <xapian/matchspy.h>
00025
00026 #include <xapian/document.h>
00027 #include <xapian/error.h>
00028 #include <xapian/queryparser.h>
00029 #include <xapian/registry.h>
00030
00031 #include <map>
00032 #include <string>
00033 #include <vector>
00034
00035 #include "autoptr.h"
00036 #include "debuglog.h"
00037 #include "omassert.h"
00038 #include "serialise.h"
00039 #include "stringutils.h"
00040 #include "str.h"
00041 #include "termlist.h"
00042
00043 #include <cfloat>
00044 #include <cmath>
00045
00046 using namespace std;
00047 using namespace Xapian;
00048
00049 MatchSpy::~MatchSpy() {}
00050
00051 MatchSpy *
00052 MatchSpy::clone() const {
00053 throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented");
00054 }
00055
00056 string
00057 MatchSpy::name() const {
00058 throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented");
00059 }
00060
00061 string
00062 MatchSpy::serialise() const {
00063 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented");
00064 }
00065
00066 MatchSpy *
00067 MatchSpy::unserialise(const string &, const Registry &) const {
00068 throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented");
00069 }
00070
00071 string
00072 MatchSpy::serialise_results() const {
00073 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented");
00074 }
00075
00076 void
00077 MatchSpy::merge_results(const string &) {
00078 throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
00079 }
00080
00081 string
00082 MatchSpy::get_description() const {
00083 return "Xapian::MatchSpy()";
00084 }
00085
00086 XAPIAN_NORETURN(static void unsupported_method());
00087 static void unsupported_method() {
00088 throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
00089 }
00090
00092 class ValueCountTermList : public TermList {
00093 private:
00094 map<string, Xapian::doccount>::const_iterator it;
00095 bool started;
00096 Xapian::Internal::RefCntPtr<Xapian::ValueCountMatchSpy::Internal> spy;
00097 public:
00098
00099 ValueCountTermList(ValueCountMatchSpy::Internal * spy_) : spy(spy_) {
00100 it = spy->values.begin();
00101 started = false;
00102 }
00103
00104 string get_termname() const {
00105 Assert(started);
00106 Assert(!at_end());
00107 return it->first;
00108 }
00109
00110 Xapian::doccount get_termfreq() const {
00111 Assert(started);
00112 Assert(!at_end());
00113 return it->second;
00114 }
00115
00116 TermList * next() {
00117 if (!started) {
00118 started = true;
00119 } else {
00120 Assert(!at_end());
00121 ++it;
00122 }
00123 return NULL;
00124 }
00125
00126 TermList * skip_to(const string & term) {
00127 while (it != spy->values.end() && it->first < term) {
00128 ++it;
00129 }
00130 started = true;
00131 return NULL;
00132 }
00133
00134 bool at_end() const {
00135 Assert(started);
00136 return it == spy->values.end();
00137 }
00138
00139 Xapian::termcount get_approx_size() const { unsupported_method(); return 0; }
00140 Xapian::termcount get_wdf() const { unsupported_method(); return 0; }
00141 Xapian::PositionIterator positionlist_begin() const {
00142 unsupported_method();
00143 return Xapian::PositionIterator();
00144 }
00145 Xapian::termcount positionlist_count() const { unsupported_method(); return 0; }
00146 };
00147
00150 class StringAndFrequency {
00151 std::string str;
00152 Xapian::doccount frequency;
00153 public:
00155 StringAndFrequency(std::string str_, Xapian::doccount frequency_)
00156 : str(str_), frequency(frequency_) {}
00157
00159 std::string get_string() const { return str; }
00160
00162 Xapian::doccount get_frequency() const { return frequency; }
00163 };
00164
00170 class StringAndFreqCmpByFreq {
00171 public:
00173 StringAndFreqCmpByFreq() {}
00174
00177 bool operator()(const StringAndFrequency &a,
00178 const StringAndFrequency &b) const {
00179 if (a.get_frequency() > b.get_frequency()) return true;
00180 if (a.get_frequency() < b.get_frequency()) return false;
00181 if (a.get_string() > b.get_string()) return false;
00182 return true;
00183 }
00184 };
00185
00187 class StringAndFreqTermList : public TermList {
00188 private:
00189 vector<StringAndFrequency>::const_iterator it;
00190 bool started;
00191 public:
00192 vector<StringAndFrequency> values;
00193
00197 void init() {
00198 it = values.begin();
00199 started = false;
00200 }
00201
00202 string get_termname() const {
00203 Assert(started);
00204 Assert(!at_end());
00205 return it->get_string();
00206 }
00207
00208 Xapian::doccount get_termfreq() const {
00209 Assert(started);
00210 Assert(!at_end());
00211 return it->get_frequency();
00212 }
00213
00214 TermList * next() {
00215 if (!started) {
00216 started = true;
00217 } else {
00218 Assert(!at_end());
00219 ++it;
00220 }
00221 return NULL;
00222 }
00223
00224 TermList * skip_to(const string & term) {
00225 while (it != values.end() && it->get_string() < term) {
00226 ++it;
00227 }
00228 started = true;
00229 return NULL;
00230 }
00231
00232 bool at_end() const {
00233 Assert(started);
00234 return it == values.end();
00235 }
00236
00237 Xapian::termcount get_approx_size() const { unsupported_method(); return 0; }
00238 Xapian::termcount get_wdf() const { unsupported_method(); return 0; }
00239 Xapian::PositionIterator positionlist_begin() const {
00240 unsupported_method();
00241 return Xapian::PositionIterator();
00242 }
00243 Xapian::termcount positionlist_count() const { unsupported_method(); return 0; }
00244 };
00245
00261 static void
00262 get_most_frequent_items(vector<StringAndFrequency> & result,
00263 const map<string, doccount> & items,
00264 size_t maxitems)
00265 {
00266 result.clear();
00267 result.reserve(maxitems);
00268 StringAndFreqCmpByFreq cmpfn;
00269 bool is_heap(false);
00270
00271 for (map<string, doccount>::const_iterator i = items.begin();
00272 i != items.end(); i++) {
00273 Assert(result.size() <= maxitems);
00274 result.push_back(StringAndFrequency(i->first, i->second));
00275 if (result.size() > maxitems) {
00276
00277 if (is_heap) {
00278
00279 push_heap(result.begin(), result.end(), cmpfn);
00280 } else {
00281
00282 make_heap(result.begin(), result.end(), cmpfn);
00283 is_heap = true;
00284 }
00285 pop_heap(result.begin(), result.end(), cmpfn);
00286 result.pop_back();
00287 }
00288 }
00289
00290 if (is_heap) {
00291 sort_heap(result.begin(), result.end(), cmpfn);
00292 } else {
00293 sort(result.begin(), result.end(), cmpfn);
00294 }
00295 }
00296
00297 void
00298 ValueCountMatchSpy::operator()(const Document &doc, weight) {
00299 ++(internal->total);
00300 string val(doc.get_value(internal->slot));
00301 if (!val.empty()) ++(internal->values[val]);
00302 }
00303
00304 TermIterator
00305 ValueCountMatchSpy::values_begin() const
00306 {
00307 AutoPtr<ValueCountTermList> termlist(new ValueCountTermList(internal.get()));
00308 return Xapian::TermIterator(termlist.release());
00309 }
00310
00311 TermIterator
00312 ValueCountMatchSpy::top_values_begin(size_t maxvalues) const
00313 {
00314 AutoPtr<StringAndFreqTermList> termlist(new StringAndFreqTermList);
00315 get_most_frequent_items(termlist->values, internal->values, maxvalues);
00316 termlist->init();
00317 return Xapian::TermIterator(termlist.release());
00318 }
00319
00320 MatchSpy *
00321 ValueCountMatchSpy::clone() const {
00322 return new ValueCountMatchSpy(internal->slot);
00323 }
00324
00325 string
00326 ValueCountMatchSpy::name() const {
00327 return "Xapian::ValueCountMatchSpy";
00328 }
00329
00330 string
00331 ValueCountMatchSpy::serialise() const {
00332 string result;
00333 result += encode_length(internal->slot);
00334 return result;
00335 }
00336
00337 MatchSpy *
00338 ValueCountMatchSpy::unserialise(const string & s, const Registry &) const
00339 {
00340 const char * p = s.data();
00341 const char * end = p + s.size();
00342
00343 valueno new_slot = decode_length(&p, end, false);
00344 if (p != end) {
00345 throw NetworkError("Junk at end of serialised ValueCountMatchSpy");
00346 }
00347
00348 return new ValueCountMatchSpy(new_slot);
00349 }
00350
00351 string
00352 ValueCountMatchSpy::serialise_results() const {
00353 LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", NO_ARGS);
00354 string result;
00355 result += encode_length(internal->total);
00356 result += encode_length(internal->values.size());
00357 for (map<string, doccount>::const_iterator i = internal->values.begin();
00358 i != internal->values.end(); ++i) {
00359 result += encode_length(i->first.size());
00360 result += i->first;
00361 result += encode_length(i->second);
00362 }
00363 RETURN(result);
00364 }
00365
00366 void
00367 ValueCountMatchSpy::merge_results(const string & s) {
00368 LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s);
00369 const char * p = s.data();
00370 const char * end = p + s.size();
00371
00372 internal->total += decode_length(&p, end, false);
00373
00374 map<string, doccount>::size_type items = decode_length(&p, end, false);
00375 while (p != end) {
00376 while (items != 0) {
00377 size_t vallen = decode_length(&p, end, true);
00378 string val(p, vallen);
00379 p += vallen;
00380 doccount freq = decode_length(&p, end, false);
00381 internal->values[val] += freq;
00382 --items;
00383 }
00384 }
00385 }
00386
00387 string
00388 ValueCountMatchSpy::get_description() const {
00389 return "Xapian::ValueCountMatchSpy(" + str(internal->total) +
00390 " docs seen, looking in " + str(internal->values.size()) + " slots)";
00391 }