xapian-core  1.4.20
matchspy.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014,2015 Olly Betts
5  * Copyright (C) 2007,2009 Lemur Consulting Ltd
6  * Copyright (C) 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 #include <xapian/matchspy.h>
25 
26 #include <xapian/document.h>
27 #include <xapian/error.h>
28 #include <xapian/queryparser.h>
29 #include <xapian/registry.h>
30 
31 #include <map>
32 #include <string>
33 #include <vector>
34 
35 #include "autoptr.h"
36 #include "debuglog.h"
37 #include "noreturn.h"
38 #include "omassert.h"
39 #include "net/length.h"
40 #include "stringutils.h"
41 #include "str.h"
42 #include "termlist.h"
43 
44 #include <cfloat>
45 #include <cmath>
46 
47 using namespace std;
48 using namespace Xapian;
50 
51 MatchSpy::~MatchSpy() {}
52 
53 MatchSpy *
54 MatchSpy::clone() const {
55  throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented");
56 }
57 
58 string
59 MatchSpy::name() const {
60  throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented");
61 }
62 
63 string
64 MatchSpy::serialise() const {
65  throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented");
66 }
67 
68 MatchSpy *
69 MatchSpy::unserialise(const string &, const Registry &) const {
70  throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented");
71 }
72 
73 string
74 MatchSpy::serialise_results() const {
75  throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented");
76 }
77 
78 void
79 MatchSpy::merge_results(const string &) {
80  throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
81 }
82 
83 string
84 MatchSpy::get_description() const {
85  return "Xapian::MatchSpy()";
86 }
87 
88 XAPIAN_NORETURN(static void unsupported_method());
89 static void unsupported_method() {
90  throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
91 }
92 
94 class ValueCountTermList final : public TermList {
95  private:
96  map<string, Xapian::doccount>::const_iterator it;
97  bool started;
99  public:
100 
102  : spy(spy_)
103  {
104  it = spy->values.begin();
105  started = false;
106  }
107 
108  string get_termname() const {
109  Assert(started);
110  Assert(!at_end());
111  return it->first;
112  }
113 
115  Assert(started);
116  Assert(!at_end());
117  return it->second;
118  }
119 
121  if (!started) {
122  started = true;
123  } else {
124  Assert(!at_end());
125  ++it;
126  }
127  return NULL;
128  }
129 
130  TermList * skip_to(const string & term) {
131  while (it != spy->values.end() && it->first < term) {
132  ++it;
133  }
134  started = true;
135  return NULL;
136  }
137 
138  bool at_end() const {
139  Assert(started);
140  return it == spy->values.end();
141  }
142 
147  return Xapian::PositionIterator();
148  }
150 };
151 
155  std::string str;
157  public:
159  StringAndFrequency(const std::string & str_, Xapian::doccount frequency_)
160  : str(str_), frequency(frequency_) {}
161 
163  std::string get_string() const { return str; }
164 
166  Xapian::doccount get_frequency() const { return frequency; }
167 };
168 
175  public:
178 
182  const StringAndFrequency &b) const {
183  if (a.get_frequency() > b.get_frequency()) return true;
184  if (a.get_frequency() < b.get_frequency()) return false;
185  return a.get_string() < b.get_string();
186  }
187 };
188 
190 class StringAndFreqTermList final : public TermList {
191  private:
192  vector<StringAndFrequency>::const_iterator it;
193  bool started;
194  public:
195  vector<StringAndFrequency> values;
196 
200  void init() {
201  it = values.begin();
202  started = false;
203  }
204 
205  string get_termname() const {
206  Assert(started);
207  Assert(!at_end());
208  return it->get_string();
209  }
210 
212  Assert(started);
213  Assert(!at_end());
214  return it->get_frequency();
215  }
216 
218  if (!started) {
219  started = true;
220  } else {
221  Assert(!at_end());
222  ++it;
223  }
224  return NULL;
225  }
226 
227  TermList * skip_to(const string & term) {
228  while (it != values.end() && it->get_string() < term) {
229  ++it;
230  }
231  started = true;
232  return NULL;
233  }
234 
235  bool at_end() const {
236  Assert(started);
237  return it == values.end();
238  }
239 
244  return Xapian::PositionIterator();
245  }
247 };
248 
264 static void
265 get_most_frequent_items(vector<StringAndFrequency> & result,
266  const map<string, doccount> & items,
267  size_t maxitems)
268 {
269  result.clear();
270  result.reserve(maxitems);
272  bool is_heap(false);
273 
274  for (map<string, doccount>::const_iterator i = items.begin();
275  i != items.end(); ++i) {
276  Assert(result.size() <= maxitems);
277  result.push_back(StringAndFrequency(i->first, i->second));
278  if (result.size() > maxitems) {
279  // Make the list back into a heap.
280  if (is_heap) {
281  // Only the new element isn't in the right place.
282  push_heap(result.begin(), result.end(), cmpfn);
283  } else {
284  // Need to build heap from scratch.
285  make_heap(result.begin(), result.end(), cmpfn);
286  is_heap = true;
287  }
288  pop_heap(result.begin(), result.end(), cmpfn);
289  result.pop_back();
290  }
291  }
292 
293  if (is_heap) {
294  sort_heap(result.begin(), result.end(), cmpfn);
295  } else {
296  sort(result.begin(), result.end(), cmpfn);
297  }
298 }
299 
300 void
301 ValueCountMatchSpy::operator()(const Document &doc, double) {
302  Assert(internal.get());
303  ++(internal->total);
304  string val(doc.get_value(internal->slot));
305  if (!val.empty()) ++(internal->values[val]);
306 }
307 
309 ValueCountMatchSpy::values_begin() const
310 {
311  Assert(internal.get());
312  return Xapian::TermIterator(new ValueCountTermList(internal.get()));
313 }
314 
316 ValueCountMatchSpy::top_values_begin(size_t maxvalues) const
317 {
318  Assert(internal.get());
319  AutoPtr<StringAndFreqTermList> termlist(new StringAndFreqTermList);
320  get_most_frequent_items(termlist->values, internal->values, maxvalues);
321  termlist->init();
322  return Xapian::TermIterator(termlist.release());
323 }
324 
325 MatchSpy *
326 ValueCountMatchSpy::clone() const {
327  Assert(internal.get());
328  return new ValueCountMatchSpy(internal->slot);
329 }
330 
331 string
333  return "Xapian::ValueCountMatchSpy";
334 }
335 
336 string
337 ValueCountMatchSpy::serialise() const {
338  Assert(internal.get());
339  string result;
340  result += encode_length(internal->slot);
341  return result;
342 }
343 
344 MatchSpy *
345 ValueCountMatchSpy::unserialise(const string & s, const Registry &) const
346 {
347  const char * p = s.data();
348  const char * end = p + s.size();
349 
350  valueno new_slot;
351  decode_length(&p, end, new_slot);
352  if (p != end) {
353  throw NetworkError("Junk at end of serialised ValueCountMatchSpy");
354  }
355 
356  return new ValueCountMatchSpy(new_slot);
357 }
358 
359 string
360 ValueCountMatchSpy::serialise_results() const {
361  LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", NO_ARGS);
362  Assert(internal.get());
363  string result;
364  result += encode_length(internal->total);
365  result += encode_length(internal->values.size());
366  for (map<string, doccount>::const_iterator i = internal->values.begin();
367  i != internal->values.end(); ++i) {
368  result += encode_length(i->first.size());
369  result += i->first;
370  result += encode_length(i->second);
371  }
372  RETURN(result);
373 }
374 
375 void
376 ValueCountMatchSpy::merge_results(const string & s) {
377  LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s);
378  Assert(internal.get());
379  const char * p = s.data();
380  const char * end = p + s.size();
381 
383  decode_length(&p, end, n);
384  internal->total += n;
385 
386  map<string, doccount>::size_type items;
387  decode_length(&p, end, items);
388  while (items != 0) {
389  size_t vallen;
390  decode_length_and_check(&p, end, vallen);
391  string val(p, vallen);
392  p += vallen;
393  doccount freq;
394  decode_length(&p, end, freq);
395  internal->values[val] += freq;
396  --items;
397  }
398  if (p != end) {
399  throw NetworkError("Junk at end of serialised ValueCountMatchSpy "
400  "results");
401  }
402 }
403 
404 string
405 ValueCountMatchSpy::get_description() const {
406  string d = "ValueCountMatchSpy(";
407  if (internal.get()) {
408  d += str(internal->total);
409  d += " docs seen, looking in ";
410  d += str(internal->values.size());
411  d += " slots)";
412  } else {
413  d += ")";
414  }
415  return d;
416 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define RETURN(A)
Definition: debuglog.h:482
#define Assert(COND)
Definition: omassert.h:122
Define the XAPIAN_NORETURN macro.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: matchspy.cc:211
A termlist iterator over a vector of StringAndFrequency objects.
Definition: matchspy.cc:190
length encoded as a string
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: matchspy.cc:149
TermList * skip_to(const string &term)
Skip forward to the specified term.
Definition: matchspy.cc:130
string get_termname() const
Return the termname at the current position.
Definition: matchspy.cc:108
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
Definition: matchspy.cc:145
Xapian::doccount get_frequency() const
Return the frequency.
Definition: matchspy.cc:166
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:283
A string with a corresponding frequency.
Definition: matchspy.cc:154
Abstract base class for match spies.
Definition: matchspy.h:49
A termlist iterator over the contents of a ValueCountMatchSpy.
Definition: matchspy.cc:94
ValueCountTermList(ValueCountMatchSpy::Internal *spy_)
Definition: matchspy.cc:101
intrusive_ptr< Xapian::ValueCountMatchSpy::Internal > spy
Definition: matchspy.cc:98
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:477
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Convert types to std::string.
std::string get_string() const
Return the string.
Definition: matchspy.cc:163
std::string encode_length(T len)
Encode a length as a variable-length string.
Definition: length.h:36
static void unsupported_method()
Definition: matchspy.cc:89
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: matchspy.cc:241
Hierarchy of classes which Xapian can throw as exceptions.
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
string get_termname() const
Return the termname at the current position.
Definition: matchspy.cc:205
map< string, Xapian::doccount >::const_iterator it
Definition: matchspy.cc:96
MatchSpy implementation.
bool operator()(const StringAndFrequency &a, const StringAndFrequency &b) const
Return true if a has a higher frequency than b.
Definition: matchspy.cc:181
std::map< std::string, Xapian::doccount > values
The values seen so far, together with their frequency.
Definition: matchspy.h:221
Registry for user subclasses.
Definition: registry.h:47
vector< StringAndFrequency >::const_iterator it
Definition: matchspy.cc:192
string str(int value)
Convert int to std::string.
Definition: str.cc:90
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
Definition: matchspy.cc:242
StringAndFrequency(const std::string &str_, Xapian::doccount frequency_)
Construct a StringAndFrequency object.
Definition: matchspy.cc:159
TermList * next()
Advance the current position to the next term in the termlist.
Definition: matchspy.cc:120
Xapian::doccount frequency
Definition: matchspy.cc:156
Class for iterating over term positions.
bool at_end() const
Return true if the current position is past the last term in this list.
Definition: matchspy.cc:235
static void get_most_frequent_items(vector< StringAndFrequency > &result, const map< string, doccount > &items, size_t maxitems)
Get the most frequent items from a map from string to frequency.
Definition: matchspy.cc:265
StringAndFreqCmpByFreq()
Default constructor.
Definition: matchspy.cc:177
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: matchspy.cc:114
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: matchspy.cc:240
Class for counting the frequencies of values in the matching documents.
Definition: matchspy.h:205
bool at_end() const
Return true if the current position is past the last term in this list.
Definition: matchspy.cc:138
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: matchspy.cc:144
void decode_length_and_check(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:112
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: matchspy.cc:143
Indicates a problem communicating with a remote database.
Definition: error.h:803
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
Various handy helpers which std::string really should provide.
Abstract base class for termlists.
void init()
init should be called after the values have been set, but before iteration begins.
Definition: matchspy.cc:200
TermList * skip_to(const string &term)
Skip forward to the specified term.
Definition: matchspy.cc:227
Various assertion macros.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: matchspy.cc:246
API for working with documents.
std::string str
Definition: matchspy.cc:155
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:81
TermList * next()
Advance the current position to the next term in the termlist.
Definition: matchspy.cc:217
std::string get_value(Xapian::valueno slot) const
Get value by number.
Definition: omdocument.cc:64
void decode_length(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:94
A handle representing a document in a Xapian database.
Definition: document.h:61
Compare two StringAndFrequency objects.
Definition: matchspy.cc:174
Wrapper around standard unique_ptr template.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:476
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
Class for looking up user subclasses during unserialisation.
vector< StringAndFrequency > values
Definition: matchspy.cc:195
parsing a user query string to build a Xapian::Query object