xapian-core  1.4.25
matchspy.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014,2015 Olly Betts
5  * Copyright (C) 2007,2009 Lemur Consulting Ltd
6  * Copyright (C) 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 #include <xapian/matchspy.h>
25 
26 #include <xapian/document.h>
27 #include <xapian/error.h>
28 #include <xapian/queryparser.h>
29 #include <xapian/registry.h>
30 
31 #include <map>
32 #include <string>
33 #include <vector>
34 
35 #include "autoptr.h"
36 #include "debuglog.h"
37 #include "noreturn.h"
38 #include "omassert.h"
39 #include "net/length.h"
40 #include "stringutils.h"
41 #include "str.h"
42 #include "termlist.h"
43 
44 using namespace std;
45 using namespace Xapian;
47 
48 MatchSpy::~MatchSpy() {}
49 
50 MatchSpy *
51 MatchSpy::clone() const {
52  throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented");
53 }
54 
55 string
56 MatchSpy::name() const {
57  throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented");
58 }
59 
60 string
61 MatchSpy::serialise() const {
62  throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented");
63 }
64 
65 MatchSpy *
66 MatchSpy::unserialise(const string &, const Registry &) const {
67  throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented");
68 }
69 
70 string
71 MatchSpy::serialise_results() const {
72  throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented");
73 }
74 
75 void
76 MatchSpy::merge_results(const string &) {
77  throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
78 }
79 
80 string
81 MatchSpy::get_description() const {
82  return "Xapian::MatchSpy()";
83 }
84 
85 XAPIAN_NORETURN(static void unsupported_method());
86 static void unsupported_method() {
87  throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
88 }
89 
91 class ValueCountTermList final : public TermList {
92  private:
93  map<string, Xapian::doccount>::const_iterator it;
94  bool started;
96  public:
97 
99  : spy(spy_)
100  {
101  it = spy->values.begin();
102  started = false;
103  }
104 
105  string get_termname() const {
106  Assert(started);
107  Assert(!at_end());
108  return it->first;
109  }
110 
112  Assert(started);
113  Assert(!at_end());
114  return it->second;
115  }
116 
118  if (!started) {
119  started = true;
120  } else {
121  Assert(!at_end());
122  ++it;
123  }
124  return NULL;
125  }
126 
127  TermList * skip_to(const string & term) {
128  while (it != spy->values.end() && it->first < term) {
129  ++it;
130  }
131  started = true;
132  return NULL;
133  }
134 
135  bool at_end() const {
136  Assert(started);
137  return it == spy->values.end();
138  }
139 
144  return Xapian::PositionIterator();
145  }
147 };
148 
152  std::string str;
154  public:
156  StringAndFrequency(const std::string & str_, Xapian::doccount frequency_)
157  : str(str_), frequency(frequency_) {}
158 
160  std::string get_string() const { return str; }
161 
163  Xapian::doccount get_frequency() const { return frequency; }
164 };
165 
172  public:
175 
179  const StringAndFrequency &b) const {
180  if (a.get_frequency() > b.get_frequency()) return true;
181  if (a.get_frequency() < b.get_frequency()) return false;
182  return a.get_string() < b.get_string();
183  }
184 };
185 
187 class StringAndFreqTermList final : public TermList {
188  private:
189  vector<StringAndFrequency>::const_iterator it;
190  bool started;
191  public:
192  vector<StringAndFrequency> values;
193 
197  void init() {
198  it = values.begin();
199  started = false;
200  }
201 
202  string get_termname() const {
203  Assert(started);
204  Assert(!at_end());
205  return it->get_string();
206  }
207 
209  Assert(started);
210  Assert(!at_end());
211  return it->get_frequency();
212  }
213 
215  if (!started) {
216  started = true;
217  } else {
218  Assert(!at_end());
219  ++it;
220  }
221  return NULL;
222  }
223 
224  TermList * skip_to(const string & term) {
225  while (it != values.end() && it->get_string() < term) {
226  ++it;
227  }
228  started = true;
229  return NULL;
230  }
231 
232  bool at_end() const {
233  Assert(started);
234  return it == values.end();
235  }
236 
241  return Xapian::PositionIterator();
242  }
244 };
245 
261 static void
262 get_most_frequent_items(vector<StringAndFrequency> & result,
263  const map<string, doccount> & items,
264  size_t maxitems)
265 {
266  result.clear();
267  result.reserve(maxitems);
269  bool is_heap(false);
270 
271  for (map<string, doccount>::const_iterator i = items.begin();
272  i != items.end(); ++i) {
273  Assert(result.size() <= maxitems);
274  result.push_back(StringAndFrequency(i->first, i->second));
275  if (result.size() > maxitems) {
276  // Make the list back into a heap.
277  if (is_heap) {
278  // Only the new element isn't in the right place.
279  push_heap(result.begin(), result.end(), cmpfn);
280  } else {
281  // Need to build heap from scratch.
282  make_heap(result.begin(), result.end(), cmpfn);
283  is_heap = true;
284  }
285  pop_heap(result.begin(), result.end(), cmpfn);
286  result.pop_back();
287  }
288  }
289 
290  if (is_heap) {
291  sort_heap(result.begin(), result.end(), cmpfn);
292  } else {
293  sort(result.begin(), result.end(), cmpfn);
294  }
295 }
296 
297 void
298 ValueCountMatchSpy::operator()(const Document &doc, double) {
299  Assert(internal.get());
300  ++(internal->total);
301  string val(doc.get_value(internal->slot));
302  if (!val.empty()) ++(internal->values[val]);
303 }
304 
306 ValueCountMatchSpy::values_begin() const
307 {
308  Assert(internal.get());
309  return Xapian::TermIterator(new ValueCountTermList(internal.get()));
310 }
311 
313 ValueCountMatchSpy::top_values_begin(size_t maxvalues) const
314 {
315  Assert(internal.get());
316  AutoPtr<StringAndFreqTermList> termlist(new StringAndFreqTermList);
317  get_most_frequent_items(termlist->values, internal->values, maxvalues);
318  termlist->init();
319  return Xapian::TermIterator(termlist.release());
320 }
321 
322 MatchSpy *
323 ValueCountMatchSpy::clone() const {
324  Assert(internal.get());
325  return new ValueCountMatchSpy(internal->slot);
326 }
327 
328 string
330  return "Xapian::ValueCountMatchSpy";
331 }
332 
333 string
334 ValueCountMatchSpy::serialise() const {
335  Assert(internal.get());
336  string result;
337  result += encode_length(internal->slot);
338  return result;
339 }
340 
341 MatchSpy *
342 ValueCountMatchSpy::unserialise(const string & s, const Registry &) const
343 {
344  const char * p = s.data();
345  const char * end = p + s.size();
346 
347  valueno new_slot;
348  decode_length(&p, end, new_slot);
349  if (p != end) {
350  throw NetworkError("Junk at end of serialised ValueCountMatchSpy");
351  }
352 
353  return new ValueCountMatchSpy(new_slot);
354 }
355 
356 string
357 ValueCountMatchSpy::serialise_results() const {
358  LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", NO_ARGS);
359  Assert(internal.get());
360  string result;
361  result += encode_length(internal->total);
362  result += encode_length(internal->values.size());
363  for (map<string, doccount>::const_iterator i = internal->values.begin();
364  i != internal->values.end(); ++i) {
365  result += encode_length(i->first.size());
366  result += i->first;
367  result += encode_length(i->second);
368  }
369  RETURN(result);
370 }
371 
372 void
373 ValueCountMatchSpy::merge_results(const string & s) {
374  LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s);
375  Assert(internal.get());
376  const char * p = s.data();
377  const char * end = p + s.size();
378 
380  decode_length(&p, end, n);
381  internal->total += n;
382 
383  map<string, doccount>::size_type items;
384  decode_length(&p, end, items);
385  while (items != 0) {
386  size_t vallen;
387  decode_length_and_check(&p, end, vallen);
388  string val(p, vallen);
389  p += vallen;
390  doccount freq;
391  decode_length(&p, end, freq);
392  internal->values[val] += freq;
393  --items;
394  }
395  if (p != end) {
396  throw NetworkError("Junk at end of serialised ValueCountMatchSpy "
397  "results");
398  }
399 }
400 
401 string
402 ValueCountMatchSpy::get_description() const {
403  string d = "ValueCountMatchSpy(";
404  if (internal.get()) {
405  d += str(internal->total);
406  d += " docs seen, looking in ";
407  d += str(internal->values.size());
408  d += " slots)";
409  } else {
410  d += ")";
411  }
412  return d;
413 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
Define the XAPIAN_NORETURN macro.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: matchspy.cc:208
A termlist iterator over a vector of StringAndFrequency objects.
Definition: matchspy.cc:187
length encoded as a string
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: matchspy.cc:146
TermList * skip_to(const string &term)
Skip forward to the specified term.
Definition: matchspy.cc:127
string get_termname() const
Return the termname at the current position.
Definition: matchspy.cc:105
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
Definition: matchspy.cc:142
Xapian::doccount get_frequency() const
Return the frequency.
Definition: matchspy.cc:163
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:283
A string with a corresponding frequency.
Definition: matchspy.cc:151
Abstract base class for match spies.
Definition: matchspy.h:49
A termlist iterator over the contents of a ValueCountMatchSpy.
Definition: matchspy.cc:91
ValueCountTermList(ValueCountMatchSpy::Internal *spy_)
Definition: matchspy.cc:98
intrusive_ptr< Xapian::ValueCountMatchSpy::Internal > spy
Definition: matchspy.cc:95
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:488
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Convert types to std::string.
std::string get_string() const
Return the string.
Definition: matchspy.cc:160
std::string encode_length(T len)
Encode a length as a variable-length string.
Definition: length.h:36
static void unsupported_method()
Definition: matchspy.cc:86
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: matchspy.cc:238
Hierarchy of classes which Xapian can throw as exceptions.
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
string get_termname() const
Return the termname at the current position.
Definition: matchspy.cc:202
map< string, Xapian::doccount >::const_iterator it
Definition: matchspy.cc:93
MatchSpy implementation.
bool operator()(const StringAndFrequency &a, const StringAndFrequency &b) const
Return true if a has a higher frequency than b.
Definition: matchspy.cc:178
std::map< std::string, Xapian::doccount > values
The values seen so far, together with their frequency.
Definition: matchspy.h:221
Registry for user subclasses.
Definition: registry.h:47
vector< StringAndFrequency >::const_iterator it
Definition: matchspy.cc:189
string str(int value)
Convert int to std::string.
Definition: str.cc:90
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
Definition: matchspy.cc:239
StringAndFrequency(const std::string &str_, Xapian::doccount frequency_)
Construct a StringAndFrequency object.
Definition: matchspy.cc:156
TermList * next()
Advance the current position to the next term in the termlist.
Definition: matchspy.cc:117
Xapian::doccount frequency
Definition: matchspy.cc:153
Class for iterating over term positions.
bool at_end() const
Return true if the current position is past the last term in this list.
Definition: matchspy.cc:232
static void get_most_frequent_items(vector< StringAndFrequency > &result, const map< string, doccount > &items, size_t maxitems)
Get the most frequent items from a map from string to frequency.
Definition: matchspy.cc:262
StringAndFreqCmpByFreq()
Default constructor.
Definition: matchspy.cc:174
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: matchspy.cc:111
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: matchspy.cc:237
Class for counting the frequencies of values in the matching documents.
Definition: matchspy.h:205
bool at_end() const
Return true if the current position is past the last term in this list.
Definition: matchspy.cc:135
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: matchspy.cc:141
void decode_length_and_check(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:112
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: matchspy.cc:140
Indicates a problem communicating with a remote database.
Definition: error.h:803
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
Various handy helpers which std::string really should provide.
Abstract base class for termlists.
void init()
init should be called after the values have been set, but before iteration begins.
Definition: matchspy.cc:197
TermList * skip_to(const string &term)
Skip forward to the specified term.
Definition: matchspy.cc:224
Various assertion macros.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: matchspy.cc:243
API for working with documents.
std::string str
Definition: matchspy.cc:152
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:81
TermList * next()
Advance the current position to the next term in the termlist.
Definition: matchspy.cc:214
std::string get_value(Xapian::valueno slot) const
Get value by number.
Definition: omdocument.cc:64
void decode_length(const char **p, const char *end, unsigned &out)
Decode a length encoded by encode_length.
Definition: length.cc:94
A handle representing a document in a Xapian database.
Definition: document.h:61
Compare two StringAndFrequency objects.
Definition: matchspy.cc:171
Wrapper around standard unique_ptr template.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
Class for looking up user subclasses during unserialisation.
vector< StringAndFrequency > values
Definition: matchspy.cc:192
parsing a user query string to build a Xapian::Query object