xapian-core  2.0.0
matchspy.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007-2024 Olly Betts
5  * Copyright (C) 2007,2009 Lemur Consulting Ltd
6  * Copyright (C) 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 #include <xapian/matchspy.h>
25 
26 #include <xapian/document.h>
27 #include <xapian/error.h>
28 #include <xapian/queryparser.h>
29 #include <xapian/registry.h>
30 
31 #include <map>
32 #include <memory>
33 #include <string>
34 #include <string_view>
35 #include <vector>
36 
37 #include "debuglog.h"
38 #include "heap.h"
39 #include "omassert.h"
40 #include "pack.h"
41 #include "stringutils.h"
42 #include "str.h"
43 #include "termlist.h"
44 
45 using namespace std;
46 using namespace Xapian;
48 
49 MatchSpy::~MatchSpy() {}
50 
51 MatchSpy *
52 MatchSpy::clone() const {
53  throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented");
54 }
55 
56 string
57 MatchSpy::name() const {
58  throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented");
59 }
60 
61 string
62 MatchSpy::serialise() const {
63  throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented");
64 }
65 
66 MatchSpy *
67 MatchSpy::unserialise(const string &, const Registry &) const {
68  throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented");
69 }
70 
71 string
72 MatchSpy::serialise_results() const {
73  throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented");
74 }
75 
76 void
77 MatchSpy::merge_results(const string &) {
78  throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
79 }
80 
81 string
82 MatchSpy::get_description() const {
83  return "Xapian::MatchSpy()";
84 }
85 
86 [[noreturn]]
87 static void unsupported_method() {
88  throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
89 }
90 
92 class ValueCountTermList final : public TermList {
93  private:
94  map<string, Xapian::doccount>::const_iterator it;
95  bool started;
97  public:
98 
100  : spy(spy_)
101  {
102  it = spy->values.begin();
103  started = false;
104  }
105 
107  Assert(started);
108  Assert(it != spy->values.end());
109  return it->second;
110  }
111 
113  if (!started) {
114  started = true;
115  } else {
116  Assert(it != spy->values.end());
117  ++it;
118  }
119  if (it == spy->values.end()) {
120  return this;
121  }
122  current_term = it->first;
123  return NULL;
124  }
125 
126  TermList* skip_to(string_view term) {
127  while (it != spy->values.end() && it->first < term) {
128  ++it;
129  }
130  started = true;
131  if (it == spy->values.end()) {
132  return this;
133  }
134  current_term = it->first;
135  return NULL;
136  }
137 
142 };
143 
147  std::string str;
149  public:
151  StringAndFrequency(const std::string & str_, Xapian::doccount frequency_)
152  : str(str_), frequency(frequency_) {}
153 
155  std::string get_string() const { return str; }
156 
158  Xapian::doccount get_frequency() const { return frequency; }
159 };
160 
167  public:
170 
174  const StringAndFrequency &b) const {
175  if (a.get_frequency() > b.get_frequency()) return true;
176  if (a.get_frequency() < b.get_frequency()) return false;
177  return a.get_string() < b.get_string();
178  }
179 };
180 
182 class StringAndFreqTermList final : public TermList {
183  private:
184  vector<StringAndFrequency>::const_iterator it;
185  bool started;
186  public:
187  vector<StringAndFrequency> values;
188 
192  void init() {
193  it = values.begin();
194  started = false;
195  }
196 
198  Assert(started);
199  Assert(it != values.end());
200  return it->get_frequency();
201  }
202 
204  if (!started) {
205  started = true;
206  } else {
207  Assert(it != values.end());
208  ++it;
209  }
210  if (it == values.end()) {
211  return this;
212  }
213  current_term = it->get_string();
214  return NULL;
215  }
216 
217  TermList* skip_to(string_view term) {
218  while (it != values.end() && it->get_string() < term) {
219  ++it;
220  }
221  started = true;
222  if (it != values.end()) {
223  current_term = it->get_string();
224  }
225  return NULL;
226  }
227 
232 };
233 
249 static void
250 get_most_frequent_items(vector<StringAndFrequency> & result,
251  const map<string, doccount> & items,
252  size_t maxitems)
253 {
254  Assert(maxitems != 0);
255  result.clear();
256  result.reserve(maxitems);
258  bool is_heap = false;
259 
260  for (map<string, doccount>::const_iterator i = items.begin();
261  i != items.end(); ++i) {
262  if (result.size() < maxitems) {
263  result.emplace_back(i->first, i->second);
264  continue;
265  }
266 
267  // We have the desired number of items, so it's one-in one-out from
268  // now on.
269  Assert(result.size() == maxitems);
270  if (!is_heap) {
271  Heap::make(result.begin(), result.end(), cmpfn);
272  is_heap = true;
273  }
274 
275  StringAndFrequency new_item(i->first, i->second);
276  if (!cmpfn(new_item, result[0])) {
277  // The candidate is worse than the worst of the current top N.
278  continue;
279  }
280 
281  result[0] = std::move(new_item);
282  Heap::replace(result.begin(), result.end(), cmpfn);
283  }
284 
285  if (is_heap) {
286  Heap::sort(result.begin(), result.end(), cmpfn);
287  } else {
288  sort(result.begin(), result.end(), cmpfn);
289  }
290 }
291 
292 void
293 ValueCountMatchSpy::operator()(const Document &doc, double) {
294  Assert(internal);
295  ++(internal->total);
296  string val(doc.get_value(internal->slot));
297  if (!val.empty()) ++(internal->values[val]);
298 }
299 
301 ValueCountMatchSpy::values_begin() const
302 {
303  Assert(internal);
304  return Xapian::TermIterator(new ValueCountTermList(internal.get()));
305 }
306 
308 ValueCountMatchSpy::top_values_begin(size_t maxvalues) const
309 {
310  Assert(internal);
311  unique_ptr<StringAndFreqTermList> termlist;
312  if (usual(maxvalues > 0)) {
313  termlist.reset(new StringAndFreqTermList);
314  get_most_frequent_items(termlist->values, internal->values, maxvalues);
315  termlist->init();
316  }
317  return Xapian::TermIterator(termlist.release());
318 }
319 
320 MatchSpy *
321 ValueCountMatchSpy::clone() const {
322  Assert(internal);
323  return new ValueCountMatchSpy(internal->slot);
324 }
325 
326 string
328  return "Xapian::ValueCountMatchSpy";
329 }
330 
331 string
332 ValueCountMatchSpy::serialise() const {
333  Assert(internal);
334  string result;
335  pack_uint_last(result, internal->slot);
336  return result;
337 }
338 
339 MatchSpy *
340 ValueCountMatchSpy::unserialise(const string & s, const Registry &) const
341 {
342  const char * p = s.data();
343  const char * end = p + s.size();
344 
345  valueno new_slot;
346  if (!unpack_uint_last(&p, end, &new_slot)) {
348  }
349 
350  return new ValueCountMatchSpy(new_slot);
351 }
352 
353 string
354 ValueCountMatchSpy::serialise_results() const {
355  LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", NO_ARGS);
356  Assert(internal);
357  string result;
358  pack_uint(result, internal->total);
359  for (auto&& item : internal->values) {
360  pack_string(result, item.first);
361  pack_uint(result, item.second);
362  }
363  RETURN(result);
364 }
365 
366 void
367 ValueCountMatchSpy::merge_results(const string & s) {
368  LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s);
369  Assert(internal);
370  const char * p = s.data();
371  const char * end = p + s.size();
372 
374  if (!unpack_uint(&p, end, &n)) {
376  }
377  internal->total += n;
378 
379  string val;
380  while (p != end) {
381  doccount freq;
382  if (!unpack_string(&p, end, val) ||
383  !unpack_uint(&p, end, &freq)) {
385  }
386  internal->values[val] += freq;
387  }
388 }
389 
390 string
391 ValueCountMatchSpy::get_description() const {
392  string d = "ValueCountMatchSpy(";
393  if (internal) {
394  d += str(internal->total);
395  d += " docs seen, looking in ";
396  d += str(internal->values.size());
397  d += " slots)";
398  } else {
399  d += ")";
400  }
401  return d;
402 }
char name[9]
Definition: dbcheck.cc:57
Compare two StringAndFrequency objects.
Definition: matchspy.cc:166
StringAndFreqCmpByFreq()
Default constructor.
Definition: matchspy.cc:169
bool operator()(const StringAndFrequency &a, const StringAndFrequency &b) const
Return true if a has a higher frequency than b.
Definition: matchspy.cc:173
A termlist iterator over a vector of StringAndFrequency objects.
Definition: matchspy.cc:182
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: matchspy.cc:229
void init()
init should be called after the values have been set, but before iteration begins.
Definition: matchspy.cc:192
PositionList * positionlist_begin() const
Return PositionList for the current position.
Definition: matchspy.cc:230
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: matchspy.cc:231
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: matchspy.cc:228
vector< StringAndFrequency >::const_iterator it
Definition: matchspy.cc:184
vector< StringAndFrequency > values
Definition: matchspy.cc:187
TermList * next()
Advance the current position to the next term in the termlist.
Definition: matchspy.cc:203
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: matchspy.cc:197
TermList * skip_to(string_view term)
Skip forward to the specified term.
Definition: matchspy.cc:217
A string with a corresponding frequency.
Definition: matchspy.cc:146
std::string get_string() const
Return the string.
Definition: matchspy.cc:155
StringAndFrequency(const std::string &str_, Xapian::doccount frequency_)
Construct a StringAndFrequency object.
Definition: matchspy.cc:151
Xapian::doccount frequency
Definition: matchspy.cc:148
std::string str
Definition: matchspy.cc:147
Xapian::doccount get_frequency() const
Return the frequency.
Definition: matchspy.cc:158
A termlist iterator over the contents of a ValueCountMatchSpy.
Definition: matchspy.cc:92
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: matchspy.cc:106
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: matchspy.cc:139
ValueCountTermList(ValueCountMatchSpy::Internal *spy_)
Definition: matchspy.cc:99
TermList * next()
Advance the current position to the next term in the termlist.
Definition: matchspy.cc:112
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: matchspy.cc:141
TermList * skip_to(string_view term)
Skip forward to the specified term.
Definition: matchspy.cc:126
PositionList * positionlist_begin() const
Return PositionList for the current position.
Definition: matchspy.cc:140
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: matchspy.cc:138
intrusive_ptr< Xapian::ValueCountMatchSpy::Internal > spy
Definition: matchspy.cc:96
map< string, Xapian::doccount >::const_iterator it
Definition: matchspy.cc:94
Class representing a document.
Definition: document.h:64
std::string get_value(Xapian::valueno slot) const
Read a value slot in this document.
Definition: document.cc:185
A smart pointer that uses intrusive reference counting.
Definition: intrusive_ptr.h:83
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
Abstract base class for match spies.
Definition: matchspy.h:50
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Registry for user subclasses.
Definition: registry.h:47
Abstract base class for termlists.
Definition: termlist.h:42
Class for iterating over a list of terms.
Definition: termiterator.h:41
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Class for counting the frequencies of values in the matching documents.
Definition: matchspy.h:205
#define usual(COND)
Definition: config.h:608
string term
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Class representing a document.
Hierarchy of classes which Xapian can throw as exceptions.
C++ STL heap implementation with extensions.
static void get_most_frequent_items(vector< StringAndFrequency > &result, const map< string, doccount > &items, size_t maxitems)
Get the most frequent items from a map from string to frequency.
Definition: matchspy.cc:250
static void unsupported_method()
Definition: matchspy.cc:87
MatchSpy implementation.
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
void unpack_throw_serialisation_error(const char *p)
Throw appropriate SerialisationError.
Definition: pack.cc:29
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:100
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
void pack_string(std::string &s, std::string_view value)
Append an encoded std::string to a string.
Definition: pack.h:442
parsing a user query string to build a Xapian::Query object
Class for looking up user subclasses during unserialisation.
Convert types to std::string.
Various handy string-related helpers.
Abstract base class for termlists.