xapian-core  2.0.0
api_matchspy.cc
Go to the documentation of this file.
1 
4 /* Copyright 2007,2009 Lemur Consulting Ltd
5  * Copyright 2009,2011,2012,2015,2019 Olly Betts
6  * Copyright 2010 Richard Boulton
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "api_matchspy.h"
26 
27 #include <xapian.h>
28 
29 #include <vector>
30 
31 #include "backendmanager.h"
32 #include "str.h"
33 #include "testsuite.h"
34 #include "testutils.h"
35 #include "apitest.h"
36 
37 using namespace std;
38 
39 // #######################################################################
40 // # Tests start here
41 
43  public:
44  // Vector which will be filled with all the document contents seen.
45  std::vector<std::string> seen;
46 
47  void operator()(const Xapian::Document& doc, double) override {
48  // Note that this is not recommended usage of get_data() - you
49  // generally shouldn't call get_data() from inside a MatchSpy, because
50  // it is (likely to be) a slow operation resulting in considerable IO.
51  seen.push_back(doc.get_data());
52  }
53 };
54 
55 // Basic test of a matchspy.
56 DEFINE_TESTCASE(matchspy1, backend && !remote) {
57  Xapian::Database db(get_database("apitest_simpledata"));
58  Xapian::Enquire enquire(db);
59  enquire.set_query(Xapian::Query("this"));
60 
61  SimpleMatchSpy myspy;
62 
63  Xapian::MSet nospymset = enquire.get_mset(0, 100);
64  enquire.add_matchspy(&myspy);
65  Xapian::MSet spymset = enquire.get_mset(0, 100);
66 
67  // Check that the match estimates aren't affected by the matchspy.
68  TEST_EQUAL(nospymset, spymset);
69 
70  vector<bool> docid_checked(db.get_lastdocid());
71 
72  // Check that we get the expected number of matches, and that the stored
73  // document contents are right.
74  Xapian::MSetIterator i = spymset.begin();
75  TEST(i != spymset.end());
76  TEST_EQUAL(spymset.size(), 6);
77  TEST_EQUAL(myspy.seen.size(), spymset.size());
78 
79  std::sort(myspy.seen.begin(), myspy.seen.end());
80 
81  std::vector<std::string> seen2;
82  for ( ; i != spymset.end(); ++i) {
83  const Xapian::Document doc(i.get_document());
84  seen2.push_back(doc.get_data());
85  }
86  std::sort(seen2.begin(), seen2.end());
87 
88  TEST_EQUAL(myspy.seen.size(), seen2.size());
89  std::vector<std::string>::const_iterator j = myspy.seen.begin();
90  std::vector<std::string>::const_iterator j2 = seen2.begin();
91  for (; j != myspy.seen.end(); ++j, ++j2) {
92  TEST_EQUAL(*j, *j2);
93  }
94 }
95 
96 static string values_to_repr(const Xapian::ValueCountMatchSpy & spy) {
97  string resultrepr("|");
98  for (Xapian::TermIterator i = spy.values_begin();
99  i != spy.values_end();
100  ++i) {
101  resultrepr += *i;
102  resultrepr += ':';
103  resultrepr += str(i.get_termfreq());
104  resultrepr += '|';
105  }
106  return resultrepr;
107 }
108 
// Fill db with 25 documents for the matchspy tests.
//
// Document c (1 <= c <= 25) gets the data "Document <c>", the term "all",
// one term "XFACT<f>" for every f which divides c, and four value slots
// (described by the comments in the loop body).
//
// NOTE(review): original line 110 (the function name and parameter list) is
// absent from this listing; the body refers to a parameter named db.
109 static void
111 {
112  for (int c = 1; c <= 25; ++c) {
113  Xapian::Document doc;
114  doc.set_data("Document " + str(c));
 // Counts how many values of factor divide c exactly.
115  int factors = 0;
116  for (int factor = 1; factor <= c; ++factor) {
 // add_term("all") runs once per iteration of this loop, i.e. c times for
 // document c - presumably so that the wdf of "all" differs between
 // documents; TODO confirm.
117  doc.add_term("all");
118  if (c % factor == 0) {
119  doc.add_term("XFACT" + str(factor));
120  ++factors;
121  }
122  }
123 
124  // Number of factors.
125  doc.add_value(0, str(factors));
126  // Units digits.
127  doc.add_value(1, str(c % 10));
128  // Constant.
129  doc.add_value(2, "fish");
130  // Number of digits.
131  doc.add_value(3, str(str(c).size()));
132 
133  db.add_document(doc);
134  }
135 }
136 
// Runs the query "all" with three match spies (spy0, spy1, spy3) attached
// and checks both the number of documents each spy tallied and the
// per-value counts rendered by values_to_repr().
//
// NOTE(review): original lines 139, 141-143 and 151 are absent from this
// listing, so the declarations of db, spy0, spy1 and spy3 and the body of
// the if statement below are not visible.  Presumably spyN watches value
// slot N - confirm against the full source.
137 DEFINE_TESTCASE(matchspy2, backend)
138 {
140 
144 
145  Xapian::Enquire enq(db);
146 
147  enq.set_query(Xapian::Query("all"));
148  if (db.size() > 1) {
149  // Without this, we short-cut on the second shard because we don't get
150  // the documents in ascending weight order.
152  }
153 
154  enq.add_matchspy(&spy0);
155  enq.add_matchspy(&spy1);
156  enq.add_matchspy(&spy3);
 // Only 10 results are requested, but each spy is still expected to have
 // tallied 25 documents (see the get_total() checks below).
157  Xapian::MSet mset = enq.get_mset(0, 10);
158 
159  TEST_EQUAL(spy0.get_total(), 25);
160  TEST_EQUAL(spy1.get_total(), 25);
161  TEST_EQUAL(spy3.get_total(), 25);
162 
 // Expected values_to_repr() output for spy0, spy1 and spy3 respectively.
163  static const char * const results[] = {
164  "|1:1|2:9|3:3|4:7|5:1|6:3|8:1|",
165  "|0:2|1:3|2:3|3:3|4:3|5:3|6:2|7:2|8:2|9:2|",
166  "|1:9|2:16|",
167  };
168  TEST_STRINGS_EQUAL(values_to_repr(spy0), results[0]);
169  TEST_STRINGS_EQUAL(values_to_repr(spy1), results[1]);
170  TEST_STRINGS_EQUAL(values_to_repr(spy3), results[2]);
171 }
172 
// Checks top_values_begin()/top_values_end() of six spies attached to two
// Enquire objects: enqa uses the default ordering, enqb sorts by value slot 0.
//
// NOTE(review): original lines 175, 180-185 and 194 are absent from this
// listing, so the declarations of db and of spya0/spya1/spya3/spyb0/spyb1/
// spyb3 and the body of the if statement below are not visible.
173 DEFINE_TESTCASE(matchspy4, backend)
174 {
176 
177  // We're going to run the match twice - once sorted by relevance, and once
178  // sorted by a value. This is a regression test - the matcher used to fail
179  // to show some documents to the spy when sorting by non-pure-relevance.
186 
187  Xapian::Enquire enqa(db);
188  Xapian::Enquire enqb(db);
189 
190  enqa.set_query(Xapian::Query("all"));
191  if (db.size() > 1) {
192  // Without this, we short-cut on the second shard because we don't get
193  // the documents in ascending weight order.
195  }
196  enqb.set_query(Xapian::Query("all"));
197 
198  enqa.add_matchspy(&spya0);
199  enqa.add_matchspy(&spya1);
200  enqa.add_matchspy(&spya3);
201  enqb.add_matchspy(&spyb0);
202  enqb.add_matchspy(&spyb1);
203  enqb.add_matchspy(&spyb3);
204 
205  Xapian::MSet mseta = enqa.get_mset(0, 10);
206  enqb.set_sort_by_value(0, false);
 // The third argument is get_mset()'s checkatleast parameter.
207  Xapian::MSet msetb = enqb.get_mset(0, 10, 100);
208 
 // Every spy is expected to have tallied 25 documents even though only 10
 // results were requested from each Enquire.
209  TEST_EQUAL(spya0.get_total(), 25);
210  TEST_EQUAL(spya1.get_total(), 25);
211  TEST_EQUAL(spya3.get_total(), 25);
212  TEST_EQUAL(spyb0.get_total(), 25);
213  TEST_EQUAL(spyb1.get_total(), 25);
214  TEST_EQUAL(spyb3.get_total(), 25);
215 
 // Expected top-values strings, indexed in parallel with the spies vector
 // built below; the trailing NULL terminates the checking loop.  The "|"
 // entries line up with the NULL spies and are never compared.
216  static const char * const results[] = {
217  "|2:9|4:7|3:3|6:3|1:1|5:1|8:1|",
218  "|1:3|2:3|3:3|4:3|5:3|0:2|6:2|7:2|8:2|9:2|",
219  "|",
220  "|2:16|1:9|",
221  "|2:9|4:7|3:3|6:3|1:1|5:1|8:1|",
222  "|1:3|2:3|3:3|4:3|5:3|0:2|6:2|7:2|8:2|9:2|",
223  "|",
224  "|2:16|1:9|",
225  NULL
226  };
227  std::vector<Xapian::ValueCountMatchSpy *> spies;
228  spies.push_back(&spya0);
229  spies.push_back(&spya1);
230  spies.push_back(NULL);
231  spies.push_back(&spya3);
232  spies.push_back(&spyb0);
233  spies.push_back(&spyb1);
234  spies.push_back(NULL);
235  spies.push_back(&spyb3);
236  for (Xapian::valueno v = 0; results[v]; ++v) {
237  tout << "value " << v << '\n';
238  Xapian::ValueCountMatchSpy * spy = spies[v];
239  string allvals_str("|");
 // Entries with a NULL spy are skipped entirely.
240  if (spy != NULL) {
 // Build the "|value:freq|...|" string from top_values_begin(100) and
 // count how many values were yielded.
241  size_t allvals_size = 0;
242  for (Xapian::TermIterator i = spy->top_values_begin(100);
243  i != spy->top_values_end(100);
244  ++i, ++allvals_size) {
245  allvals_str += *i;
246  allvals_str += ':';
247  allvals_str += str(i.get_termfreq());
248  allvals_str += '|';
249  }
250  tout << allvals_str << '\n';
251  TEST_STRINGS_EQUAL(allvals_str, results[v]);
252 
 // For every smaller limit, walk top_values_begin(count) in lockstep with
 // the full list and check value and frequency agree at each position; the
 // loop stops as soon as either iterator reaches its end.
253  for (size_t count = 0; count < allvals_size; ++count) {
254  tout << "count " << count << '\n';
255  for (Xapian::TermIterator i = spy->top_values_begin(100),
256  j = spy->top_values_begin(count);
257  i != spy->top_values_end(100) &&
258  j != spy->top_values_end(count);
259  ++i, ++j) {
260  tout << "j " << j << '\n';
261  TEST_EQUAL(*i, *j);
262  TEST_EQUAL(i.get_termfreq(), j.get_termfreq());
263  }
264  }
265  }
266  }
267 }
268 
269 // Test builtin match spies
270 DEFINE_TESTCASE(matchspy5, backend)
271 {
272  Xapian::Database db(get_database("apitest_simpledata"));
273  Xapian::Enquire enquire(db);
274  enquire.set_query(Xapian::Query("this"));
275 
276  Xapian::ValueCountMatchSpy myspy1(1);
277  Xapian::ValueCountMatchSpy myspy2(1);
278 
279  enquire.add_matchspy(&myspy1);
280  enquire.add_matchspy(&myspy2);
281  Xapian::MSet mymset = enquire.get_mset(0, 100);
282  TEST_EQUAL(mymset.size(), 6);
283 
284  Xapian::TermIterator i = myspy1.values_begin();
285  TEST(i != myspy1.values_end());
286  TEST(*i == "h");
287  TEST_EQUAL(i.get_termfreq(), 5);
288  ++i;
289  TEST(i != myspy1.values_end());
290  TEST(*i == "n");
291  TEST_EQUAL(i.get_termfreq(), 1);
292  ++i;
293  TEST(i == myspy1.values_end());
294 
295  i = myspy2.values_begin();
296  TEST(i != myspy2.values_end());
297  TEST(*i == "h");
298  TEST_EQUAL(i.get_termfreq(), 5);
299  ++i;
300  TEST(i != myspy2.values_end());
301  TEST(*i == "n");
302  TEST_EQUAL(i.get_termfreq(), 1);
303  ++i;
304  TEST(i == myspy2.values_end());
305 }
306 
307 class MySpy : public Xapian::MatchSpy {
308  void operator()(const Xapian::Document&, double) override {
309  }
310 };
311 
312 // Test exceptions from matchspy base class, and get_description method.
//
// MySpy overrides only operator(), so the calls made on spy here reach the
// Xapian::MatchSpy base class implementations.
//
// NOTE(review): original lines 317-320 and 322-323 are absent from this
// listing, so the two lines below ending in "));" are the tails of
// statements whose beginnings are not visible.
313 DEFINE_TESTCASE(matchspy6, !backend)
314 {
315  MySpy spy;
316 
321  spy.unserialise(std::string(), Xapian::Registry()));
324  spy.merge_results(std::string()));
325  TEST_EQUAL(spy.get_description(), "Xapian::MatchSpy()");
326 }
327 
// Regression test: per the comments below, merge_results() used to enter an
// infinite loop when the serialised results were followed by extra bytes.
//
// NOTE(review): original lines 328, 331 and 337 are absent from this
// listing, so the declaration of myspy and the merge_results() call itself
// are not visible.
329 DEFINE_TESTCASE(matchspy7, !backend)
330 {
332  string s = myspy.serialise_results();
333  // Append a string which overflows a 64-bit type when decoded with
334  // pack_uint().
335  s += "xxxxxxxxx";
336  // This merge_results() call used to enter an infinite loop.
338 }
static void make_matchspy2_db(Xapian::WritableDatabase &db, const string &)
DEFINE_TESTCASE(matchspy1, backend &&!remote)
Definition: api_matchspy.cc:56
static string values_to_repr(const Xapian::ValueCountMatchSpy &spy)
Definition: api_matchspy.cc:96
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:47
test functionality of the Xapian API
Base class for backend handling in test harness.
void operator()(const Xapian::Document &, double) override
Register a document with the match spy.
void operator()(const Xapian::Document &doc, double) override
Register a document with the match spy.
Definition: api_matchspy.cc:47
std::vector< std::string > seen
Definition: api_matchspy.cc:45
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:2163
An indexed database of documents.
Definition: database.h:75
size_t size() const
Return number of shards in this Database object.
Definition: database.cc:105
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: database.cc:239
Class representing a document.
Definition: document.h:64
void set_data(std::string_view data)
Set the document data.
Definition: document.cc:81
std::string get_data() const
Get the document data.
Definition: document.cc:75
void add_term(std::string_view term, Xapian::termcount wdf_inc=1)
Add a term to this document.
Definition: document.cc:87
void add_value(Xapian::valueno slot, std::string_view value)
Add a value to a slot in this document.
Definition: document.cc:191
Querying session.
Definition: enquire.h:57
void set_weighting_scheme(const Weight &weight)
Set the weighting scheme to use.
Definition: enquire.cc:85
void add_matchspy(MatchSpy *spy) XAPIAN_NONNULL()
Add a matchspy.
Definition: enquire.cc:179
MSet get_mset(doccount first, doccount maxitems, doccount checkatleast=0, const RSet *rset=NULL, const MatchDecider *mdecider=NULL) const
Run the query.
Definition: enquire.cc:200
void set_query(const Query &query, termcount query_length=0)
Set the query.
Definition: enquire.cc:72
void set_sort_by_value(valueno sort_key, bool reverse)
Set the sorting to be by value only.
Definition: enquire.cc:103
Iterator over a Xapian::MSet.
Definition: mset.h:535
Xapian::Document get_document() const
Get the Document object for the current position.
Definition: msetiterator.cc:45
Class representing a list of search results.
Definition: mset.h:46
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: mset.cc:374
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:786
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:791
Abstract base class for match spies.
Definition: matchspy.h:50
virtual std::string name() const
Return the name of this match spy.
Definition: matchspy.cc:57
virtual std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: matchspy.cc:62
virtual std::string get_description() const
Return a string describing this object.
Definition: matchspy.cc:82
virtual void merge_results(const std::string &serialised)
Unserialise some results, and merge them into this matchspy.
Definition: matchspy.cc:77
virtual std::string serialise_results() const
Serialise the results of this match spy.
Definition: matchspy.cc:72
virtual MatchSpy * clone() const
Clone the match spy.
Definition: matchspy.cc:52
virtual MatchSpy * unserialise(const std::string &serialised, const Registry &context) const
Unserialise parameters.
Definition: matchspy.cc:67
Class representing a query.
Definition: query.h:45
Registry for user subclasses.
Definition: registry.h:47
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
Class for iterating over a list of terms.
Definition: termiterator.h:41
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Class for counting the frequencies of values in the matching documents.
Definition: matchspy.h:205
TermIterator top_values_begin(size_t maxvalues) const
Get an iterator over the most frequent values seen in the slot.
Definition: matchspy.cc:308
virtual std::string serialise_results() const
Serialise the results of this match spy.
Definition: matchspy.cc:354
TermIterator top_values_end(size_t) const noexcept
End iterator corresponding to top_values_begin()
Definition: matchspy.h:272
virtual void merge_results(const std::string &serialised)
Unserialise some results, and merge them into this matchspy.
Definition: matchspy.cc:367
size_t get_total() const noexcept
Return the total number of documents tallied.
Definition: matchspy.h:241
TermIterator values_end() const noexcept
End iterator corresponding to values_begin()
Definition: matchspy.h:255
TermIterator values_begin() const
Get an iterator over the values seen in the slot.
Definition: matchspy.cc:301
This class provides read/write access to a database.
Definition: database.h:964
Xapian::docid add_document(const Xapian::Document &doc)
Add a document to the database.
Definition: database.cc:561
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
Convert types to std::string.
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
a generic test suite engine
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:276
#define TEST_STRINGS_EQUAL(a, b)
Test for equality of two strings.
Definition: testsuite.h:285
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:273
Xapian-specific test helper functions and macros.
#define TEST_EXCEPTION(TYPE, CODE)
Check that CODE throws exactly Xapian exception TYPE.
Definition: testutils.h:112
Public interfaces for the Xapian library.