xapian-core  2.0.0
api_opsynonym.cc
Go to the documentation of this file.
1 
4 /* Copyright 2009,2011,2014,2022 Olly Betts
5  * Copyright 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "api_opsynonym.h"
25 
26 #include <map>
27 #include <set>
28 #include <vector>
29 
30 #include <xapian.h>
31 
32 #include "backendmanager.h"
33 #include "testsuite.h"
34 #include "testutils.h"
35 
36 #include "apitest.h"
37 
38 using namespace std;
39 
40 // #######################################################################
41 // # Tests start here
42 
44  // How many results should have the same weight when combined with
45  // OP_SYNONYM instead of OP_OR.
47  // How many results should have a different weight when combined with
48  // OP_SYNONYM instead of OP_OR.
50  // How many subqueries.
51  unsigned n_subqs;
52  // The subqueries (use NOQ for unused ones).
53  Xapian::Query subqs[4];
54 };
55 
56 #define NOQ Xapian::Query::MatchNothing
58  {
59  // Single term - all 33 results should be same weight.
60  33, 0, 1,
61  { Xapian::Query("date"), NOQ, NOQ, NOQ }
62  },
63  {
64  // Two terms, which co-occur in some documents.
65  //
66  // All 34 results should be different.
67  0, 34, 2,
68  { Xapian::Query("sky"), Xapian::Query("date"), NOQ, NOQ }
69  },
70  {
71  // Two terms which are entirely disjoint, and where the maximum weight
72  // doesn't occur in the first or second match.
73  //
74  // All 18 results should be different.
75  0, 18, 2,
76  { Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ, NOQ }
77  },
78  {
79  // All 34 results should be different.
80  0, 34, 2,
81  {
82  Xapian::Query("date"),
84  Xapian::Query("sky"),
85  Xapian::Query("glove")),
86  NOQ, NOQ
87  }
88  },
89  {
90  // All 34 results should be different.
91  0, 34, 2,
92  {
93  Xapian::Query("date"),
95  Xapian::Query("sky"),
96  Xapian::Query("date")),
97  NOQ, NOQ
98  }
99  },
100  {
101  // All 34 results should be different.
102  0, 34, 2,
103  {
104  Xapian::Query("date"),
106  Xapian::Query("sky"),
107  Xapian::Query("date")),
108  NOQ, NOQ
109  }
110  },
111  {
112  // All 34 results should be different.
113  0, 34, 2,
114  {
115  Xapian::Query("date"),
117  Xapian::Query("sky"),
118  Xapian::Query("date")),
119  NOQ, NOQ
120  }
121  },
122  {
123  // The AND only matches 1 document, so the estimated termfreq for the
124  // whole synonym works out as 33 (due to rounding), which is the same
125  // as the termfreq for "date". Therefore most of the weights are the
126  // same as just for the pure "date" search, and the only document which
127  // gets a different weight is the one also matched by "sky" (because it
128  // has a wdf boost).
129  32, 1, 2,
130  {
131  Xapian::Query("date"),
133  Xapian::Query("sky"),
134  Xapian::Query("date")),
135  NOQ, NOQ
136  }
137  },
138  {
139  // All 34 results should be different.
140  0, 34, 2,
141  {
142  Xapian::Query("date"),
144  Xapian::Query("sky"),
145  Xapian::Query("date")),
146  NOQ, NOQ
147  }
148  },
149  {
150  // When the top-level operator is OR, the synonym part has an estimated
151  // termfreq of 35. When the top-level operator is SYNONYM, the whole
152  // query has an estimated termfreq of 66, which is rather bogus, but
153  // that's the current situation here (1.2 did better as it flattened
154  // this into a single OP_SYNONYM operator and then merged the two
155  // "date" terms to one with wqf=2). We've decided we shouldn't do such
156  // merging from 1.3.x on (merging to sum the scale_factors is fine, but
157  // we don't do that yet - FIXME).
158  //
159  // Anyway, this means that currently the weights are different for all
160  // matches.
161  0, 34, 2,
162  {
163  Xapian::Query("date"),
165  Xapian::Query("sky"),
166  Xapian::Query("date")),
167  NOQ, NOQ
168  }
169  },
170  {
171  // All 34 results should be different. MAX under SYNONYM should just
172  // be treated as OR.
173  0, 34, 2,
174  {
175  Xapian::Query("date"),
177  Xapian::Query("sky"),
178  Xapian::Query("date")),
179  NOQ, NOQ
180  }
181  },
182  {
183  // All 35 results should be different.
184  0, 35, 4,
185  {
186  Xapian::Query("sky"),
187  Xapian::Query("date"),
188  Xapian::Query("stein"),
189  Xapian::Query("ally")
190  }
191  },
192  {
193  // The estimated term frequency for the synonym is 2 (because the
194  // estimate for the phrase is 0), which is the same as the term
195  // frequency of "attitud". Thus, the synonym gets the same weight as
196  // "attitud", so documents with only "attitud" (but not the phrase) in
197  // them get the same wdf, and have the same total weight. There turns
198  // out to be exactly one such document.
199  1, 3, 2,
200  {
201  Xapian::Query("attitud"),
203  Xapian::Query("german"),
204  Xapian::Query("adventur")),
205  NOQ, NOQ
206  }
207  },
208  {
209  // All 54 results should be different.
210  0, 54, 2,
211  {
212  Xapian::Query("attitud"),
214  Xapian::Query("german"),
216  Xapian::Query("sky"),
217  Xapian::Query("date"))),
218  NOQ, NOQ
219  }
220  }
221 };
222 
223 // Check a synonym search
//
// For every entry of synonym1_data, the entry's subqueries are run against the
// "etext" database twice: once joined with OP_OR and once joined with
// OP_SYNONYM. The test then checks that both runs return the same documents
// and that the number of documents whose weight is equal / unequal between
// the two runs matches the entry's sameweight_count / diffweight_count.
224 DEFINE_TESTCASE(synonym1, backend) {
225  Xapian::Database db(get_database("etext"));
226 
     // NOTE(review): the listing numbering jumps from 226 to 228 here, so one
     // source line is not visible in this view - confirm against the original.
228 
     // Maximum number of results requested from each get_mset() call below;
     // presumably large enough to retrieve every match - TODO confirm.
229  const Xapian::doccount lots = 214;
230 
231  for (const auto& data : synonym1_data) {
     // Only the first n_subqs entries of data.subqs are passed to the Query
     // constructors below.
232  const Xapian::Query * qlist = data.subqs;
233  const Xapian::Query * qlist_end = qlist + data.n_subqs;
234 
235  // Run two queries, one joining the subqueries with OR and one joining
236  // them with SYNONYM.
237  Xapian::Enquire enquire(db);
238 
239  // Do the search with OP_OR, getting all the results.
240  Xapian::Query orquery(Xapian::Query::OP_OR, qlist, qlist_end);
241  enquire.set_query(orquery);
242  Xapian::MSet ormset = enquire.get_mset(0, lots);
243 
244  // Do the search with OP_SYNONYM, getting all the results.
245  Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist, qlist_end);
246  enquire.set_query(synquery);
247  Xapian::MSet synmset = enquire.get_mset(0, lots);
248 
249  tout << "Comparing " << orquery << " with " << synquery << '\n';
250 
251  // Check that the queries return some results.
252  TEST_NOT_EQUAL(synmset.size(), 0);
253  // Check that the queries return the same number of results.
254  TEST_EQUAL(synmset.size(), ormset.size());
     // Record the weight of each docid for both result sets. One counter
     // indexes both MSets, which is safe because their sizes were just
     // checked to be equal.
255  map<Xapian::docid, double> values_or;
256  map<Xapian::docid, double> values_synonym;
257  for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
258  values_or[*ormset[i]] = ormset[i].get_weight();
259  values_synonym[*synmset[i]] = synmset[i].get_weight();
260  }
     // Both maps must hold the same number of distinct docids.
261  TEST_EQUAL(values_or.size(), values_synonym.size());
262 
263  /* Count how many documents have the same weight in the "or" mset and
264  * the "synonym" mset, and how many have a different weight. */
265  int same_weight = 0;
266  int different_weight = 0;
267  for (map<Xapian::docid, double>::const_iterator
268  j = values_or.begin(); j != values_or.end(); ++j) {
269  Xapian::docid did = j->first;
270  // Check that all the results in the or tree make it to the synonym
271  // tree.
272  TEST(values_synonym.find(did) != values_synonym.end());
     // Weights are compared with ==, not with a tolerance.
273  if (values_or[did] == values_synonym[did]) {
274  ++same_weight;
275  } else {
276  ++different_weight;
277  }
278  }
279 
     // Compare the observed counts with the expectations stored in the
     // current data entry.
280  TEST_EQUAL(different_weight, data.diffweight_count);
281  TEST_EQUAL(same_weight, data.sameweight_count);
282 
283  // Do the search with synonym, but just get the top result.
284  // (Regression test - the OR subquery in the synonym postlist tree used
285  // to shortcut incorrectly, and return the wrong result here).
     // enquire still has synquery set from above, so this reruns the
     // SYNONYM search with maxitems = 1.
286  Xapian::MSet mset_top = enquire.get_mset(0, 1);
287  TEST_EQUAL(mset_top.size(), 1);
288  TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
289  }
290 }
291 
292 // Regression test - test a synonym search with an AndPostlist.
//
// Builds (file AND the AND next AND reader) SYNONYM gutenberg, runs it against
// the "etext" database, and then compares the MSet's maximum possible weight
// before and after a second run of the query.
293 DEFINE_TESTCASE(synonym2, backend) {
     // NOTE(review): the listing numbering jumps from 293 to 295, so one
     // source line is not visible here. `query` is assigned below without a
     // visible declaration; presumably the missing line declares it - confirm
     // against the original file.
295  vector<Xapian::Query> subqueries;
296  subqueries.push_back(Xapian::Query("file"));
297  subqueries.push_back(Xapian::Query("the"));
298  subqueries.push_back(Xapian::Query("next"));
299  subqueries.push_back(Xapian::Query("reader"));
     // Combine the four terms with AND ...
300  query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
     // ... then reuse the vector to pair that AND query with "gutenberg"
     // under OP_SYNONYM.
301  subqueries.clear();
302  subqueries.push_back(query);
303  subqueries.push_back(Xapian::Query("gutenberg"));
304  query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
305 
306  tout << query << '\n';
307 
308  Xapian::Database db(get_database("etext"));
309  Xapian::Enquire enquire(db);
310  enquire.set_query(query);
311  Xapian::MSet mset = enquire.get_mset(0, 10);
312  tout << mset << '\n';
313 
314  // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
315  double maxposs = mset.get_max_possible();
     // NOTE(review): the listing numbering jumps from 315 to 317. The
     // assertion at the end expects the second maximum to be 10x the first,
     // so presumably the missing line wraps `query` in OP_SCALE_WEIGHT with
     // a factor of 10 - confirm against the original file.
317  enquire.set_query(query);
318  mset = enquire.get_mset(0, 10);
319  double maxposs2 = mset.get_max_possible();
320 
     // The maximum possible weight must scale by a factor of 10.
321  TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
322 }
323 
// Helper: check that two MSets have the same size and contain the same docids,
// with no docid repeated in either. Failures are reported via TEST_EQUAL/TEST.
//
// NOTE(review): the listing numbering jumps from 324 to 326, so the line
// carrying the function name and first parameter is not visible here. The
// callers in this file use the name check_msets_contain_same_docs and the body
// reads `mset1`, so presumably that line declares
// `const Xapian::MSet & mset1` - confirm against the original file.
324 static void
326  const Xapian::MSet & mset2)
327 {
328  TEST_EQUAL(mset1.size(), mset2.size());
329 
     // Collect the docids of mset1 into a set.
330  set<Xapian::docid> docids;
331  for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
332  docids.insert(*mset1[i]);
333  }
334 
335  // Check that every result in mset2 is also in mset1.
336  for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
337  // Erase each mset2 docid from the set built from mset1; erase() returns
338  // 0 if the docid is absent or was already erased. Since mset1 and mset2
339  // are the same size, success here also rules out repeated docids.
340  TEST(docids.erase(*mset2[j]));
341  }
342 }
343 
344 // Test a synonym search which has had its weight scaled to 0.
//
// Runs a query over "sky" and "date" against the "etext" database, runs the
// query again, and checks that the second run returns the same documents with
// every weight equal to 0.0 while the first run's weights are all non-zero.
345 DEFINE_TESTCASE(synonym3, backend) {
     // NOTE(review): the listing numbering jumps from 345 to 347, so the
     // line that starts the construction of `query` is not visible here;
     // only its two term arguments are. Presumably it builds an OP_SYNONYM
     // query - confirm against the original file.
347  Xapian::Query("sky"),
348  Xapian::Query("date"));
349 
350  Xapian::Database db(get_database("etext"));
351  Xapian::Enquire enquire(db);
352  enquire.set_query(query);
     // Request up to db.get_doccount() results, i.e. as many as there are
     // documents in the database.
353  Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
354 
355  tout << query << '\n';
356  tout << mset_orig << '\n';
357 
358  // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
359  // (this has a special codepath to avoid doing the synonym calculation).
     // NOTE(review): the listing numbering jumps from 359 to 361; presumably
     // the missing line rewraps `query` with OP_SCALE_WEIGHT and a factor of
     // 0.0, as the comment above describes - confirm.
361  enquire.set_query(query);
362  Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
363 
364  tout << query << '\n';
365  tout << mset_zero << '\n';
366 
367  // Check that the queries return some results.
368  TEST_NOT_EQUAL(mset_zero.size(), 0);
369  // Check that the queries return the same document IDs, and the zero
370  // one has zero weight.
371  check_msets_contain_same_docs(mset_orig, mset_zero);
     // Indexing mset_zero up to mset_orig.size() relies on the size equality
     // already checked by check_msets_contain_same_docs().
372  for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
373  TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
374  TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
375  }
376 }
377 
378 // Test synonym searches combined with various operators.
//
// For each operator in `operators`, combines syn_query and or_query in turn
// with the term "date" and checks that both combinations match the same set
// of documents in the "etext" database.
379 DEFINE_TESTCASE(synonym4, backend) {
380  Xapian::Database db(get_database("etext"));
381  Xapian::Enquire enquire(db);
     // NOTE(review): listing lines 382 and 385 are not visible here; only the
     // two term arguments of each construction are. `syn_query` and
     // `or_query` are used below, so presumably these lines declare them
     // (with OP_SYNONYM and OP_OR respectively) - confirm against the
     // original file.
383  Xapian::Query("gutenberg"),
384  Xapian::Query("blockhead"));
386  Xapian::Query("gutenberg"),
387  Xapian::Query("blockhead"));
388  Xapian::Query date_query = Xapian::Query("date");
389 
390  // Check some queries.
     // NOTE(review): the initialisers of this array (listing lines 392-397)
     // are not visible in this view.
391  static const Xapian::Query::op operators[] = {
398  };
399  for (auto op : operators) {
     // Reset the debug stream so it only holds output for this operator.
400  tout.str(string());
401  Xapian::Query query1(op, syn_query, date_query);
402  Xapian::Query query2(op, or_query, date_query);
403 
404  enquire.set_query(query1);
405  tout << "query1:" << query1 << '\n';
406  Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
407  tout << "mset1:" << mset1 << '\n';
408  enquire.set_query(query2);
409  tout << "query2:" << query2 << '\n';
410  Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
411  tout << "mset2:" << mset2 << '\n';
412 
413  TEST_NOT_EQUAL(mset1.size(), 0);
     // The top hit is expected to score 100% for every operator except
     // OP_XOR, where it must not. Presumably this is because a document
     // matching both sides is excluded under XOR - confirm.
414  if (op != Xapian::Query::OP_XOR) {
415  TEST_EQUAL(mset1[0].get_percent(), 100);
416  } else {
417  TEST(mset1[0].get_percent() != 100);
418  }
     // The synonym-based and OR-based queries must match the same docids.
419  check_msets_contain_same_docs(mset1, mset2);
420  }
421 }
422 
// Check OP_MAX: run "king" and "friedrich" separately and combined with OP_MAX
// against the "etext" database, then verify that each document in the OP_MAX
// result has the larger of the weights it received in the separate runs, and
// that no document from the separate runs is missing from the OP_MAX result.
423 DEFINE_TESTCASE(opmax1, backend) {
424  Xapian::Database db(get_database("etext"));
425  Xapian::Enquire enq(db);
426  Xapian::Query q1("king");
427  Xapian::Query q2("friedrich");
428  Xapian::Query qmax(Xapian::Query::OP_MAX, q1, q2);
     // Each search requests up to db.get_doccount() results.
429  enq.set_query(q1);
430  Xapian::MSet mset1 = enq.get_mset(0, db.get_doccount());
431  enq.set_query(q2);
432  Xapian::MSet mset2 = enq.get_mset(0, db.get_doccount());
433  enq.set_query(qmax);
434  Xapian::MSet msetmax = enq.get_mset(0, db.get_doccount());
435 
436  // Check that the weights in msetmax are the maximum of the weights in
437  // mset1 and mset2 for each docid.
438  map<Xapian::docid, double> expected_weights;
     // NOTE(review): the listing numbering jumps from 438 to 440; the
     // iterator `i` used by the loops below has no visible declaration, so
     // presumably the missing line declares it - confirm against the
     // original file.
     // Seed the expectations with the weights from the q1-only search.
440  for (i = mset1.begin(); i != mset1.end(); ++i) {
441  expected_weights[*i] = i.get_weight();
442  }
     // Merge in the q2-only weights, keeping the larger value when a docid
     // appears in both result sets.
443  for (i = mset2.begin(); i != mset2.end(); ++i) {
444  map<Xapian::docid, double>::iterator j;
445  j = expected_weights.find(*i);
446  if (j != expected_weights.end()) {
447  j->second = max(j->second, i.get_weight());
448  } else {
449  expected_weights[*i] = i.get_weight();
450  }
451  }
452 
     // Every OP_MAX result must have an expectation with a (nearly) equal
     // weight. Matched entries are erased so that leftovers can be detected
     // after the loop.
453  for (i = msetmax.begin(); i != msetmax.end(); ++i) {
454  map<Xapian::docid, double>::iterator j;
455  j = expected_weights.find(*i);
456  TEST(j != expected_weights.end());
457  TEST_EQUAL_DOUBLE(j->second, i.get_weight());
458  expected_weights.erase(j);
     // Debug output: number of expectations still unmatched.
459  tout << expected_weights.size() << '\n';
460  }
461 
462  // Any document in mset1 or mset2 should also be in msetmax.
463  TEST_EQUAL(expected_weights.size(), 0);
464 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
#define NOQ
DEFINE_TESTCASE(synonym1, backend)
static const synonym1_data_type synonym1_data[]
static void check_msets_contain_same_docs(const Xapian::MSet &mset1, const Xapian::MSet &mset2)
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:47
test functionality of the Xapian API
Base class for backend handling in test harness.
An indexed database of documents.
Definition: database.h:75
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: database.cc:233
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: database.cc:308
Querying session.
Definition: enquire.h:57
MSet get_mset(doccount first, doccount maxitems, doccount checkatleast=0, const RSet *rset=NULL, const MatchDecider *mdecider=NULL) const
Run the query.
Definition: enquire.cc:200
void set_query(const Query &query, termcount query_length=0)
Set the query.
Definition: enquire.cc:72
Iterator over a Xapian::MSet.
Definition: mset.h:535
double get_weight() const
Get the weight for the current position.
Definition: msetiterator.cc:55
Class representing a list of search results.
Definition: mset.h:46
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: mset.cc:374
double get_max_possible() const
The maximum possible weight any document could achieve.
Definition: mset.cc:368
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:786
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:791
Class representing a query.
Definition: query.h:45
op
Query operators.
Definition: query.h:78
@ OP_SCALE_WEIGHT
Scale the weight contributed by a subquery.
Definition: query.h:166
@ OP_MAX
Pick the maximum weight of any subquery.
Definition: query.h:249
@ OP_XOR
Match documents which an odd number of subqueries match.
Definition: query.h:107
@ OP_AND_MAYBE
Match the first subquery taking extra weight from other subqueries.
Definition: query.h:118
@ OP_AND
Match only documents which all subqueries match.
Definition: query.h:84
@ OP_OR
Match documents which at least one subquery matches.
Definition: query.h:92
@ OP_PHRASE
Match only documents where all subqueries match near and in order.
Definition: query.h:152
@ OP_SYNONYM
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
@ OP_AND_NOT
Match documents which the first subquery matches but no others do.
Definition: query.h:99
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define TEST_REL(A, REL, B)
Test a relation holds,e.g. TEST_REL(a,>,b);.
Definition: testmacros.h:35
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
a generic test suite engine
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:276
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:293
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:273
#define TEST_NOT_EQUAL(a, b)
Test for non-equality of two things.
Definition: testsuite.h:303
bool mset_range_is_same(const Xapian::MSet &mset1, unsigned int first1, const Xapian::MSet &mset2, unsigned int first2, unsigned int count)
Definition: testutils.cc:45
Xapian-specific test helper functions and macros.
Public interfaces for the Xapian library.