xapian-core  1.4.20
api_opsynonym.cc
Go to the documentation of this file.
1 
4 /* Copyright 2009,2011,2014,2022 Olly Betts
5  * Copyright 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20  * USA
21  */
22 
23 #include <config.h>
24 
25 #include "api_opsynonym.h"
26 
27 #include <map>
28 #include <set>
29 #include <vector>
30 
31 #include <xapian.h>
32 
33 #include "backendmanager.h"
34 #include "testsuite.h"
35 #include "testutils.h"
36 
37 #include "apitest.h"
38 
39 using namespace std;
40 
41 // #######################################################################
42 // # Tests start here
43 
45  // How many results should have the same weight when combined with
46  // OP_SYNONYM instead of OP_OR.
48  // How many results should have a different weight when combined with
49  // OP_SYNONYM instead of OP_OR.
51  // How many subqueries.
52  unsigned n_subqs;
53  // The subqueries (use NOQ for unused ones).
54  Xapian::Query subqs[4];
55 };
56 
57 #define NOQ Xapian::Query::MatchNothing
59  {
60  // Single term - all 33 results should be same weight.
61  33, 0, 1,
62  { Xapian::Query("date"), NOQ, NOQ, NOQ }
63  },
64  {
65  // Two terms, which co-occur in some documents.
66  //
67  // All 34 results should be different.
68  0, 34, 2,
69  { Xapian::Query("sky"), Xapian::Query("date"), NOQ, NOQ }
70  },
71  {
72  // Two terms which are entirely disjoint, and where the maximum weight
73  // doesn't occur in the first or second match.
74  //
75  // All 18 results should be different.
76  0, 18, 2,
77  { Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ, NOQ }
78  },
79  {
80  // All 34 results should be different.
81  0, 34, 2,
82  {
83  Xapian::Query("date"),
85  Xapian::Query("sky"),
86  Xapian::Query("glove")),
87  NOQ, NOQ
88  }
89  },
90  {
91  // All 34 results should be different.
92  0, 34, 2,
93  {
94  Xapian::Query("date"),
96  Xapian::Query("sky"),
97  Xapian::Query("date")),
98  NOQ, NOQ
99  }
100  },
101  {
102  // All 34 results should be different.
103  0, 34, 2,
104  {
105  Xapian::Query("date"),
107  Xapian::Query("sky"),
108  Xapian::Query("date")),
109  NOQ, NOQ
110  }
111  },
112  {
113  // All 34 results should be different.
114  0, 34, 2,
115  {
116  Xapian::Query("date"),
118  Xapian::Query("sky"),
119  Xapian::Query("date")),
120  NOQ, NOQ
121  }
122  },
123  {
124  // The AND only matches 1 document, so the estimated termfreq for the
125  // whole synonym works out as 33 (due to rounding), which is the same
126  // as the termfreq for "date". Therefore most of the weights are the
127  // same as just for the pure "date" search, and the only document which
128  // gets a different weight is the one also matched by "sky" (because it
129  // has a wdf boost).
130  32, 1, 2,
131  {
132  Xapian::Query("date"),
134  Xapian::Query("sky"),
135  Xapian::Query("date")),
136  NOQ, NOQ
137  }
138  },
139  {
140  // All 34 results should be different.
141  0, 34, 2,
142  {
143  Xapian::Query("date"),
145  Xapian::Query("sky"),
146  Xapian::Query("date")),
147  NOQ, NOQ
148  }
149  },
150  {
151  // When the top-level operator is OR, the synonym part has an estimated
152  // termfreq of 35. When the top-level operator is SYNONYM, the whole
153  // query has an estimated termfreq of 66, which is rather bogus, but
154  // that's the current situation here (1.2 did better as it flattened
155  // this into a single OP_SYNONYM operator and then merged the two
156  // "date" terms to one with wqf=2). We've decided we shouldn't do such
157  // merging from 1.3.x on (merging to sum the scale_factors is fine, but
158  // we don't do that yet - FIXME).
159  //
160  // Anyway, this means that currently the weights are different for all
161  // matches.
162  0, 34, 2,
163  {
164  Xapian::Query("date"),
166  Xapian::Query("sky"),
167  Xapian::Query("date")),
168  NOQ, NOQ
169  }
170  },
171  {
172  // All 34 results should be different. MAX under SYNONYM should just
173  // be treated as OR.
174  0, 34, 2,
175  {
176  Xapian::Query("date"),
178  Xapian::Query("sky"),
179  Xapian::Query("date")),
180  NOQ, NOQ
181  }
182  },
183  {
184  // All 35 results should be different.
185  0, 35, 4,
186  {
187  Xapian::Query("sky"),
188  Xapian::Query("date"),
189  Xapian::Query("stein"),
190  Xapian::Query("ally")
191  }
192  },
193  {
194  // The estimated term frequency for the synonym is 2 (because the
195  // estimate for the phrase is 0), which is the same as the term
196  // frequency of "attitud". Thus, the synonym gets the same weight as
197  // "attitud", so documents with only "attitud" (but not the phrase) in
198  // them get the same wdf, and have the same total weight. There turns
199  // out to be exactly one such document.
200  1, 3, 2,
201  {
202  Xapian::Query("attitud"),
204  Xapian::Query("german"),
205  Xapian::Query("adventur")),
206  NOQ, NOQ
207  }
208  },
209  {
210  // All 54 results should be different.
211  0, 54, 2,
212  {
213  Xapian::Query("attitud"),
215  Xapian::Query("german"),
217  Xapian::Query("sky"),
218  Xapian::Query("date"))),
219  NOQ, NOQ
220  }
221  }
222 };
223 
224 // Check a synonym search
225 DEFINE_TESTCASE(synonym1, backend) {
226  Xapian::Database db(get_database("etext"));
227 
229 
230  const Xapian::doccount lots = 214;
231 
232  for (size_t subqgroup = 0;
233  subqgroup != sizeof(synonym1_data) / sizeof(synonym1_data[0]);
234  ++subqgroup) {
235  const synonym1_data_type & data = synonym1_data[subqgroup];
236  const Xapian::Query * qlist = data.subqs;
237  const Xapian::Query * qlist_end = qlist + data.n_subqs;
238 
239  // Run two queries, one joining the subqueries with OR and one joining
240  // them with SYNONYM.
241  Xapian::Enquire enquire(db);
242 
243  // Do the search with OP_OR, getting all the results.
244  Xapian::Query orquery(Xapian::Query::OP_OR, qlist, qlist_end);
245  enquire.set_query(orquery);
246  Xapian::MSet ormset = enquire.get_mset(0, lots);
247 
248  // Do the search with OP_SYNONYM, getting all the results.
249  Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist, qlist_end);
250  enquire.set_query(synquery);
251  Xapian::MSet synmset = enquire.get_mset(0, lots);
252 
253  tout << "Comparing " << orquery << " with " << synquery << '\n';
254 
255  // Check that the queries return some results.
256  TEST_NOT_EQUAL(synmset.size(), 0);
257  // Check that the queries return the same number of results.
258  TEST_EQUAL(synmset.size(), ormset.size());
259  map<Xapian::docid, double> values_or;
260  map<Xapian::docid, double> values_synonym;
261  for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
262  values_or[*ormset[i]] = ormset[i].get_weight();
263  values_synonym[*synmset[i]] = synmset[i].get_weight();
264  }
265  TEST_EQUAL(values_or.size(), values_synonym.size());
266 
267  /* Check that the most of the weights for items in the "or" mset are
268  * different from those in the "synonym" mset. */
269  int same_weight = 0;
270  int different_weight = 0;
271  for (map<Xapian::docid, double>::const_iterator
272  j = values_or.begin(); j != values_or.end(); ++j) {
273  Xapian::docid did = j->first;
274  // Check that all the results in the or tree make it to the synonym
275  // tree.
276  TEST(values_synonym.find(did) != values_synonym.end());
277  if (values_or[did] == values_synonym[did]) {
278  ++same_weight;
279  } else {
280  ++different_weight;
281  }
282  }
283 
284  TEST_EQUAL(different_weight, data.diffweight_count);
285  TEST_EQUAL(same_weight, data.sameweight_count);
286 
287  // Do the search with synonym, but just get the top result.
288  // (Regression test - the OR subquery in the synonym postlist tree used
289  // to shortcut incorrectly, and return the wrong result here).
290  Xapian::MSet mset_top = enquire.get_mset(0, 1);
291  TEST_EQUAL(mset_top.size(), 1);
292  TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
293  }
294 }
295 
296 // Regression test - test a synonym search with a MultiAndPostlist.
297 DEFINE_TESTCASE(synonym2, backend) {
299  vector<Xapian::Query> subqueries;
300  subqueries.push_back(Xapian::Query("file"));
301  subqueries.push_back(Xapian::Query("the"));
302  subqueries.push_back(Xapian::Query("next"));
303  subqueries.push_back(Xapian::Query("reader"));
304  query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
305  subqueries.clear();
306  subqueries.push_back(query);
307  subqueries.push_back(Xapian::Query("gutenberg"));
308  query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
309 
310  tout << query << '\n';
311 
312  Xapian::Database db(get_database("etext"));
313  Xapian::Enquire enquire(db);
314  enquire.set_query(query);
315  Xapian::MSet mset = enquire.get_mset(0, 10);
316  tout << mset << '\n';
317 
318  // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
319  double maxposs = mset.get_max_possible();
320  query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
321  enquire.set_query(query);
322  mset = enquire.get_mset(0, 10);
323  double maxposs2 = mset.get_max_possible();
324 
325  TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
326 }
327 
328 static void
330  const Xapian::MSet & mset2)
331 {
332  TEST_EQUAL(mset1.size(), mset2.size());
333 
334  set<Xapian::docid> docids;
335  for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
336  docids.insert(*mset1[i]);
337  }
338 
339  // Check that all the results in mset1 are in mset2.
340  for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
341  // Check that we can erase each entry from mset2 element. Since mset1
342  // and mset2 are the same size this means we can be sure that there
343  // were no repeated docids in either (it would be a bug if there were).
344  TEST(docids.erase(*mset2[j]));
345  }
346 }
347 
348 // Test a synonym search which has had its weight scaled to 0.
349 DEFINE_TESTCASE(synonym3, backend) {
351  Xapian::Query("sky"),
352  Xapian::Query("date"));
353 
354  Xapian::Database db(get_database("etext"));
355  Xapian::Enquire enquire(db);
356  enquire.set_query(query);
357  Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
358 
359  tout << query << '\n';
360  tout << mset_orig << '\n';
361 
362  // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
363  // (this has a special codepath to avoid doing the synonym calculation).
364  query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
365  enquire.set_query(query);
366  Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
367 
368  tout << query << '\n';
369  tout << mset_zero << '\n';
370 
371  // Check that the queries return some results.
372  TEST_NOT_EQUAL(mset_zero.size(), 0);
373  // Check that the queries return the same document IDs, and the zero
374  // one has zero weight.
375  check_msets_contain_same_docs(mset_orig, mset_zero);
376  for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
377  TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
378  TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
379  }
380 }
381 
382 // Test synonym searches combined with various operators.
383 DEFINE_TESTCASE(synonym4, backend) {
384  Xapian::Database db(get_database("etext"));
385  Xapian::Enquire enquire(db);
387  Xapian::Query("gutenberg"),
388  Xapian::Query("blockhead"));
390  Xapian::Query("gutenberg"),
391  Xapian::Query("blockhead"));
392  Xapian::Query date_query = Xapian::Query("date");
393 
394  // Check some queries.
395  static const Xapian::Query::op operators[] = {
402  };
403  const Xapian::Query::op * end;
404  end = operators + sizeof(operators) / sizeof(operators[0]);
405  for (const Xapian::Query::op * i = operators; i != end; ++i) {
406  tout.str(string());
407  Xapian::Query query1(*i, syn_query, date_query);
408  Xapian::Query query2(*i, or_query, date_query);
409 
410  enquire.set_query(query1);
411  tout << "query1:" << query1 << '\n';
412  Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
413  tout << "mset1:" << mset1 << '\n';
414  enquire.set_query(query2);
415  tout << "query2:" << query2 << '\n';
416  Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
417  tout << "mset2:" << mset2 << '\n';
418 
419  TEST_NOT_EQUAL(mset1.size(), 0);
420  if (*i != Xapian::Query::OP_XOR) {
421  TEST_EQUAL(mset1[0].get_percent(), 100);
422  } else {
423  TEST(mset1[0].get_percent() != 100);
424  }
425  check_msets_contain_same_docs(mset1, mset2);
426  }
427 }
428 
429 DEFINE_TESTCASE(opmax1, backend) {
430  Xapian::Database db(get_database("etext"));
431  Xapian::Enquire enq(db);
432  Xapian::Query q1("king");
433  Xapian::Query q2("friedrich");
434  Xapian::Query qmax(Xapian::Query::OP_MAX, q1, q2);
435  enq.set_query(q1);
436  Xapian::MSet mset1 = enq.get_mset(0, db.get_doccount());
437  enq.set_query(q2);
438  Xapian::MSet mset2 = enq.get_mset(0, db.get_doccount());
439  enq.set_query(qmax);
440  Xapian::MSet msetmax = enq.get_mset(0, db.get_doccount());
441 
442  // Check that the weights in msetmax are the maximum of the weights in
443  // mset1 and mset2 for each docid.
444  map<Xapian::docid, double> expected_weights;
446  for (i = mset1.begin(); i != mset1.end(); ++i) {
447  expected_weights[*i] = i.get_weight();
448  }
449  for (i = mset2.begin(); i != mset2.end(); ++i) {
450  map<Xapian::docid, double>::iterator j;
451  j = expected_weights.find(*i);
452  if (j != expected_weights.end()) {
453  j->second = max(j->second, i.get_weight());
454  } else {
455  expected_weights[*i] = i.get_weight();
456  }
457  }
458 
459  for (i = msetmax.begin(); i != msetmax.end(); ++i) {
460  map<Xapian::docid, double>::iterator j;
461  j = expected_weights.find(*i);
462  TEST(j != expected_weights.end());
463  TEST_EQUAL_DOUBLE(j->second, i.get_weight());
464  expected_weights.erase(j);
465  tout << expected_weights.size() << endl;
466  }
467 
468  // Any document in mset1 or mset2 should also be in msetmax.
469  TEST_EQUAL(expected_weights.size(), 0);
470 }
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: omenquire.cc:318
double get_max_possible() const
The maximum possible weight any document could achieve.
Definition: omenquire.cc:290
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
This class is used to access a database, or a group of databases.
Definition: database.h:68
Match documents which an odd number of subqueries match.
Definition: query.h:107
bool mset_range_is_same(const Xapian::MSet &mset1, unsigned int first1, const Xapian::MSet &mset2, unsigned int first2, unsigned int count)
Definition: testutils.cc:46
op
Query operators.
Definition: query.h:78
a generic test suite engine
Class representing a list of search results.
Definition: mset.h:44
STL namespace.
Pick the maximum weight of any subquery.
Definition: query.h:249
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:932
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
test functionality of the Xapian API
#define TEST_REL(A, REL, B)
Test a relation holds,e.g. TEST_REL(a,>,b);.
Definition: testmacros.h:32
#define TEST_NOT_EQUAL(a, b)
Test for non-equality of two things.
Definition: testsuite.h:305
Xapian::Query subqs[4]
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: omdatabase.cc:421
Base class for backend handling in test harness.
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:103
Iterator over a Xapian::MSet.
Definition: mset.h:351
Scale the weight contributed by a subquery.
Definition: query.h:166
Match only documents where all subqueries match near and in order.
Definition: query.h:152
Match the first subquery taking extra weight from other subqueries.
Definition: query.h:118
Public interfaces for the Xapian library.
#define NOQ
DEFINE_TESTCASE(synonym1, backend)
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:607
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:612
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:295
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
static const synonym1_data_type synonym1_data[]
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
static void check_msets_contain_same_docs(const Xapian::MSet &mset1, const Xapian::MSet &mset2)
double get_weight() const
Get the weight for the current position.
Definition: omenquire.cc:460
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:48
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Match documents which the first subquery matches but no others do.
Definition: query.h:99
Match documents which at least one subquery matches.
Definition: query.h:92
Xapian-specific test helper functions and macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Class representing a query.
Definition: query.h:46
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278