xapian-core  1.4.27
api_percentages.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2008,2009 Lemur Consulting Ltd
5  * Copyright (C) 2008,2009,2010,2011,2012,2014 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "api_percentages.h"
25 
26 #include <xapian.h>
27 
28 #include "apitest.h"
29 #include "str.h"
30 #include "testutils.h"
31 
32 #include <cfloat>
33 
34 using namespace std;
35 
36 // Test that percentages reported are the same regardless of which part of the
37 // mset is returned, for sort-by-value search. Regression test for bug#216 in
38 // 1.0.10 and earlier with returned percentages.
39 DEFINE_TESTCASE(consistency3, backend) {
40  Xapian::Database db(get_database("apitest_sortconsist"));
41  Xapian::Enquire enquire(db);
42  enquire.set_query(Xapian::Query("foo"));
43  enquire.set_sort_by_value(1, 0);
44  Xapian::doccount lots = 3;
45  Xapian::MSet bigmset = enquire.get_mset(0, lots);
46  TEST_EQUAL(bigmset.size(), lots);
47  for (Xapian::doccount start = 0; start < lots; ++start) {
48  tout << *bigmset[start] << ":" << bigmset[start].get_weight() << ":"
49  << bigmset[start].get_percent() << "%\n";
50  for (Xapian::doccount size = 0; size < lots - start; ++size) {
51  Xapian::MSet mset = enquire.get_mset(start, size);
52  if (mset.size()) {
53  TEST_EQUAL(start + mset.size(),
54  min(start + size, bigmset.size()));
55  } else if (size) {
56  TEST(start >= bigmset.size());
57  }
58  for (Xapian::doccount i = 0; i < mset.size(); ++i) {
59  TEST_EQUAL(*mset[i], *bigmset[start + i]);
60  TEST_EQUAL_DOUBLE(mset[i].get_weight(),
61  bigmset[start + i].get_weight());
62  TEST_EQUAL_DOUBLE(mset[i].get_percent(),
63  bigmset[start + i].get_percent());
64  }
65  }
66  }
67 }
68 
70  vector<pair<Xapian::docid, double>> weights;
71  vector<pair<Xapian::docid, double>>::const_iterator i;
72  bool started;
73 
74  MyPostingSource(const vector<pair<Xapian::docid, double>>& weights_,
75  double max_wt)
76  : weights(weights_), started(false)
77  {
78  set_maxweight(max_wt);
79  }
80 
81  public:
82  MyPostingSource() : started(false) { }
83 
84  PostingSource* clone() const override {
85  return new MyPostingSource(weights, get_maxweight());
86  }
87 
88  void append_docweight(Xapian::docid did, double wt) {
89  weights.push_back(make_pair(did, wt));
90  if (wt > get_maxweight()) set_maxweight(wt);
91  }
92 
93  void init(const Xapian::Database&) override { started = false; }
94 
95  double get_weight() const override { return i->second; }
96 
98  return weights.size();
99  }
101  return weights.size();
102  }
104  return weights.size();
105  }
106 
107  void next(double /*wt*/) override {
108  if (!started) {
109  i = weights.begin();
110  started = true;
111  } else {
112  ++i;
113  }
114  }
115 
116  bool at_end() const override {
117  return (i == weights.end());
118  }
119 
120  Xapian::docid get_docid() const override { return i->first; }
121 
122  string get_description() const override {
123  return "MyPostingSource";
124  }
125 };
126 
128 DEFINE_TESTCASE(pctcutoff4, backend && !remote && !multi) {
129  // Find the number of DBL_EPSILONs to subtract which result in the
130  // percentage of the second hit being 49% instead of 50%.
131  int epsilons = 0;
132  Xapian::Database db(get_database("apitest_simpledata"));
133  Xapian::Enquire enquire(db);
134  while (true) {
135  MyPostingSource source;
136  source.append_docweight(1, 100);
137  source.append_docweight(2, 50 - epsilons * DBL_EPSILON);
138  enquire.set_query(Xapian::Query(&source));
139  Xapian::MSet mset = enquire.get_mset(0, 10);
140  TEST_EQUAL(mset.size(), 2);
141  if (mset[1].get_percent() != 50) break;
142  ++epsilons;
143  }
144 
145  // Make a set of document weights including ones on either side of the
146  // 49% / 50% boundary.
147  MyPostingSource source;
148  source.append_docweight(1, 100);
149  source.append_docweight(2, 50);
150  source.append_docweight(3, 50 - (epsilons - 1) * DBL_EPSILON);
151  source.append_docweight(4, 50 - epsilons * DBL_EPSILON);
152  source.append_docweight(5, 25);
153 
154  enquire.set_query(Xapian::Query(&source));
155  Xapian::MSet mset1 = enquire.get_mset(0, 10);
156  TEST_EQUAL(mset1.size(), 5);
157  TEST_EQUAL(mset1[2].get_percent(), 50);
158  TEST_EQUAL(mset1[3].get_percent(), 49);
159 
160  // Use various different percentage cutoffs, and check that the values
161  // returned are as expected.
162  int percent = 100;
163  for (Xapian::MSetIterator i = mset1.begin(); i != mset1.end(); ++i) {
164  int new_percent = mset1.convert_to_percent(i);
165  tout << "mset1 item = " << i.get_percent() << "%\n";
166  if (new_percent != percent) {
167  enquire.set_cutoff(percent);
168  Xapian::MSet mset2 = enquire.get_mset(0, 10);
169  tout << "cutoff = " << percent << "%, "
170  "mset size = " << mset2.size() << "\n";
171  TEST_EQUAL(mset2.size(), i.get_rank());
172  percent = new_percent;
173  }
174  }
175 }
176 
178 DEFINE_TESTCASE(pctcutoff5, backend) {
179  Xapian::Database db(get_database("apitest_simpledata"));
180  Xapian::Enquire enquire(db);
181  enquire.set_query(Xapian::Query("test"));
182  enquire.set_cutoff(42);
183  Xapian::MSet mset;
184 
185  enquire.set_sort_by_value(0, false);
186  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
187 
188  enquire.set_sort_by_value(0, true);
189  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
190 
191  enquire.set_sort_by_value_then_relevance(0, false);
192  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
193 
194  enquire.set_sort_by_value_then_relevance(0, true);
195  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
196 }
197 
198 // Regression test for bug fixed in 1.0.14.
199 DEFINE_TESTCASE(topercent3, backend) {
200  Xapian::Database db = get_database("apitest_simpledata");
201  Xapian::Enquire enquire(db);
202  enquire.set_sort_by_value(1, false);
203 
204  static const char * const terms[] = { "paragraph", "banana" };
205  enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, terms, terms + 2));
206 
207  Xapian::MSet mset = enquire.get_mset(0, 20);
208 
210  for (i = mset.begin(); i != mset.end(); ++i) {
211  // We should never achieve 100%.
212  TEST_REL(i.get_percent(),<,100);
213  }
214 }
215 
216 // Regression test for bug introduced temporarily by the "percent without
217 // termlist" patch.
// Runs a compound query built from the terms "paragraph", "queri" and
// "rubbish" against apitest_simpledata and checks that the top hit is
// reported as exactly 50%.
218 DEFINE_TESTCASE(topercent4, backend) {
219  Xapian::Enquire enquire(get_database("apitest_simpledata"));
220 
// NOTE(review): embedded line 221 is absent from this listing; presumably it
// opens the declaration of `query` and names the operator which combines the
// two subqueries below - confirm against the upstream file.
222  Xapian::Query("paragraph"),
223  Xapian::Query("queri"));
// NOTE(review): embedded line 224 is also absent; presumably it reassigns
// `query`, combining the previous value with "rubbish" under a second
// operator which is not visible here - confirm against the upstream file.
225  query, Xapian::Query("rubbish"));
226 
227  enquire.set_query(query);
228  Xapian::MSet mset = enquire.get_mset(0, 10);
229 
230  // We should get 50% not 33%.
// Guard against an empty MSet before indexing the top hit.
231  TEST(!mset.empty());
232  TEST_EQUAL(mset[0].get_percent(), 50);
233 }
234 
// Queries apitest_simpledata for "paragraph" combined with "xyzzy" and
// checks that the top hit's percentage is at least 50 but below 100, then
// repeats the same checks on a second get_mset() call.
236 DEFINE_TESTCASE(topercent5, backend) {
237  Xapian::Enquire enquire(get_database("apitest_simpledata"));
// NOTE(review): embedded line 238 is absent from this listing; presumably it
// opens the declaration of `q` and names the operator which combines the two
// subqueries below - confirm against the upstream file.
239  Xapian::Query("paragraph"), Xapian::Query("xyzzy"));
240  enquire.set_query(q);
241  Xapian::MSet mset = enquire.get_mset(0, 10);
// Guard against an empty MSet before indexing the top hit.
242  TEST(!mset.empty());
243  TEST(mset[0].get_percent() < 100);
244  // It would be odd if the non-existent term was worth more, but in 1.0.x
245  // the top hit got 4% in this testcase. In 1.2.x it gets 50%, which is
246  // better, but >50% would be more natural.
247  TEST_REL(mset[0].get_percent(), >=, 50);
248 
249  // Repeat tests with TradWeight.
// NOTE(review): embedded line 250 is absent from this listing; presumably it
// switches the weighting scheme as the comment above describes, otherwise
// the second get_mset() merely repeats the first - confirm against the
// upstream file.
251  mset = enquire.get_mset(0, 10);
252  TEST(!mset.empty());
253  TEST(mset[0].get_percent() < 100);
254  TEST_REL(mset[0].get_percent(), >=, 50);
255 }
256 
258 // Regression test for bug#590 fixed in 1.3.1 and 1.2.10.
// Checks that wrapping a query in OP_FILTER with the term "this" leaves the
// top hit's percentage unchanged.
259 DEFINE_TESTCASE(topercent6, backend) {
260  Xapian::Enquire enquire(get_database("apitest_simpledata"));
// NOTE(review): embedded line 261 is absent from this listing; presumably it
// opens the declaration of `q` and names the operator which combines
// "rubbish" and "letter" - confirm against the upstream file.
262  Xapian::Query("rubbish"), Xapian::Query("letter"));
263  enquire.set_query(q);
264  Xapian::MSet mset = enquire.get_mset(0, 10);
// Guard against an empty MSet before indexing the top hit.
265  TEST(!mset.empty());
266  TEST(mset[0].get_percent() < 100);
267 
// Rerun with the original query as the first (weighted) subquery of an
// OP_FILTER whose second subquery is the term "this".
268  q = Xapian::Query(q.OP_FILTER, q, Xapian::Query("this"));
269  enquire.set_query(q);
270  Xapian::MSet mset2 = enquire.get_mset(0, 10);
271  TEST(!mset2.empty());
// The top hit must score the same percentage with and without the filter.
272  TEST_EQUAL(mset[0].get_percent(), mset2[0].get_percent());
273 }
274 
275 static void
277 {
278  for (int i = 1; i <= 6; ++i) {
280  d.set_data(str(i));
281  d.add_term("boom", 2 + (i - 4)*(i - 2));
282  if (i != 5)
283  d.add_boolean_term("XCAT122");
284  db.add_document(d);
285  }
286  db.commit();
287 }
288 
290 // Regression test for bug introduced in 1.2.10 by the original fix for #590,
291 // and fixed in 1.2.13 (and in trunk before 1.3.1 was released).
// Runs ("tomb" OR "boom") AND "XCAT122" and checks that the top hit scores
// more than 60%.
292 DEFINE_TESTCASE(topercent7, backend) {
// NOTE(review): embedded line 293 is absent from this listing; presumably it
// declares the `db` passed to the Enquire constructor below - confirm
// against the upstream file.
294 
295  Xapian::Query q;
296  q = Xapian::Query(q.OP_OR, Xapian::Query("tomb"), Xapian::Query("boom"));
297  q = Xapian::Query(q.OP_AND, q, Xapian::Query("XCAT122"));
298 
299  Xapian::Enquire enq(db);
300  enq.set_query(q);
301  Xapian::MSet m = enq.get_mset(0, 10);
// Guard against an empty MSet before indexing the top hit.
302  TEST(!m.empty());
303  TEST_REL(m[0].get_percent(),>,60);
304 }
305 
306 class ZWeight : public Xapian::Weight {
307  public:
308  ZWeight() { }
309 
310  void init(double) override { }
311 
312  Weight* clone() const override {
313  return new ZWeight();
314  }
315 
318  Xapian::termcount) const override {
319  return 0.0;
320  }
321 
322  double get_maxpart() const override {
323  return 0.0;
324  }
325 
327  Xapian::termcount) const override {
328  return 1.0 / doclen;
329  }
330 
331  double get_maxextra() const override {
332  return 1.0;
333  }
334 };
335 
337 DEFINE_TESTCASE(checkzeromaxpartopt1, backend && !remote) {
338  Xapian::Database db = get_database("apitest_simpledata");
339  Xapian::Enquire enquire(db);
340  // "this" indexes all documents, so will get replaced with MatchAll
341  // internally.
342  static const char * const terms[] = { "this", "spoken", "blank" };
343  enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, terms, terms + 3));
344  ZWeight wt;
345  enquire.set_weighting_scheme(wt);
346  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
347  // No documents match all 3 terms, so the score shouldn't be 100%.
348  TEST(mset[0].get_percent() != 100);
349  // Make sure the percentage score isn't 0 or 1 though.
350  TEST_REL(mset[0].get_percent(), >, 1);
351 }
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: omenquire.cc:318
Xapian::docid add_document(const Xapian::Document &document)
Add a new document to the database.
Definition: omdatabase.cc:902
void set_sort_by_value_then_relevance(Xapian::valueno sort_key, bool reverse)
Set the sorting to be by value, then by relevance for documents with the same value.
Definition: omenquire.cc:884
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
double get_maxextra() const override
Return an upper bound on what get_sumextra() can return for any document.
int convert_to_percent(double weight) const
Convert a weight to a percentage.
Definition: omenquire.cc:198
This class is used to access a database, or a group of databases.
Definition: database.h:68
double get_weight() const override
Return the weight contribution for the current document.
void set_sort_by_value(Xapian::valueno sort_key, bool reverse)
Set the sorting to be by value only.
Definition: omenquire.cc:875
Xapian::doccount get_termfreq_min() const override
A lower bound on the number of documents this object can return.
Weight * clone() const override
Clone this object.
Match documents which an odd number of subqueries match.
Definition: query.h:107
void set_cutoff(int percent_cutoff, double weight_cutoff=0)
Set the percentage and/or weight cutoffs.
Definition: omenquire.cc:862
bool empty() const
Return true if this MSet object is empty.
Definition: mset.h:300
double get_sumpart(Xapian::termcount, Xapian::termcount, Xapian::termcount) const override
Calculate the weight contribution for this object's term to a document.
Class representing a list of search results.
Definition: mset.h:44
STL namespace.
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:938
Convert types to std::string.
#define false
Definition: header.h:9
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
Xapian::docid get_docid() const override
Return the current docid.
PostingSource * clone() const override
Clone the posting source.
static void make_topercent7_db(Xapian::WritableDatabase &db, const string &)
test functionality of the Xapian API
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const override
Calculate the term-independent weight component for a document.
void init(const Xapian::Database &) override
Set this PostingSource to the start of the list of postings.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
#define TEST_REL(A, REL, B)
Test a relation holds, e.g. TEST_REL(a,>,b);
Definition: testmacros.h:32
This class provides read/write access to a database.
Definition: database.h:789
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
Iterator over a Xapian::MSet.
Definition: mset.h:368
void init(double) override
Allow the subclass to perform any initialisation it needs to.
Public interfaces for the Xapian library.
Match like OP_AND but only taking weight from the first subquery.
Definition: query.h:128
#define TEST_EXCEPTION(TYPE, CODE)
Check that CODE throws exactly Xapian exception TYPE.
Definition: testutils.h:109
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
vector< pair< Xapian::docid, double > > weights
string str(int value)
Convert int to std::string.
Definition: str.cc:90
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:624
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:629
int percent
The percentage score for a document in an MSet.
Definition: types.h:66
void commit()
Commit any pending modifications made to the database.
Definition: omdatabase.cc:857
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:774
Base class which provides an "external" source of postings.
Definition: postingsource.h:47
void append_docweight(Xapian::docid did, double wt)
int get_percent() const
Convert the weight of the current iterator position to a percentage.
Definition: mset.h:531
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:295
Xapian::doccount get_termfreq_est() const override
An estimate of the number of documents this object can return.
Xapian::doccount get_termfreq_max() const override
An upper bound on the number of documents this object can return.
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
vector< pair< Xapian::docid, double > >::const_iterator i
bool at_end() const override
Return true if the current position is past the last entry in this list.
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
DEFINE_TESTCASE(consistency3, backend)
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:48
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Match documents which at least one subquery matches.
Definition: query.h:92
Xapian-specific test helper functions and macros.
void add_boolean_term(const std::string &term)
Add a boolean filter term to the document.
Definition: document.h:192
MyPostingSource(const vector< pair< Xapian::docid, double >> &weights_, double max_wt)
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Class representing a query.
Definition: query.h:46
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
void set_data(const std::string &data)
Set data stored in the document.
Definition: omdocument.cc:78
void next(double) override
Advance the current position to the next matching document.
A handle representing a document in a Xapian database.
Definition: document.h:61
string get_description() const override
Return a string describing this object.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
void add_term(const std::string &tname, Xapian::termcount wdfinc=1)
Add a term to the document, without positional information.
Definition: omdocument.cc:140
Abstract base class for weighting schemes.
Definition: weight.h:35