xapian-core  1.4.21
api_percentages.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2008,2009 Lemur Consulting Ltd
5  * Copyright (C) 2008,2009,2010,2011,2012,2014 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "api_percentages.h"
25 
26 #include <xapian.h>
27 
28 #include "apitest.h"
29 #include "str.h"
30 #include "testutils.h"
31 
32 #include <cfloat>
33 
34 using namespace std;
35 
36 // Test that percentages reported are the same regardless of which part of the
37 // mset is returned, for sort-by-value search. Regression test for bug#216 in
38 // 1.0.10 and earlier with returned percentages.
   //
   // Approach: fetch one reference MSet of `lots` hits, then re-run the search
   // for every (start, size) window with start + size < lots and check that the
   // docid, weight and percentage of each returned hit match the reference.
39 DEFINE_TESTCASE(consistency3, backend) {
40  Xapian::Database db(get_database("apitest_sortconsist"));
41  Xapian::Enquire enquire(db);
42  enquire.set_query(Xapian::Query("foo"));
    // Sort on value slot 1; the second argument (0) is the `reverse` flag.
43  enquire.set_sort_by_value(1, 0);
44  Xapian::doccount lots = 3;
    // Reference result set which every sub-window is compared against.
45  Xapian::MSet bigmset = enquire.get_mset(0, lots);
46  TEST_EQUAL(bigmset.size(), lots);
47  for (Xapian::doccount start = 0; start < lots; ++start) {
    // Log docid:weight:percent of the reference hit to the test's debug stream.
48  tout << *bigmset[start] << ":" << bigmset[start].get_weight() << ":"
49  << bigmset[start].get_percent() << "%" << endl;
50  for (Xapian::doccount size = 0; size < lots - start; ++size) {
51  Xapian::MSet mset = enquire.get_mset(start, size);
    // A non-empty window must end where the request ends or where the
    // reference MSet ends, whichever comes first.  An empty window is only
    // acceptable when size == 0 or start is at/after the end of the
    // reference MSet.
52  if (mset.size()) {
53  TEST_EQUAL(start + mset.size(),
54  min(start + size, bigmset.size()));
55  } else if (size) {
56  TEST(start >= bigmset.size());
57  }
    // Hit i of the window must be identical to hit start + i of the
    // reference MSet.
58  for (Xapian::doccount i = 0; i < mset.size(); ++i) {
59  TEST_EQUAL(*mset[i], *bigmset[start + i]);
60  TEST_EQUAL_DOUBLE(mset[i].get_weight(),
61  bigmset[start + i].get_weight());
62  TEST_EQUAL_DOUBLE(mset[i].get_percent(),
63  bigmset[start + i].get_percent());
64  }
65  }
66  }
67 }
68 
// NOTE(review): listing line 69 (the line that opens this class) is missing
// from this extract.  Presumably it reads
// `class MyPostingSource : public Xapian::PostingSource {` - confirm against
// the upstream file.
//
// Posting source which returns an explicit list of (docid, weight) pairs in
// the order they were appended, letting a test dictate exact document weights.
    // (docid, weight) pairs, in the order append_docweight() added them.
70  vector<pair<Xapian::docid, double>> weights;
    // Current position in `weights`.  Not initialised until the first next()
    // call, so get_weight(), get_docid() and at_end() are only meaningful
    // after next() has been called.
71  vector<pair<Xapian::docid, double>>::const_iterator i;
    // False until next() has been called since construction or init().
72  bool started;
73 
    // Copying constructor used by clone(): takes the weight list and the
    // maximum weight to report.  Declared before the `public:` label below.
74  MyPostingSource(const vector<pair<Xapian::docid, double>>& weights_,
75  double max_wt)
76  : weights(weights_), started(false)
77  {
78  set_maxweight(max_wt);
79  }
80 
81  public:
    // Create an empty source; populate it with append_docweight().
82  MyPostingSource() : started(false) { }
83 
    // Return a heap-allocated copy carrying the same weights and maximum
    // weight; the copy starts in the "not started" state.
84  PostingSource * clone() const
85  {
86  return new MyPostingSource(weights, get_maxweight());
87  }
88 
    // Append a (did, wt) entry, raising the reported maximum weight if wt
    // exceeds the current maximum.
89  void append_docweight(Xapian::docid did, double wt) {
90  weights.push_back(make_pair(did, wt));
91  if (wt > get_maxweight()) set_maxweight(wt);
92  }
93 
    // Reset so that the following next() call restarts from the first entry.
94  void init(const Xapian::Database &) { started = false; }
95 
    // Weight of the entry at the current position.
96  double get_weight() const { return i->second; }
97 
    // All three term frequency bounds are the exact number of entries.
98  Xapian::doccount get_termfreq_min() const { return weights.size(); }
99  Xapian::doccount get_termfreq_est() const { return weights.size(); }
100  Xapian::doccount get_termfreq_max() const { return weights.size(); }
101 
    // The first call after init()/construction moves to the first entry;
    // later calls advance by one.  The minimum-weight hint is ignored.
102  void next(double /*wt*/) {
103  if (!started) {
104  i = weights.begin();
105  started = true;
106  } else {
107  ++i;
108  }
109  }
110 
    // True once the position has moved past the last entry.
111  bool at_end() const {
112  return (i == weights.end());
113  }
114 
    // Document id of the entry at the current position.
115  Xapian::docid get_docid() const { return i->first; }
116 
117  string get_description() const {
118  return "MyPostingSource";
119  }
120 };
121 
// Checks percentage cutoffs for documents whose weights sit on either side of
// a percentage rounding boundary (50% versus 49% of the top weight).
// NOTE(review): listing line 122 (probably the original header comment) is
// missing from this extract.
123 DEFINE_TESTCASE(pctcutoff4, backend && !remote && !multi) {
124  // Find the number of DBL_EPSILONs to subtract which result in the
125  // percentage of the second hit being 49% instead of 50%.
126  int epsilons = 0;
127  Xapian::Database db(get_database("apitest_simpledata"));
128  Xapian::Enquire enquire(db);
    // Each iteration builds a fresh two-entry source (weights 100 and
    // 50 - epsilons * DBL_EPSILON) and stops at the first value of `epsilons`
    // for which the second hit's percentage is no longer 50.
129  while (true) {
130  MyPostingSource source;
131  source.append_docweight(1, 100);
132  source.append_docweight(2, 50 - epsilons * DBL_EPSILON);
133  enquire.set_query(Xapian::Query(&source));
134  Xapian::MSet mset = enquire.get_mset(0, 10);
135  TEST_EQUAL(mset.size(), 2);
136  if (mset[1].get_percent() != 50) break;
137  ++epsilons;
138  }
139 
140  // Make a set of document weights including ones on either side of the
141  // 49% / 50% boundary.
142  MyPostingSource source;
143  source.append_docweight(1, 100);
144  source.append_docweight(2, 50);
    // Document 3 uses the last offset which still gave 50%; document 4 uses
    // the first offset which did not.
145  source.append_docweight(3, 50 - (epsilons - 1) * DBL_EPSILON);
146  source.append_docweight(4, 50 - epsilons * DBL_EPSILON);
147  source.append_docweight(5, 25);
148 
149  enquire.set_query(Xapian::Query(&source));
150  Xapian::MSet mset1 = enquire.get_mset(0, 10);
151  TEST_EQUAL(mset1.size(), 5);
152  TEST_EQUAL(mset1[2].get_percent(), 50);
153  TEST_EQUAL(mset1[3].get_percent(), 49);
154 
155  // Use various different percentage cutoffs, and check that the values
156  // returned are as expected.
    // `percent` holds the percentage of the previous group of hits (100 to
    // begin with).  Whenever the current hit's percentage differs, re-run the
    // search with the previous percentage as the cutoff and require the
    // result size to equal the current hit's rank - presumably the number of
    // hits ranked ahead of it.
    // NOTE(review): the check only fires when the percentage changes, so the
    // percentage of the final group of hits is never used as a cutoff.
157  int percent = 100;
158  for (Xapian::MSetIterator i = mset1.begin(); i != mset1.end(); ++i) {
159  int new_percent = mset1.convert_to_percent(i);
160  tout << "mset1 item = " << i.get_percent() << "%\n";
161  if (new_percent != percent) {
162  enquire.set_cutoff(percent);
163  Xapian::MSet mset2 = enquire.get_mset(0, 10);
164  tout << "cutoff = " << percent << "%, "
165  "mset size = " << mset2.size() << "\n";
166  TEST_EQUAL(mset2.size(), i.get_rank());
167  percent = new_percent;
168  }
169  }
170 }
171 
// Checks that get_mset() throws Xapian::UnimplementedError when a percentage
// cutoff is combined with sorting by value, or by value then relevance, in
// both forward and reverse order.
173 DEFINE_TESTCASE(pctcutoff5, backend) {
174  Xapian::Database db(get_database("apitest_simpledata"));
175  Xapian::Enquire enquire(db);
176  enquire.set_query(Xapian::Query("test"));
    // Percentage cutoff of 42%; the weight cutoff keeps its default.
177  enquire.set_cutoff(42);
178  Xapian::MSet mset;
179 
    // Sort by value slot 0, not reversed.
180  enquire.set_sort_by_value(0, false);
181  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
182 
    // Sort by value slot 0, reversed.
183  enquire.set_sort_by_value(0, true);
184  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
185 
    // Sort by value slot 0 then relevance, not reversed.
186  enquire.set_sort_by_value_then_relevance(0, false);
187  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
188 
    // Sort by value slot 0 then relevance, reversed.
189  enquire.set_sort_by_value_then_relevance(0, true);
190  TEST_EXCEPTION(Xapian::UnimplementedError, mset = enquire.get_mset(0, 10));
191 }
192 
193 // Regression test for bug fixed in 1.0.14.
    //
    // Runs an OR query of "paragraph" and "banana" sorted by value slot 1 and
    // checks that every returned hit has a percentage below 100.
194 DEFINE_TESTCASE(topercent3, backend) {
195  Xapian::Database db = get_database("apitest_simpledata");
196  Xapian::Enquire enquire(db);
197  enquire.set_sort_by_value(1, false);
198 
199  static const char * const terms[] = { "paragraph", "banana" };
    // The query is built from the iterator range [terms, terms + 2).
200  enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, terms, terms + 2));
201 
202  Xapian::MSet mset = enquire.get_mset(0, 20);
203 
    // NOTE(review): listing line 204 is missing from this extract; presumably
    // it declares the iterator `i` used below (an MSetIterator) - confirm
    // against the upstream file.
205  for (i = mset.begin(); i != mset.end(); ++i) {
206  // We should never achieve 100%.
207  TEST_REL(i.get_percent(),<,100);
208  }
209 }
210 
211 // Regression test for bug introduced temporarily by the "percent without
212 // termlist" patch.
    //
    // Builds a query from the terms "paragraph", "queri" and "rubbish" and
    // checks that the top hit is reported as exactly 50%.
213 DEFINE_TESTCASE(topercent4, backend) {
214  Xapian::Enquire enquire(get_database("apitest_simpledata"));
215 
    // NOTE(review): listing lines 216 and 219 are missing from this extract,
    // so the declaration of `query` and the operators combining the
    // sub-queries are not visible here - confirm against the upstream file.
217  Xapian::Query("paragraph"),
218  Xapian::Query("queri"));
220  query, Xapian::Query("rubbish"));
221 
222  enquire.set_query(query);
223  Xapian::MSet mset = enquire.get_mset(0, 10);
224 
225  // We should get 50% not 33%.
    // Guard against an empty result before indexing the first hit.
226  TEST(!mset.empty());
227  TEST_EQUAL(mset[0].get_percent(), 50);
228 }
229 
// Checks the top hit's percentage for a query combining "paragraph" with the
// term "xyzzy": it must be below 100 and at least 50.
231 DEFINE_TESTCASE(topercent5, backend) {
232  Xapian::Enquire enquire(get_database("apitest_simpledata"));
    // NOTE(review): listing line 233 is missing from this extract; presumably
    // it declares `q` and names the operator joining the two sub-queries -
    // confirm against the upstream file.
234  Xapian::Query("paragraph"), Xapian::Query("xyzzy"));
235  enquire.set_query(q);
236  Xapian::MSet mset = enquire.get_mset(0, 10);
    // Guard against an empty result before indexing the first hit.
237  TEST(!mset.empty());
238  TEST(mset[0].get_percent() < 100);
239  // It would be odd if the non-existent term was worth more, but in 1.0.x
240  // the top hit got 4% in this testcase. In 1.2.x it gets 50%, which is
241  // better, but >50% would be more natural.
242  TEST_REL(mset[0].get_percent(), >=, 50);
243 }
244 
246 // Regression test for bug#590 fixed in 1.3.1 and 1.2.10.
    //
    // Checks that wrapping a query in OP_FILTER with the term "this" leaves the
    // top hit's percentage unchanged.
247 DEFINE_TESTCASE(topercent6, backend) {
248  Xapian::Enquire enquire(get_database("apitest_simpledata"));
    // NOTE(review): listing line 249 is missing from this extract; presumably
    // it declares `q` and names the operator joining "rubbish" and "letter" -
    // confirm against the upstream file.
250  Xapian::Query("rubbish"), Xapian::Query("letter"));
251  enquire.set_query(q);
252  Xapian::MSet mset = enquire.get_mset(0, 10);
253  TEST(!mset.empty());
254  TEST(mset[0].get_percent() < 100);
255 
    // Same query, additionally filtered by "this"; the filter term is the
    // second subquery of OP_FILTER.
256  q = Xapian::Query(q.OP_FILTER, q, Xapian::Query("this"));
257  enquire.set_query(q);
258  Xapian::MSet mset2 = enquire.get_mset(0, 10);
259  TEST(!mset2.empty());
    // The filtered and unfiltered top hits must report the same percentage.
260  TEST_EQUAL(mset[0].get_percent(), mset2[0].get_percent());
261 }
262 
// Populate `db` with six documents for the topercent7 test case and commit.
// NOTE(review): listing lines 264 (the function name and parameter list) and
// 267 (presumably the declaration of the document `d`) are missing from this
// extract - confirm against the upstream file.
263 static void
265 {
266  for (int i = 1; i <= 6; ++i) {
    // The document data is the decimal value of i.
268  d.set_data(str(i));
    // wdf of "boom" for i = 1..6 works out as 5, 2, 1, 2, 5, 10.
269  d.add_term("boom", 2 + (i - 4)*(i - 2));
    // Every document except the fifth carries the boolean term.
270  if (i != 5)
271  d.add_boolean_term("XCAT122");
272  db.add_document(d);
273  }
274  db.commit();
275 }
276 
278 // Regression test for bug introduced in 1.2.10 by the original fix for #590,
279 // and fixed in 1.2.13 (and in trunk before 1.3.1 was released).
    //
    // Runs ("tomb" OR "boom") AND "XCAT122" and checks that the top hit scores
    // more than 60%.
280 DEFINE_TESTCASE(topercent7, generated) {
    // NOTE(review): listing line 281 is missing from this extract; presumably
    // it obtains `db`, likely via make_topercent7_db - confirm against the
    // upstream file.
282 
283  Xapian::Query q;
284  q = Xapian::Query(q.OP_OR, Xapian::Query("tomb"), Xapian::Query("boom"));
285  q = Xapian::Query(q.OP_AND, q, Xapian::Query("XCAT122"));
286 
287  Xapian::Enquire enq(db);
288  enq.set_query(q);
289  Xapian::MSet m = enq.get_mset(0, 10);
    // Guard against an empty result before indexing the first hit.
290  TEST(!m.empty());
291  TEST_REL(m[0].get_percent(),>,60);
292 }
293 
// Weighting scheme whose two visible term-weight functions return 0.0 and
// whose get_sumextra() returns 1.0 / doclen, bounded above by 1.0.
294 class ZWeight : public Xapian::Weight {
295  public:
296  ZWeight() { }
297 
    // No initialisation is needed; the argument is ignored.
298  void init(double) { }
299 
    // Return a new heap-allocated ZWeight.
300  Weight * clone() const {
301  return new ZWeight();
302  }
303 
    // NOTE(review): listing lines 304-305 (the start of this function's
    // signature) are missing from this extract; presumably this is
    // get_sumpart() - confirm against the upstream file.
306  Xapian::termcount) const {
307  return 0.0;
308  }
309 
    // Upper bound reported for the term-dependent weight: always zero.
310  double get_maxpart() const {
311  return 0.0;
312  }
313 
    // NOTE(review): listing line 314 (the start of this function's signature)
    // is missing from this extract; presumably this is get_sumextra(), with
    // `doclen` as its first parameter - confirm against the upstream file.
315  Xapian::termcount) const {
316  return 1.0 / doclen;
317  }
318 
    // Upper bound reported for the term-independent weight component.
319  double get_maxextra() const {
320  return 1.0;
321  }
322 };
323 
// Checks the percentage reported for the top hit of a three-term OR query
// when weighting with ZWeight, whose get_maxpart() returns zero.
325 DEFINE_TESTCASE(checkzeromaxpartopt1, backend && !remote) {
326  Xapian::Database db = get_database("apitest_simpledata");
327  Xapian::Enquire enquire(db);
328  // "this" indexes all documents, so will get replaced with MatchAll
329  // internally.
330  static const char * const terms[] = { "this", "spoken", "blank" };
331  enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, terms, terms + 3));
332  ZWeight wt;
333  enquire.set_weighting_scheme(wt);
    // Ask for as many hits as there are documents in the database.
334  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
    // NOTE(review): mset[0] is read below without a preceding
    // TEST(!mset.empty()) guard, unlike the topercent4-topercent7 test cases.
335  // No documents match all 3 terms, so the score shouldn't be 100%.
336  TEST(mset[0].get_percent() != 100);
337  // Make sure the percentage score isn't 0 or 1 though.
338  TEST_REL(mset[0].get_percent(), >, 1);
339 }
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: omenquire.cc:318
Xapian::docid add_document(const Xapian::Document &document)
Add a new document to the database.
Definition: omdatabase.cc:902
void set_sort_by_value_then_relevance(Xapian::valueno sort_key, bool reverse)
Set the sorting to be by value, then by relevance for documents with the same value.
Definition: omenquire.cc:878
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
int convert_to_percent(double weight) const
Convert a weight to a percentage.
Definition: omenquire.cc:198
This class is used to access a database, or a group of databases.
Definition: database.h:68
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
void set_sort_by_value(Xapian::valueno sort_key, bool reverse)
Set the sorting to be by value only.
Definition: omenquire.cc:869
Match documents which an odd number of subqueries match.
Definition: query.h:107
void set_cutoff(int percent_cutoff, double weight_cutoff=0)
Set the percentage and/or weight cutoffs.
Definition: omenquire.cc:856
bool empty() const
Return true if this MSet object is empty.
Definition: mset.h:283
Class representing a list of search results.
Definition: mset.h:44
STL namespace.
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:932
Convert types to std::string.
void next(double)
Advance the current position to the next matching document.
#define false
Definition: header.h:9
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
Xapian::doccount get_termfreq_min() const
A lower bound on the number of documents this object can return.
static void make_topercent7_db(Xapian::WritableDatabase &db, const string &)
test functionality of the Xapian API
PostingSource * clone() const
Clone the posting source.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
#define TEST_REL(A, REL, B)
Test that a relation holds, e.g. TEST_REL(a,>,b).
Definition: testmacros.h:32
Weight * clone() const
Clone this object.
Xapian::doccount get_termfreq_est() const
An estimate of the number of documents this object can return.
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const
Calculate the term-independent weight component for a document.
This class provides read/write access to a database.
Definition: database.h:785
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:103
Iterator over a Xapian::MSet.
Definition: mset.h:351
Public interfaces for the Xapian library.
void init(const Xapian::Database &)
Set this PostingSource to the start of the list of postings.
Match like OP_AND but only taking weight from the first subquery.
Definition: query.h:128
#define TEST_EXCEPTION(TYPE, CODE)
Check that CODE throws exactly Xapian exception TYPE.
Definition: testutils.h:109
vector< pair< Xapian::docid, double > > weights
bool at_end() const
Return true if the current position is past the last entry in this list.
string str(int value)
Convert int to std::string.
Definition: str.cc:90
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:607
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:612
int percent
The percentage score for a document in an MSet.
Definition: types.h:66
void commit()
Commit any pending modifications made to the database.
Definition: omdatabase.cc:857
double get_sumpart(Xapian::termcount, Xapian::termcount, Xapian::termcount) const
Calculate the weight contribution for this object's term to a document.
Base class which provides an "external" source of postings.
Definition: postingsource.h:47
void append_docweight(Xapian::docid did, double wt)
int get_percent() const
Convert the weight of the current iterator position to a percentage.
Definition: mset.h:514
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:295
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
string get_description() const
Return a string describing this object.
vector< pair< Xapian::docid, double > >::const_iterator i
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
DEFINE_TESTCASE(consistency3, backend)
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:48
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::doccount get_termfreq_max() const
An upper bound on the number of documents this object can return.
Match documents which at least one subquery matches.
Definition: query.h:92
Xapian-specific test helper functions and macros.
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
void init(double)
Allow the subclass to perform any initialisation it needs to.
void add_boolean_term(const std::string &term)
Add a boolean filter term to the document.
Definition: document.h:191
MyPostingSource(const vector< pair< Xapian::docid, double >> &weights_, double max_wt)
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Class representing a query.
Definition: query.h:46
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
void set_data(const std::string &data)
Set data stored in the document.
Definition: omdocument.cc:78
double get_weight() const
Return the weight contribution for the current document.
A handle representing a document in a Xapian database.
Definition: document.h:61
Xapian::docid get_docid() const
Return the current docid.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325
void add_term(const std::string &tname, Xapian::termcount wdfinc=1)
Add a term to the document, without positional information.
Definition: omdocument.cc:140
Abstract base class for weighting schemes.
Definition: weight.h:35