xapian-core  2.0.0
api_weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2024 Olly Betts
5  * Copyright (C) 2013 Aarsh Shah
6  * Copyright (C) 2016 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, see
20  * <https://www.gnu.org/licenses/>.
21  */
22 
23 #include <config.h>
24 
25 #include "api_weight.h"
26 #include <cmath>
27 #include <memory>
28 
29 #define XAPIAN_DEPRECATED(D) D
30 #include <xapian.h>
31 
32 #include "apitest.h"
33 #include "heap.h"
34 #include "testutils.h"
35 
36 using namespace std;
37 
38 template<class W>
39 static inline void
40 test_weight_class_no_params(const char* classname, const char* name)
41 {
42  tout << classname << '\n';
43  W obj;
44  // Check name() returns the class name.
45  TEST_EQUAL(obj.name(), name);
46  // If there are no parameters, there's nothing to serialise.
47  string obj_serialised = obj.serialise();
48  TEST_EQUAL(obj_serialised.size(), 0);
49  // Check serialising and unserialising gives object with same serialisation.
50  unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
51  TEST_EQUAL(obj_serialised, wt->serialise());
52  // Check that unserialise() throws suitable error for bad serialisation.
53  // The easy case to test is extra junk after the serialised weight.
54  try {
55  unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
56  FAIL_TEST(classname << " did not throw for unserialise with junk "
57  "appended");
58  } catch (const Xapian::SerialisationError& e) {
59  // Check the exception message contains the weighting scheme name
60  // (regression test for TradWeight's exception saying "BM25").
61  string target = classname + CONST_STRLEN("Xapian::");
62  TEST(e.get_msg().find(target) != string::npos);
63  }
64 }
65 
// Run test_weight_class_no_params<W>() for class W.  W is also stringified
// and passed as the classname argument; N is the expected name() value.
66 #define TEST_WEIGHT_CLASS_NO_PARAMS(W, N) test_weight_class_no_params<W>(#W, N)
67 
68 template<class W>
69 static inline void
70 test_weight_class(const char* classname, const char* name,
71  const W& obj_default, const W& obj_other)
72 {
73  tout << classname << '\n';
74  W obj;
75  // Check name() returns the class name.
76  TEST_EQUAL(obj.name(), name);
77  TEST_EQUAL(obj_default.name(), name);
78  TEST_EQUAL(obj_other.name(), name);
79  // Check serialisation matches that of object constructed with explicit
80  // parameter values of what the defaults are meant to be.
81  string obj_serialised = obj.serialise();
82  TEST_EQUAL(obj_serialised, obj_default.serialise());
83  // Check serialisation is different to object with different parameters.
84  string obj_other_serialised = obj_other.serialise();
85  TEST_NOT_EQUAL(obj_serialised, obj_other_serialised);
86  // Check serialising and unserialising gives object with same serialisation.
87  unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
88  TEST_EQUAL(obj_serialised, wt->serialise());
89  // Check serialising and unserialising of object with different parameters.
90  unique_ptr<Xapian::Weight> wt2(W().unserialise(obj_other_serialised));
91  TEST_EQUAL(obj_other_serialised, wt2->serialise());
92  // Check that unserialise() throws suitable error for bad serialisation.
93  // The easy case to test is extra junk after the serialised weight.
94  try {
95  unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
96  FAIL_TEST(classname << " did not throw for unserialise with junk "
97  "appended");
98  } catch (const Xapian::SerialisationError& e) {
99  // Check the exception message contains the correct weighting scheme
100  // name (originally a regression test for TradWeight's exception saying
101  // "BM25", but not TradWeight is just a thin subclass of BM25Weight so
102  // it's expected it reports as BM25Weight now!)
103  string target = classname + CONST_STRLEN("Xapian::");
104  if (target == "TradWeight") target = "BM25Weight";
105  TEST(e.get_msg().find(target) != string::npos);
106  }
107 }
108 
109 // W should be the class name.
// (W is also stringified and passed as the classname argument.)
110 //
// N should be the string which name() is expected to return.
//
111 // DEFAULT should be a parenthesised parameter list to explicitly construct
112 // an object of class W with the documented default parameters.
113 //
114 // OTHER should be a parenthesised parameter list to construct an object with
115 // non-default parameters.
116 #define TEST_WEIGHT_CLASS(W, N, DEFAULT, OTHER) \
117  test_weight_class<W>(#W, N, W DEFAULT, W OTHER)
118 
// Runs the generic name/serialisation checks for the weighting schemes listed
// below (no backend needed).
//
// NOTE(review): this listing has gaps.  No calls are visible under the
// "Parameter-free" comment, and each bare pair of parenthesised parameter
// lists below is the tail of a TEST_WEIGHT_CLASS(...) call whose first line
// (class and expected name) is not visible here - confirm against the
// complete file before editing.
120 DEFINE_TESTCASE(weightserialisation1, !backend) {
121  // Parameter-free weighting schemes.
127 
128  // Parameterised weighting schemes.
// TradWeight is expected to report the name "bm25".
129  TEST_WEIGHT_CLASS(Xapian::TradWeight, "bm25", (1.0), (2.0));
131  (1, 0, 1, 0.5, 0.5),
132  (1, 0.5, 1, 0.5, 0.5));
134  (1, 0, 1, 0.5, 0.5, 1.0),
135  (1, 0, 1, 0.5, 0.5, 2.0));
136  TEST_WEIGHT_CLASS(Xapian::TfIdfWeight, "tfidf", ("ntn"), ("bpn"));
137  TEST_WEIGHT_CLASS(Xapian::InL2Weight, "inl2", (1.0), (2.0));
138  TEST_WEIGHT_CLASS(Xapian::IfB2Weight, "ifb2", (1.0), (2.0));
139  TEST_WEIGHT_CLASS(Xapian::IneB2Weight, "ineb2", (1.0), (2.0));
140  TEST_WEIGHT_CLASS(Xapian::BB2Weight, "bb2", (1.0), (2.0));
141  TEST_WEIGHT_CLASS(Xapian::PL2Weight, "pl2", (1.0), (2.0));
143  (1.0, 0.8),
144  (2.0, 0.9));
146  (0.7, 2000.0),
147  (0.5, 2000.0));
149  (0.7),
150  (0.75));
152  (2000.0, 0.05),
153  (2034.0, 0.0));
154  TEST_WEIGHT_CLASS(Xapian::LMJMWeight, "lmjm", (0.0), (0.5));
155 }
156 
// Runs a single-term query ("robinson") against the "etext" database under a
// given weighting scheme and checks the resulting weights, then repeats with
// the query scaled by 15 and checks how the weights change.
//
// NOTE(review): the TEST_WEIGHTING_SCHEME(...) invocations which drive the
// helper are not visible in this listing (only the macro definition, two
// comments and the #undef remain) - confirm against the complete file.
158 DEFINE_TESTCASE(weight1, backend) {
159  Xapian::Database db(get_database("etext"));
160  Xapian::Enquire enquire(db);
161  Xapian::Enquire enquire_scaled(db);
162  auto term = "robinson";
163  Xapian::Query q{term};
164  enquire.set_query(q);
165  enquire_scaled.set_query(q * 15.0);
// For a single-term query every document indexed by the term should match.
166  auto expected_matches = db.get_termfreq(term);
// helper(weight, name, params): name is the stringified class name and params
// the stringified constructor arguments (empty when none were given).
167  auto helper = [&](const Xapian::Weight& weight,
168  string_view name,
169  string_view params) {
170  tout << name << '(' << params << ")\n";
171  enquire.set_weighting_scheme(weight);
172  enquire_scaled.set_weighting_scheme(weight);
// Ask for one more than expected, presumably so an unexpected extra match
// would show up in the size check below.
173  Xapian::MSet mset = enquire.get_mset(0, expected_matches + 1);
174  TEST_EQUAL(mset.size(), expected_matches);
175  if (name == "Xapian::BoolWeight") {
176  /* All weights should be zero. */
177  TEST_EQUAL(mset[0].get_weight(), 0.0);
178  TEST_EQUAL(mset.back().get_weight(), 0.0);
179  } else if (name == "Xapian::CoordWeight") {
180  /* All weights should be 1 for a single term query. */
181  TEST_EQUAL(mset[0].get_weight(), 1.0);
182  TEST_EQUAL(mset.back().get_weight(), 1.0);
183  } else if (!params.empty()) {
184  /* All weights should be equal with these particular parameters. */
185  TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
186  TEST_EQUAL(mset[0].get_weight(), mset.back().get_weight());
187  } else {
// Default parameters: weights are expected to be non-zero and to differ
// between the first and last match.
188  TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
189  TEST_NOT_EQUAL(mset[0].get_weight(), mset.back().get_weight());
190  }
191  Xapian::MSet mset_scaled = enquire_scaled.get_mset(0, expected_matches);
192  TEST_EQUAL(mset_scaled.size(), expected_matches);
193  auto lm = name.find("::LM");
194  // All the LM* schemes have sumextra except LMJMWeight.
195  //
196  // BM25 and BM25+ have sumextra, but by default k2 is 0 which means
197  // sumextra is zero too.
// "::LM" is four characters, so name[lm + 4] is the character following "LM";
// a 'J' there identifies LMJMWeight.
198  bool has_sumextra = lm != string::npos && name[lm + 4] != 'J';
199  for (Xapian::doccount i = 0; i < expected_matches; ++i) {
200  double w = mset[i].get_weight();
201  double ws = mset_scaled[i].get_weight();
202  if (has_sumextra) {
203  // sumextra is not scaled, so we can't test for (near)
204  // equality, but we can test that the weight is affected by the
205  // scaling, and that it's between the unscaled weight and the
206  // fully scaled weight.
207  TEST_NOT_EQUAL_DOUBLE(ws, w);
208  TEST_REL(ws, <=, w * 15.0);
209  TEST_REL(ws, >=, w);
210  } else {
// Without sumextra the scaled weight should be 15 times the unscaled one.
211  TEST_EQUAL_DOUBLE(ws, w * 15.0);
212  }
213  }
214  };
215 
216  // MSVC gives nothing for #__VA_ARGS__ when there are no varargs.
// The leading "" in the macro keeps the third argument a valid (empty) string
// literal in that case.
217 #define TEST_WEIGHTING_SCHEME(W, ...) \
218  helper(W(__VA_ARGS__), #W, "" #__VA_ARGS__)
219 
239  // Regression test for bug fixed in 1.2.4.
241  /* As mentioned in the documentation, when parameter k is 0, wdf and
242  * document length don't affect the weights. Regression test for bug fixed
243  * in 1.2.4.
244  */
246 #undef TEST_WEIGHTING_SCHEME
247 }
248 
// Exercises Xapian::Weight::create() with an empty string, an unknown scheme
// name, and an unknown scheme name followed by a parameter.
//
// NOTE(review): each statement below ends with an unmatched ')', so the line
// which opens the enclosing call is not visible in this listing; presumably
// each create() call is wrapped in an exception-expectation macro - confirm
// against the complete file.
250 DEFINE_TESTCASE(weightcreate1, !backend) {
252  delete Xapian::Weight::create(""));
254  delete Xapian::Weight::create("invalid"));
256  delete Xapian::Weight::create("invalid 1.0"));
257 }
258 
263 DEFINE_TESTCASE(bm25weight1, backend) {
264  Xapian::Enquire enquire(get_database("apitest_simpledata"));
265  enquire.set_weighting_scheme(Xapian::BM25Weight(1, 25, 1, 0.01, 0.5));
266  enquire.set_query(Xapian::Query("word"));
267 
268  Xapian::MSet mset = enquire.get_mset(0, 25);
269 }
270 
272 DEFINE_TESTCASE(bm25weight2, !backend) {
273  {
274  auto wt_ptr = Xapian::Weight::create("bm25");
275  auto wt = Xapian::BM25Weight();
276  TEST_EQUAL(wt_ptr->serialise(), wt.serialise());
277  delete wt_ptr;
278  }
279 
280  {
281  auto wt_ptr = Xapian::Weight::create("bm25 1 0 1 0.5 0.5");
282  auto wt = Xapian::BM25Weight(1, 0, 1, 0.5, 0.5);
283  TEST_EQUAL(wt_ptr->serialise(), wt.serialise());
284  delete wt_ptr;
285  }
286 }
287 
288 // Test parameter combinations which should be unaffected by doclength.
289 DEFINE_TESTCASE(bm25weight4, backend) {
290  Xapian::Database db = get_database("apitest_simpledata");
291  Xapian::Enquire enquire(db);
292  enquire.set_query(Xapian::Query("paragraph"));
293  Xapian::MSet mset;
294 
295  enquire.set_weighting_scheme(Xapian::BM25Weight(1, 0, 1, 0, 0.5));
296  mset = enquire.get_mset(0, 10);
297  TEST_EQUAL(mset.size(), 5);
298  // Expect: wdf has an effect on weight, but doclen doesn't.
299  TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
300  TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
301  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
302  TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
303 
304  enquire.set_weighting_scheme(Xapian::BM25Weight(0, 0, 1, 1, 0.5));
305  mset = enquire.get_mset(0, 10);
306  TEST_EQUAL(mset.size(), 5);
307  // Expect: neither wdf nor doclen affects weight.
308  TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
309 }
310 
312 // Regression test for bug fixed in 1.2.17 and 1.3.2.
313 DEFINE_TESTCASE(bm25weight5, backend) {
314  Xapian::Database db = get_database("apitest_simpledata");
315  Xapian::Enquire enquire(db);
316  enquire.set_query(Xapian::Query("paragraph"));
317  Xapian::MSet mset;
318 
319  enquire.set_weighting_scheme(Xapian::BM25Weight(0, 1, 1, 0.5, 0.5));
320  mset = enquire.get_mset(0, 10);
321  TEST_EQUAL(mset.size(), 5);
322  // Expect: wdf has no effect on weight; shorter docs rank higher.
323  mset_expect_order(mset, 3, 5, 1, 4, 2);
324  TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[1].get_weight());
325  TEST_REL(mset[1].get_weight(),>,mset[2].get_weight());
326  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
327  TEST_REL(mset[3].get_weight(),>,mset[4].get_weight());
328 }
329 
330 // Test parameter combinations which should be unaffected by doclength.
331 DEFINE_TESTCASE(bm25plusweight2, backend) {
332  Xapian::Database db = get_database("apitest_simpledata");
333  Xapian::Enquire enquire(db);
334  enquire.set_query(Xapian::Query("paragraph"));
335  Xapian::MSet mset;
336 
337  enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0, 0.5, 1));
338  mset = enquire.get_mset(0, 10);
339  TEST_EQUAL(mset.size(), 5);
340  // Expect: wdf has an effect on weight, but doclen doesn't.
341  TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
342  TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
343  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
344  TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
345 
346  enquire.set_weighting_scheme(Xapian::BM25PlusWeight(0, 0, 1, 1, 0.5, 1));
347  mset = enquire.get_mset(0, 10);
348  TEST_EQUAL(mset.size(), 5);
349  // Expect: neither wdf nor doclen affects weight.
350  TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
351 }
352 
353 // Regression test for a mistake corrected in the BM25+ implementation.
354 DEFINE_TESTCASE(bm25plusweight3, backend) {
355  Xapian::Database db = get_database("apitest_simpledata");
356  Xapian::Enquire enquire(db);
357  enquire.set_query(Xapian::Query("paragraph"));
358  Xapian::MSet mset;
359 
360  enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0.5, 0.5, 1));
361  mset = enquire.get_mset(0, 10);
362  TEST_EQUAL(mset.size(), 5);
363 
364  // The value of each doc weight calculated manually from the BM25+ formulae
365  // by using the respective document statistics.
366  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.7920796567487473);
367  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.7846980783848447);
368  TEST_EQUAL_DOUBLE(mset[2].get_weight(), 0.7558817623365934);
369  TEST_EQUAL_DOUBLE(mset[3].get_weight(), 0.7210119356168847);
370  TEST_EQUAL_DOUBLE(mset[4].get_weight(), 0.7210119356168847);
371 }
372 
373 // Test for invalid values of c.
// Constructs InL2Weight with c = -2.0 and with c = 0.0.
// NOTE(review): both statements end with an unmatched ')', so the line which
// opens each enclosing call is not visible in this listing; presumably an
// exception-expectation macro naming InvalidArgumentError - confirm.
374 DEFINE_TESTCASE(inl2weight2, !backend) {
375  // InvalidArgumentError should be thrown if the parameter c is invalid.
377  Xapian::InL2Weight wt(-2.0));
378 
380  Xapian::InL2Weight wt2(0.0));
381 }
382 
383 // Feature tests for Inl2Weight
// Queries apitest_simpledata for "banana" and checks the single match (doc 6)
// and its weight.
384 DEFINE_TESTCASE(inl2weight3, backend) {
385  Xapian::Database db = get_database("apitest_simpledata");
386  Xapian::Enquire enquire(db);
387  Xapian::Query query("banana");
388 
389  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for InL2Weight) - confirm.
391 
392  Xapian::MSet mset1;
393  mset1 = enquire.get_mset(0, 10);
394  TEST_EQUAL(mset1.size(), 1);
395  mset_expect_order(mset1, 6);
396 
397  /* The value has been calculated in the python interpreter by looking at the
398  * database statistics. */
399  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.559711143842063);
400 }
401 
402 // Test for invalid values of c.
// Constructs IfB2Weight with c = -2.0 and with c = 0.0.
// NOTE(review): both statements end with an unmatched ')', so the line which
// opens each enclosing call is not visible in this listing; presumably an
// exception-expectation macro naming InvalidArgumentError - confirm.
403 DEFINE_TESTCASE(ifb2weight2, !backend) {
404  // InvalidArgumentError should be thrown if the parameter c is invalid.
406  Xapian::IfB2Weight wt(-2.0));
407 
409  Xapian::IfB2Weight wt2(0.0));
410 }
411 
412 // Feature test
// Queries apitest_simpledata for "banana" and checks the weight of the single
// match.
413 DEFINE_TESTCASE(ifb2weight3, backend) {
414  Xapian::Database db = get_database("apitest_simpledata");
415  Xapian::Enquire enquire(db);
416  Xapian::Query query("banana");
417 
418  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for IfB2Weight) - confirm.
420 
421  Xapian::MSet mset1;
422  mset1 = enquire.get_mset(0, 10);
423  TEST_EQUAL(mset1.size(), 1);
424 
425  /* The value of the weight has been manually calculated using the statistics
426  * of the test database. */
427  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.119422287684126);
428 }
429 
430 // Test for invalid values of c.
// Constructs IneB2Weight with c = -2.0 and with c = 0.0.
// NOTE(review): both statements end with an unmatched ')', so the line which
// opens each enclosing call is not visible in this listing; presumably an
// exception-expectation macro naming InvalidArgumentError - confirm.
431 DEFINE_TESTCASE(ineb2weight2, !backend) {
432  // InvalidArgumentError should be thrown if parameter c is invalid.
434  Xapian::IneB2Weight wt(-2.0));
435 
437  Xapian::IneB2Weight wt2(0.0));
438 }
439 
440 // Feature test.
// Queries apitest_simpledata for "paragraph" and checks the weight of one of
// the five matches.
441 DEFINE_TESTCASE(ineb2weight3, backend) {
442  Xapian::Database db = get_database("apitest_simpledata");
443  Xapian::Enquire enquire(db);
444  Xapian::Query query("paragraph");
445  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for IneB2Weight) - confirm.
447 
448  Xapian::MSet mset1;
449  mset1 = enquire.get_mset(0, 10);
450  TEST_EQUAL(mset1.size(), 5);
451 
452  // The third document in the database is 4th in the ranking.
// NOTE(review): the check below reads mset1[4], which is the fifth entry
// (index 4) rather than the fourth - confirm which the comment above intends.
453  /* The weight value has been manually calculated by using the statistics
454  * of the test database. */
455  TEST_EQUAL_DOUBLE(mset1[4].get_weight(), 0.61709730297692400036);
456 }
457 
458 // Test for invalid values of c.
// Constructs BB2Weight with c = -2.0 and with c = 0.0.
// NOTE(review): both statements end with an unmatched ')', so the line which
// opens each enclosing call is not visible in this listing; presumably an
// exception-expectation macro naming InvalidArgumentError - confirm.
459 DEFINE_TESTCASE(bb2weight2, !backend) {
460  // InvalidArgumentError should be thrown if the parameter c is invalid.
462  Xapian::BB2Weight wt(-2.0));
463 
465  Xapian::BB2Weight wt2(0.0));
466 }
467 
468 // Feature test
// Queries apitest_simpledata for "paragraph", checks the top weight, then
// compares against a second result set whose weights are expected to be
// 1/1024 of the first.
469 DEFINE_TESTCASE(bb2weight3, backend) {
470  Xapian::Database db = get_database("apitest_simpledata");
471  Xapian::Enquire enquire(db);
472  Xapian::Query query("paragraph");
473 
474  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for BB2Weight) - confirm.
476 
477  Xapian::MSet mset1;
478  mset1 = enquire.get_mset(0, 10);
479  TEST_EQUAL(mset1.size(), 5);
480  /* The third document in the database has the highest weight and is the
481  * first in the mset. */
482  // Value calculated manually by using the statistics of the test database.
483  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.6823696969784483);
484 
485  // Test with OP_SCALE_WEIGHT and a small factor (regression test, as we
486  // were applying the factor to the upper bound twice).
// NOTE(review): the statements which set up the scaled query are not visible
// in this listing; the loop below implies a scale factor of 1/1024 - confirm.
489 
490  Xapian::MSet mset3;
491  mset3 = enquire.get_mset(0, 10);
492  TEST_EQUAL(mset3.size(), 5);
493 
// Each unscaled weight should equal the scaled weight multiplied by 1024.
494  for (int i = 0; i < 5; ++i) {
495  TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight() * 1024);
496  }
497 }
498 
499 // Regression test: we used to calculate log2(0) when there was only one doc.
// Queries the apitest_onedoc database for "word" and checks the weight of the
// single match.
500 DEFINE_TESTCASE(bb2weight4, backend) {
501  Xapian::Database db = get_database("apitest_onedoc");
502  Xapian::Enquire enquire(db);
503  Xapian::Query query("word");
504 
505  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for BB2Weight) - confirm.
507 
508  Xapian::MSet mset1;
509  mset1 = enquire.get_mset(0, 10);
510  TEST_EQUAL(mset1.size(), 1);
511  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.431020621347435);
512 }
513 
514 // Feature test.
// Queries apitest_simpledata for "a" and checks the order and weights of the
// three matches.
515 DEFINE_TESTCASE(dlhweight1, backend) {
516  Xapian::Database db = get_database("apitest_simpledata");
517  Xapian::Enquire enquire(db);
518  Xapian::Query query("a");
519 
520  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for DLHWeight) - confirm.
522 
523  Xapian::MSet mset1;
524  mset1 = enquire.get_mset(0, 10);
525  TEST_EQUAL(mset1.size(), 3);
526  mset_expect_order(mset1, 3, 1, 2);
527  // Weights calculated manually using stats from the database.
528  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.0046477754371292362);
529  TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.97621929514640352757);
530  // The following weight would be negative but gets clamped to 0.
531  TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.0);
532 }
533 
// Database generator: adds one document whose only term is "solo", added with
// a count of 37 (presumably the wdf, so that wdf equals the document length
// as the "wdf_eq_doclen" database name used by the callers suggests).
//
// NOTE(review): the line carrying the function name and parameter list is not
// visible in this listing; the name gen_wdf_eq_doclen_db is taken from the
// get_database() calls which pass it, and the body uses a parameter named db.
534 static void
536 {
537  Xapian::Document doc;
538  doc.add_term("solo", 37);
539  db.add_document(doc);
540 }
541 
542 // Test wdf == doclen.
// Uses the generated "wdf_eq_doclen" database (built by
// gen_wdf_eq_doclen_db), queries for "solo" and expects one match with
// weight 0.
543 DEFINE_TESTCASE(dlhweight3, backend) {
544  Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
545  Xapian::Enquire enquire(db);
546  Xapian::Query query("solo");
547 
548  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for DLHWeight) - confirm.
550 
551  Xapian::MSet mset1;
552  mset1 = enquire.get_mset(0, 10);
553  TEST_EQUAL(mset1.size(), 1);
554  // Weight gets clamped to zero.
555  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
556 }
557 
558 // Test for invalid values of c.
// Constructs PL2Weight with c = -2.0.
// NOTE(review): the statement ends with an unmatched ')', so the line which
// opens the enclosing call is not visible in this listing; presumably an
// exception-expectation macro naming InvalidArgumentError - confirm.
559 DEFINE_TESTCASE(pl2weight2, !backend) {
560  // InvalidArgumentError should be thrown if parameter c is invalid.
562  Xapian::PL2Weight wt(-2.0));
563 }
564 
565 // Feature Test.
// Queries apitest_simpledata for "paragraph" and checks the difference in
// weight between the third and fourth matches.
566 DEFINE_TESTCASE(pl2weight3, backend) {
567  Xapian::Database db = get_database("apitest_simpledata");
568  Xapian::Enquire enquire(db);
569  Xapian::Query query("paragraph");
570  enquire.set_query(query);
571  Xapian::MSet mset;
572 
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for PL2Weight) - confirm.
574  mset = enquire.get_mset(0, 10);
575  TEST_EQUAL(mset.size(), 5);
576  // Expected weight difference calculated in extended precision using stats
577  // from the test database.
578  TEST_EQUAL_DOUBLE(mset[2].get_weight(),
579  mset[3].get_weight() + 0.0086861771701328694);
580 }
581 
582 // Test for invalid values of parameters, c and delta.
// Constructs PL2PlusWeight with (-2.0, 0.9) and with (1.0, -1.9).
// NOTE(review): both statements end with an unmatched ')', so the line which
// opens each enclosing call is not visible in this listing; presumably an
// exception-expectation macro naming InvalidArgumentError - confirm.
583 DEFINE_TESTCASE(pl2plusweight2, !backend) {
584  // InvalidArgumentError should be thrown if parameter c is invalid.
586  Xapian::PL2PlusWeight wt(-2.0, 0.9));
587 
588  // InvalidArgumentError should be thrown if parameter delta is invalid.
590  Xapian::PL2PlusWeight wt(1.0, -1.9));
591 }
592 
593 // Feature Test 1 for PL2PlusWeight.
594 DEFINE_TESTCASE(pl2plusweight4, backend) {
595  Xapian::Database db = get_database("apitest_simpledata");
596  Xapian::Enquire enquire(db);
597  enquire.set_query(Xapian::Query("to"));
598  Xapian::MSet mset;
599 
600  enquire.set_weighting_scheme(Xapian::PL2PlusWeight(2.0, 0.8));
601  mset = enquire.get_mset(0, 10);
602  TEST_EQUAL(mset.size(), 3);
603  // Expected weight difference calculated in Python using stats from the
604  // test database.
605  TEST_EQUAL_DOUBLE(mset[1].get_weight(),
606  mset[2].get_weight() + 0.016760925252262027);
607 }
608 
609 // Feature Test 2 for PL2PlusWeight
610 DEFINE_TESTCASE(pl2plusweight5, backend) {
611  Xapian::Database db = get_database("apitest_simpledata");
612  Xapian::Enquire enquire(db);
613  Xapian::Query query("word");
614  enquire.set_query(query);
615  Xapian::MSet mset;
616 
617  enquire.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
618  mset = enquire.get_mset(0, 10);
619  // Expect MSet contains two documents having query "word".
620  TEST_EQUAL(mset.size(), 2);
621  // Expect Document 2 has higher weight than document 4 because
622  // "word" appears more no. of times in document 2 than document 4.
623  mset_expect_order(mset, 2, 4);
624 }
625 
626 // Feature test
// Queries apitest_simpledata for "paragraph" and checks the difference in
// weight between the third and fifth matches.
627 DEFINE_TESTCASE(dphweight1, backend) {
628  Xapian::Database db = get_database("apitest_simpledata");
629  Xapian::Enquire enquire(db);
630  Xapian::Query query("paragraph");
631 
632  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for DPHWeight) - confirm.
634 
635  Xapian::MSet mset1;
636  mset1 = enquire.get_mset(0, 10);
637  TEST_EQUAL(mset1.size(), 5);
638  /* The weight has been calculated manually by using the statistics of the
639  * test database. */
640  TEST_EQUAL_DOUBLE(mset1[2].get_weight() - mset1[4].get_weight(), 0.542623617687990167);
641 }
642 
643 // Test wdf == doclen.
// Uses the generated "wdf_eq_doclen" database (built by
// gen_wdf_eq_doclen_db), queries for "solo" and expects one match with
// weight 0.
644 DEFINE_TESTCASE(dphweight3, backend) {
645  Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
646  Xapian::Enquire enquire(db);
647  Xapian::Query query("solo");
648 
649  enquire.set_query(query);
// NOTE(review): no weighting scheme is selected in the lines visible here; a
// line appears to be missing from this listing at this point (presumably the
// set_weighting_scheme() call for DPHWeight) - confirm.
651 
652  Xapian::MSet mset1;
653  mset1 = enquire.get_mset(0, 10);
654  TEST_EQUAL(mset1.size(), 1);
655  // Weight gets clamped to zero.
656  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
657 }
658 
659 // Test for various cases of normalization string.
// Covers two invalid normalization strings passed to the TfIdfWeight
// constructor, and several "tfidf ..." scheme strings passed to
// Xapian::Weight::create() (an unrecognised word in each of the three
// positions, and strings with only one or two words after "tfidf").
//
// NOTE(review): every statement below ends with an unmatched ')', so the line
// which opens each enclosing call is not visible in this listing; presumably
// an exception-expectation macro naming InvalidArgumentError - confirm.
660 DEFINE_TESTCASE(tfidfweight1, !backend) {
661  // InvalidArgumentError should be thrown if normalization string is invalid
663  Xapian::TfIdfWeight b("JOHN_LENNON"));
664 
666  Xapian::TfIdfWeight b("LOL"));
667 
669  Xapian::Weight::create("tfidf FUN NONE NONE"));
670 
672  Xapian::Weight::create("tfidf NONE FUN NONE"));
673 
675  Xapian::Weight::create("tfidf NONE NONE FUN"));
676 
678  Xapian::Weight::create("tfidf NONE"));
679 
681  Xapian::Weight::create("tfidf NONE NONE"));
682 }
683 
684 // Feature tests for various normalization functions.
685 DEFINE_TESTCASE(tfidfweight3, backend) {
686  Xapian::Database db = get_database("apitest_simpledata");
687  Xapian::Enquire enquire(db);
688  Xapian::Query query("word");
689  Xapian::MSet mset;
690 
691  // Check for "ntn" when termfreq != N
692  enquire.set_query(query);
694  mset = enquire.get_mset(0, 10);
695  TEST_EQUAL(mset.size(), 2);
696  // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
697  mset_expect_order(mset, 2, 4);
698  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
699 
700  // Check that wqf is taken into account.
701  enquire.set_query(Xapian::Query("word", 2));
703  Xapian::MSet mset2 = enquire.get_mset(0, 10);
704  TEST_EQUAL(mset2.size(), 2);
705  // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
706  mset_expect_order(mset2, 2, 4);
707  // wqf is 2, so weights should be doubled.
708  TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
709  TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
710 
711  // check for "nfn" when termfreq != N
712  enquire.set_query(query);
714  mset = enquire.get_mset(0, 10);
715  TEST_EQUAL(mset.size(), 2);
716  mset_expect_order(mset, 2, 4);
717  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
718 
719  // check for "nsn" when termfreq != N
720  enquire.set_query(query);
722  mset = enquire.get_mset(0, 10);
723  TEST_EQUAL(mset.size(), 2);
724  mset_expect_order(mset, 2, 4);
725  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
726 
727  // Check for "bnn" and for both branches of 'b'.
728  enquire.set_query(Xapian::Query("test"));
730  mset = enquire.get_mset(0, 10);
731  TEST_EQUAL(mset.size(), 1);
732  mset_expect_order(mset, 1);
733  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
734 
735  // Check for "lnn" and for both branches of 'l'.
736  enquire.set_query(Xapian::Query("word"));
738  mset = enquire.get_mset(0, 10);
739  TEST_EQUAL(mset.size(), 2);
740  mset_expect_order(mset, 2, 4);
741  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0)); // idfn=1 and so wt=tfn=1+log(tf)
742  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0); // idfn=1 and wt=tfn=1+log(tf)=1+log(1)=1
743 
744  // Check for "snn"
745  enquire.set_query(Xapian::Query("paragraph"));
746  enquire.set_weighting_scheme(Xapian::TfIdfWeight("snn")); // idf=1 and tfn=tf*tf
747  mset = enquire.get_mset(0, 10);
748  TEST_EQUAL(mset.size(), 5);
749  mset_expect_order(mset, 2, 1, 4, 3, 5);
750  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
751  TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
752 
753  // Check for "ntn" when termfreq=N
754  enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "t"
756  mset = enquire.get_mset(0, 10);
757  TEST_EQUAL(mset.size(), 6);
758  mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
759  for (int i = 0; i < 6; ++i) {
760  TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
761  }
762 
763  // Check for "npn" and for both branches of 'p'
764  enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "p"
766  mset = enquire.get_mset(0, 10);
767  TEST_EQUAL(mset.size(), 6);
768  mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
769  for (int i = 0; i < 6; ++i) {
770  TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
771  }
772 
773  // Check for "Lnn".
774  enquire.set_query(Xapian::Query("word"));
776  mset = enquire.get_mset(0, 10);
777  TEST_EQUAL(mset.size(), 2);
778  mset_expect_order(mset, 2, 4);
779  TEST_EQUAL_DOUBLE(mset[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
780  TEST_EQUAL_DOUBLE(mset[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
781 
782  enquire.set_query(Xapian::Query("word"));
784  mset = enquire.get_mset(0, 10);
785  TEST_EQUAL(mset.size(), 2);
786  mset_expect_order(mset, 2, 4);
787  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
788  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
789 
790  // Check for "mnn".
791  enquire.set_query(Xapian::Query("word"));
793  mset = enquire.get_mset(0, 10);
794  TEST_EQUAL(mset.size(), 2);
795  mset_expect_order(mset, 2, 4);
796  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 8);
797  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0 / 4);
798 
799  // Check for "ann".
800  enquire.set_query(Xapian::Query("word"));
802  mset = enquire.get_mset(0, 10);
803  TEST_EQUAL(mset.size(), 2);
804  mset_expect_order(mset, 2, 4);
805  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
806  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
807 
808  // Check for NONE, TFIDF, NONE when termfreq != N
809  enquire.set_query(query);
810  enquire.set_weighting_scheme(
815  mset = enquire.get_mset(0, 10);
816  TEST_EQUAL(mset.size(), 2);
817  // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
818  mset_expect_order(mset, 2, 4);
819  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
820 
821  // Check that wqf is taken into account.
822  enquire.set_query(Xapian::Query("word", 2));
823  mset2 = enquire.get_mset(0, 10);
824  TEST_EQUAL(mset2.size(), 2);
825  // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
826  mset_expect_order(mset2, 2, 4);
827  // wqf is 2, so weights should be doubled.
828  TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
829  TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
830 
831  // check for NONE, FREQ, NONE when termfreq != N
832  enquire.set_query(query);
833  enquire.set_weighting_scheme(
838  mset = enquire.get_mset(0, 10);
839  TEST_EQUAL(mset.size(), 2);
840  mset_expect_order(mset, 2, 4);
841  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
842 
843  // check for NONE, SQUARE, NONE when termfreq != N
844  enquire.set_query(query);
845  enquire.set_weighting_scheme(
850  mset = enquire.get_mset(0, 10);
851  TEST_EQUAL(mset.size(), 2);
852  mset_expect_order(mset, 2, 4);
853  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
854 
855  // Check for BOOLEAN, NONE, NONE and for both branches of BOOLEAN.
856  enquire.set_query(Xapian::Query("test"));
857  enquire.set_weighting_scheme(
862  mset = enquire.get_mset(0, 10);
863  TEST_EQUAL(mset.size(), 1);
864  mset_expect_order(mset, 1);
865  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
866 
867  // Check for LOG, NONE, NONE and for both branches of LOG.
868  enquire.set_query(Xapian::Query("word"));
869  enquire.set_weighting_scheme(
874  mset = enquire.get_mset(0, 10);
875  TEST_EQUAL(mset.size(), 2);
876  mset_expect_order(mset, 2, 4);
877  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0));
878  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0);
879 
880  // Check for SQUARE, NONE, NONE.
881  enquire.set_query(Xapian::Query("paragraph"));
882  enquire.set_weighting_scheme(
886  Xapian::TfIdfWeight::wt_norm::NONE)); // idf=1 and tfn=tf*tf
887  mset = enquire.get_mset(0, 10);
888  TEST_EQUAL(mset.size(), 5);
889  mset_expect_order(mset, 2, 1, 4, 3, 5);
890  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
891  TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
892 
893  // Check for NONE, TFIDF, NONE when termfreq=N
894  enquire.set_query(Xapian::Query("this"));
895  // N=termfreq and so idfn=0 for TFIDF
896  enquire.set_weighting_scheme(
901  mset = enquire.get_mset(0, 10);
902  TEST_EQUAL(mset.size(), 6);
903  mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
904  for (int i = 0; i < 6; ++i) {
905  TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
906  }
907 
908  // Check for NONE, PROB, NONE and for both branches of PROB
909  enquire.set_query(Xapian::Query("this"));
910  // N=termfreq and so idfn=0 for PROB
911  enquire.set_weighting_scheme(
916  mset = enquire.get_mset(0, 10);
917  TEST_EQUAL(mset.size(), 6);
918  mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
919  for (int i = 0; i < 6; ++i) {
920  TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
921  }
922 
923  enquire.set_query(Xapian::Query("word"));
924  enquire.set_weighting_scheme(
929  mset = enquire.get_mset(0, 10);
930  TEST_EQUAL(mset.size(), 2);
931  mset_expect_order(mset, 2, 4);
932  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
933  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
934 
935  // Check for LOG_AVERAGE, NONE, NONE.
936  enquire.set_query(Xapian::Query("word"));
937  enquire.set_weighting_scheme(
942  mset = enquire.get_mset(0, 10);
943  TEST_EQUAL(mset.size(), 2);
944  mset_expect_order(mset, 2, 4);
945  TEST_EQUAL_DOUBLE(mset[0].get_weight(),
946  (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
947  TEST_EQUAL_DOUBLE(mset[1].get_weight(),
948  (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
949 
950  // Check for AUG_LOG, NONE, NONE.
951  enquire.set_weighting_scheme(
956  mset = enquire.get_mset(0, 10);
957  TEST_EQUAL(mset.size(), 2);
958  mset_expect_order(mset, 2, 4);
959  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.2 + 0.8 * log(1.0 + 8));
960  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.2 + 0.8 * log(1.0 + 1));
961 
962  // Check for NONE, GLOBAL_FREQ, NONE.
963  enquire.set_weighting_scheme(
968  mset = enquire.get_mset(0, 10);
969  TEST_EQUAL(mset.size(), 2);
970  mset_expect_order(mset, 2, 4);
971  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * (9.0 / 2));
972  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * (9.0 / 2));
973 
974  // Check for SQRT, NONE, NONE.
975  enquire.set_weighting_scheme(
980  mset = enquire.get_mset(0, 10);
981  TEST_EQUAL(mset.size(), 2);
982  mset_expect_order(mset, 2, 4);
983  TEST_EQUAL_DOUBLE(mset[0].get_weight(), sqrt(8 - 0.5) + 1);
984  TEST_EQUAL_DOUBLE(mset[1].get_weight(), sqrt(1 - 0.5) + 1);
985 
986  // Check for NONE, LOG_GLOBAL_FREQ, NONE.
987  enquire.set_weighting_scheme(
992  mset = enquire.get_mset(0, 10);
993  TEST_EQUAL(mset.size(), 2);
994  mset_expect_order(mset, 2, 4);
995  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log(9.0 / 2 + 1));
996  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log(9.0 / 2 + 1));
997 
998  // Check for NONE, INCREMENTED_GLOBAL_FREQ, NONE.
999  enquire.set_weighting_scheme(
1004  mset = enquire.get_mset(0, 10);
1005  TEST_EQUAL(mset.size(), 2);
1006  mset_expect_order(mset, 2, 4);
1007  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * (9.0 / 2 + 1));
1008  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * (9.0 / 2 + 1));
1009 
1010  // Check for NONE, SQRT_GLOBAL_FREQ, NONE.
1011  enquire.set_weighting_scheme(
1016  mset = enquire.get_mset(0, 10);
1017  TEST_EQUAL(mset.size(), 2);
1018  mset_expect_order(mset, 2, 4);
1019  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * sqrt(9.0 / 2 - 0.9));
1020  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * sqrt(9.0 / 2 - 0.9));
1021 
1022  // Check for AUG_AVERAGE, NONE, NONE.
1023  enquire.set_weighting_scheme(
1028  mset = enquire.get_mset(0, 10);
1029  TEST_EQUAL(mset.size(), 2);
1030  mset_expect_order(mset, 2, 4);
1031  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.9 + 0.1 * (8.0 / (81.0 / 56.0)));
1032  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.9 + 0.1 * (1.0 / (31.0 / 26.0)));
1033 
1034  // Check for MAX, NONE, NONE.
1035  enquire.set_weighting_scheme(
1040  mset = enquire.get_mset(0, 10);
1041  TEST_EQUAL(mset.size(), 2);
1042  mset_expect_order(mset, 2, 4);
1043  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 8);
1044  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0 / 4);
1045 
1046  // Check for AUG, NONE, NONE.
1047  enquire.set_weighting_scheme(
1052  mset = enquire.get_mset(0, 10);
1053  TEST_EQUAL(mset.size(), 2);
1054  mset_expect_order(mset, 2, 4);
1055  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
1056  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
1057 }
1058 
1059 // Feature tests for pivoted normalization functions.
// Each section below selects a weighting scheme, fetches up to 10 matches
// from "apitest_simpledata" and checks the number of matches together with
// either the relative weights or the expected document order.
1060 DEFINE_TESTCASE(tfidfweight4, backend) {
1061  Xapian::Database db = get_database("apitest_simpledata");
1062  Xapian::Enquire enquire(db);
1063  Xapian::Query query("paragraph");
1064  Xapian::MSet mset;
1065 
1066  // Check for "PPn" normalization string.
1067  enquire.set_query(query);
1068  enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1069  mset = enquire.get_mset(0, 10);
1070  TEST_EQUAL(mset.size(), 5);
1071  // Shorter docs should rank higher if wqf is equal among all the docs.
1072  TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
1073  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
1074 
1075  // Check that wqf is taken into account.
      // Same term and scheme as above, but the query is built with wqf 2.
1076  enquire.set_query(Xapian::Query("paragraph", 2));
1077  enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1078  Xapian::MSet mset2 = enquire.get_mset(0, 10);
1079  TEST_EQUAL(mset2.size(), 5);
1080  // wqf is 2, so weights should be doubled.
1081  TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
1082  TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
1083 
1084  // check for "nPn" which represents "xPx"
1085  enquire.set_query(Xapian::Query("word"));
1086  enquire.set_weighting_scheme(Xapian::TfIdfWeight("nPn", 0.2, 1.0));
1087  mset = enquire.get_mset(0, 10);
1088  TEST_EQUAL(mset.size(), 2);
1089  // Expect doc 2 with query "word" to have higher weight than doc 4.
1090  mset_expect_order(mset, 2, 4);
1091 
1092  // check for "Ptn" which represents "Pxx"
1093  enquire.set_query(Xapian::Query("word"));
1094  enquire.set_weighting_scheme(Xapian::TfIdfWeight("Ptn", 0.2, 1.0));
1095  mset = enquire.get_mset(0, 10);
1096  TEST_EQUAL(mset.size(), 2);
1097  // Expect doc 2 with query "word" to have higher weight than doc 4.
1098  mset_expect_order(mset, 2, 4);
1099 
1100  // Check for PIVOTED, PIVOTED, NONE normalization string.
      // The remaining sections repeat the checks above with the scheme
      // described by three named normalizations instead of a string.
1101  enquire.set_query(query);
1102  enquire.set_weighting_scheme(
1107  mset = enquire.get_mset(0, 10);
1108  TEST_EQUAL(mset.size(), 5);
1109  // Shorter docs should rank higher if wqf is equal among all the docs.
1110  TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
1111  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
1112 
1113  // Check that wqf is taken into account.
      // Only the query changes here; no new weighting scheme is set before
      // fetching mset2.
1114  enquire.set_query(Xapian::Query("paragraph", 2));
1115  mset2 = enquire.get_mset(0, 10);
1116  TEST_EQUAL(mset2.size(), 5);
1117  // wqf is 2, so weights should be doubled.
1118  TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
1119  TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
1120 
1121  // check for NONE, PIVOTED, NONE
1122  enquire.set_query(Xapian::Query("word"));
1123  enquire.set_weighting_scheme(
1128  mset = enquire.get_mset(0, 10);
1129  TEST_EQUAL(mset.size(), 2);
1130  // Expect doc 2 with query "word" to have higher weight than doc 4.
1131  mset_expect_order(mset, 2, 4);
1132 
1133  // check for PIVOTED, TFIDF, NONE
1134  enquire.set_query(Xapian::Query("word"));
1135  enquire.set_weighting_scheme(
1140  mset = enquire.get_mset(0, 10);
1141  TEST_EQUAL(mset.size(), 2);
1142  // Expect doc 2 with query "word" to have higher weight than doc 4.
1143  mset_expect_order(mset, 2, 4);
1144 }
1145 
1146 // Check that create_from_parameters() creates the correct object.
// The test goes through Xapian::Weight::create() with a "tfidf ..." parameter
// string and compares the serialised form of the result with that of a
// reference object.
// NOTE(review): the reference objects wt and wt2 are declared on lines that
// are not visible here; presumably they are TfIdfWeight objects constructed
// directly with the matching normalizations - confirm.
1147 DEFINE_TESTCASE(tfidfweight5, !backend) {
1148  auto wt_ptr = Xapian::Weight::create("tfidf NONE TFIDF NONE");
1152  TEST_EQUAL(wt_ptr->serialise(), wt.serialise());
      // The object returned by create() is released by hand.
      // NOTE(review): if TEST_EQUAL leaves the test early on failure, this
      // delete is not reached - confirm whether that matters here.
1153  delete wt_ptr;
1154 
1155  auto wt_ptr2 = Xapian::Weight::create("tfidf SQRT PIVOTED NONE");
1159  TEST_EQUAL(wt_ptr2->serialise(), wt2.serialise());
1160  delete wt_ptr2;
1161 }
1162 
// Weight subclass which records how init() is called: every call bumps one of
// two caller-owned counters depending on whether the factor passed is zero.
1164  public:
      // Last factor passed to init(); -1.0 until init() has been called.
1165  double factor;
1166 
      // References to counters owned by the caller.  clone() hands the same
      // references to the new object, so all clones update the same counters.
1167  unsigned & zero_inits, & non_zero_inits;
1168 
1169  CheckInitWeight(unsigned &z, unsigned &n)
1170  : factor(-1.0), zero_inits(z), non_zero_inits(n) {
1171  need_stat(DOC_LENGTH);
1172  }
1173 
      // Store the factor and count the call as zero or non-zero.
1174  void init(double factor_) override {
1175  factor = factor_;
1176  if (factor == 0.0)
1177  ++zero_inits;
1178  else
1179  ++non_zero_inits;
1180  }
1181 
      // The clone shares this object's counters; factor starts again at -1.0.
1182  Weight* clone() const override {
1183  return new CheckInitWeight(zero_inits, non_zero_inits);
1184  }
1185 
      // Tail of get_sumpart(): every term contributes a constant 1.0.
1187  Xapian::termcount, Xapian::termcount) const override {
1188  return 1.0;
1189  }
1190 
1191  double get_maxpart() const override { return 1.0; }
1192 
      // Tail of get_sumextra(): the term-independent part is the reciprocal
      // of the document length.
1195  Xapian::termcount) const override {
1196  return 1.0 / doclen;
1197  }
1198 
1199  double get_maxextra() const override { return 1.0; }
1200 };
1201 
// Run a two-term query ("this", "paragraph") with CheckInitWeight and check
// how many times init() was called with a zero and with a non-zero factor.
1203 DEFINE_TESTCASE(checkinitweight1, backend && !multi && !remote) {
1204  Xapian::Database db = get_database("apitest_simpledata");
1205  Xapian::Enquire enquire(db);
1207  Xapian::Query("this"), Xapian::Query("paragraph"));
1208  enquire.set_query(q);
      // The counters live here; wt and every clone of it update them by
      // reference.
1209  unsigned zero_inits = 0, non_zero_inits = 0;
1210  CheckInitWeight wt(zero_inits, non_zero_inits);
1211  enquire.set_weighting_scheme(wt);
1212  Xapian::MSet mset = enquire.get_mset(0, 3);
      // Presumably one non-zero init() per query term plus a single
      // zero-factor init() for the term-independent part - confirm against
      // the Xapian::Weight::init() documentation.
1213  TEST_EQUAL(zero_inits, 1);
1214  TEST_EQUAL(non_zero_inits, 2);
1215 }
1216 
// Weight subclass used to verify the statistics made available to weighting
// schemes: get_sumpart() compares them with values read directly from the
// database and accumulates the wdf values it sees into caller-owned sums.
1218  public:
      // Last factor passed to init().
1219  double factor = -1.0;
1220 
1222 
      // The term being tested (the prefix in the OP_WILDCARD case, where it
      // is passed to allterms_begin()).
1223  string term1;
1224 
1225  // When testing OP_SYNONYM, term2 is also set.
1226  // When testing OP_WILDCARD, term2 == "*".
1227  // When testing a repeated term, term2 == "=" for the first occurrence and
1228  // "_" for subsequent occurrences.
1229  mutable string term2;
1230 
1233 
      // Bounds fetched by get_maxpart() the first time it runs while
      // len_upper is still 0; get_sumpart() checks per-document values
      // against them.
1234  mutable Xapian::termcount len_upper = 0;
1235  mutable Xapian::termcount len_lower = Xapian::termcount(-1);
1236  mutable Xapian::termcount uniqueterms_upper = 0;
1237  mutable Xapian::termcount uniqueterms_lower = Xapian::termcount(-1);
1238  mutable Xapian::termcount wdf_upper = 0;
1239 
      // Main constructor (first line not shown): stores its arguments and
      // requests every statistic listed below.
1241  const string & term1_,
1242  const string & term2_,
1243  Xapian::termcount & sum_,
1244  Xapian::termcount & sum_squares_)
1245  : db(db_), term1(term1_), term2(term2_),
1246  sum(sum_), sum_squares(sum_squares_)
1247  {
1248  need_stat(COLLECTION_SIZE);
1249  need_stat(RSET_SIZE);
1250  need_stat(AVERAGE_LENGTH);
1251  need_stat(TERMFREQ);
1252  need_stat(RELTERMFREQ);
1253  need_stat(QUERY_LENGTH);
1254  need_stat(WQF);
1255  need_stat(WDF);
1256  need_stat(DOC_LENGTH);
1257  need_stat(DOC_LENGTH_MIN);
1258  need_stat(DOC_LENGTH_MAX);
1259  need_stat(DB_DOC_LENGTH_MIN);
1260  need_stat(DB_DOC_LENGTH_MAX);
1261  need_stat(WDF_MAX);
1262  need_stat(COLLECTION_FREQ);
1263  need_stat(UNIQUE_TERMS);
1264  need_stat(UNIQUE_TERMS_MIN);
1265  need_stat(UNIQUE_TERMS_MAX);
1266  need_stat(DB_UNIQUE_TERMS_MIN);
1267  need_stat(DB_UNIQUE_TERMS_MAX);
1268  need_stat(TOTAL_LENGTH);
1269  need_stat(WDF_DOC_MAX);
1270  }
1271 
      // Single-term constructor: delegates with an empty term2.
1273  const string & term_,
1274  Xapian::termcount & sum_,
1275  Xapian::termcount & sum_squares_)
1276  : CheckStatsWeight(db_, term_, string(), sum_, sum_squares_) { }
1277 
1278  void init(double factor_) override {
1279  factor = factor_;
1280  }
1281 
      // Note that cloning can modify this (const) object: term2 is mutable
      // and is switched from "=" to "_" after the first clone is made.
1282  Weight* clone() const override {
1283  auto res = new CheckStatsWeight(db, term1, term2, sum, sum_squares);
1284  if (term2 == "=") {
1285  // The object passed to Enquire::set_weighting_scheme() is cloned
1286  // right away, and then cloned again for each term, and then
1287  // potentially once more for the term-independent weight
1288  // contribution. In the repeated case, we want to handle the first
1289  // actual term specially, so we arrange for that to have "=" for
1290  // term2, and subsequent clones to have "_", so that we accumulate
1291  // sum and sum_squares on the first occurrence only.
1292  term2 = "_";
1293  }
1294  return res;
1295  }
1296 
      // get_sumpart() (first line not shown): checks the statistics for one
      // matching document, adds its wdf and wdf squared to sum and
      // sum_squares, and always returns 1.0.
1298  Xapian::termcount doclen,
1299  Xapian::termcount uniqueterms,
1300  Xapian::termcount wdfdocmax) const override {
1301  Xapian::doccount num_docs = db.get_doccount();
1302  TEST_EQUAL(get_collection_size(), num_docs);
1303  TEST_EQUAL(get_rset_size(), 0);
1304  TEST_EQUAL(get_average_length(), db.get_avlength());
1305  Xapian::totallength totlen = get_total_length();
1306  TEST_EQUAL(totlen, db.get_total_length());
      // Average length times document count, rounded to the nearest
      // integer, should reproduce the total length.
1307  double total_term_occurences = get_average_length() * num_docs;
1308  TEST_EQUAL(Xapian::totallength(total_term_occurences + 0.5), totlen);
1309  if (term2.empty() || term2 == "=" || term2 == "_") {
      // Single term or repeated term: frequencies must match term1's
      // exactly.
1310  TEST_EQUAL(get_termfreq(), db.get_termfreq(term1));
1311  TEST_EQUAL(get_collection_freq(), db.get_collection_freq(term1));
1312  if (term2.empty()) {
1313  TEST_EQUAL(get_query_length(), 1);
1314  } else {
1315  TEST_EQUAL(get_query_length(), 2);
1316  }
1317  } else {
      // Several terms combined: only bounds on the frequencies are
      // checked, using the maximum and the sum over the subterms.
1318  Xapian::doccount tfmax = 0, tfsum = 0;
1319  Xapian::termcount cfmax = 0, cfsum = 0;
1320  if (term2 == "*") {
1321  // OP_WILDCARD case.
1322  for (auto&& t = db.allterms_begin(term1);
1323  t != db.allterms_end(term1); ++t) {
1324  Xapian::doccount tf = t.get_termfreq();
1325  tout << "->" << *t << " " << tf << '\n';
1326  tfsum += tf;
1327  tfmax = max(tfmax, tf);
1329  cfsum += cf;
1330  cfmax = max(cfmax, cf);
1331  }
1332  TEST_EQUAL(get_query_length(), 1);
1333  } else {
1334  // OP_SYNONYM case.
1335  Xapian::doccount tf1 = db.get_termfreq(term1);
1336  Xapian::doccount tf2 = db.get_termfreq(term2);
1337  tfsum = tf1 + tf2;
1338  tfmax = max(tf1, tf2);
1339  Xapian::termcount cf1 = db.get_collection_freq(term1);
1340  Xapian::termcount cf2 = db.get_collection_freq(term2);
1341  cfsum = cf1 + cf2;
1342  cfmax = max(cf1, cf2);
1343  TEST_EQUAL(get_query_length(), 2);
1344  }
1345  // Synonym occurs at least as many times as any term.
1346  TEST_REL(get_termfreq(), >=, tfmax);
1347  TEST_REL(get_collection_freq(), >=, cfmax);
1348  // Synonym can't occur more times than the terms do.
1349  TEST_REL(get_termfreq(), <=, tfsum);
1350  TEST_REL(get_collection_freq(), <=, cfsum);
1351  // Synonym can't occur more times than there are documents/terms.
1352  TEST_REL(get_termfreq(), <=, num_docs);
1353  TEST_REL(get_collection_freq(), <=, totlen);
1354  }
1355  TEST_EQUAL(get_reltermfreq(), 0);
1356  TEST_EQUAL(get_wqf(), 1);
      // Per-document values must lie within the bounds cached by
      // get_maxpart().
1357  TEST_REL(doclen,>=,len_lower);
1358  TEST_REL(doclen,<=,len_upper);
1359  if (doclen > 0) {
1360  TEST_REL(uniqueterms,>=,1);
1361  TEST_REL(uniqueterms_lower,>=,1);
1362  TEST_REL(wdfdocmax,>=,1);
1363  }
1364  TEST_REL(uniqueterms,>=,uniqueterms_lower);
1365  TEST_REL(uniqueterms,<=,uniqueterms_upper);
1366  TEST_REL(uniqueterms,<=,doclen);
1367  TEST_REL(uniqueterms_upper,<=,len_upper);
1368  TEST_REL(wdf,<=,wdf_upper);
1369  TEST_REL(wdfdocmax,<=,doclen);
1370  TEST_REL(wdfdocmax,>=,wdf);
1371 
      // The database-wide bounds reported to the weight must equal what the
      // database itself reports.
1372  auto db_len_lower = db.get_doclength_lower_bound();
1373  auto db_len_upper = db.get_doclength_upper_bound();
1374  auto db_uniqueterms_lower = db.get_unique_terms_lower_bound();
1375  auto db_uniqueterms_upper = db.get_unique_terms_upper_bound();
1376  TEST_EQUAL(get_db_doclength_lower_bound(), db_len_lower);
1377  TEST_EQUAL(get_db_doclength_upper_bound(), db_len_upper);
1378  TEST_EQUAL(get_db_unique_terms_lower_bound(), db_uniqueterms_lower);
1379  TEST_EQUAL(get_db_unique_terms_upper_bound(), db_uniqueterms_upper);
      // When db.size() is 1 the cached bounds must equal the database-wide
      // ones; otherwise they only need to lie within them.
1380  if (db.size() == 1) {
1381  TEST_EQUAL(len_lower, db_len_lower);
1382  TEST_EQUAL(len_upper, db_len_upper);
1383  TEST_EQUAL(uniqueterms_lower, db_uniqueterms_lower);
1384  TEST_EQUAL(uniqueterms_upper, db_uniqueterms_upper);
1385  } else {
1386  TEST_REL(len_lower,>=,db_len_lower);
1387  TEST_REL(len_upper,<=,db_len_upper);
1388  TEST_REL(uniqueterms_lower,>=,db_uniqueterms_lower);
1389  TEST_REL(uniqueterms_upper,<=,db_uniqueterms_upper);
1390  }
      // Objects marked "_" (later occurrences of a repeated term) do not
      // contribute, so each document is only counted once.
1391  if (term2 != "_") {
1392  sum += wdf;
1393  sum_squares += wdf * wdf;
1394  }
1395  return 1.0;
1396  }
1397 
      // Besides returning the constant bound, this fetches the bounds used
      // by get_sumpart(); len_upper == 0 marks them as not fetched yet.
1398  double get_maxpart() const override {
1399  if (len_upper == 0) {
1400  len_lower = get_doclength_lower_bound();
1401  len_upper = get_doclength_upper_bound();
1402  uniqueterms_lower = get_unique_terms_lower_bound();
1403  uniqueterms_upper = get_unique_terms_upper_bound();
1404  wdf_upper = get_wdf_upper_bound();
1405  }
1406  return 1.0;
1407  }
1408 
      // Tail of get_sumextra(): reciprocal of the document length.
1411  Xapian::termcount) const override {
1412  return 1.0 / doclen;
1413  }
1414 
1415  double get_maxextra() const override { return 1.0; }
1416 };
1417 
// For every term in the database, run a single-term query with
// CheckStatsWeight and compare the wdf sums it accumulated with sums computed
// directly from the term's posting list.
1419 DEFINE_TESTCASE(checkstatsweight1, backend && !remote) {
1420  Xapian::Database db = get_database("apitest_simpledata");
1421  Xapian::Enquire enquire(db);
1423  for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1424  const string & term = *a;
1425  enquire.set_query(Xapian::Query(term));
      // sum and sum_squares are filled in through the references held by
      // the weight object and its clones.
1426  Xapian::termcount sum = 0;
1427  Xapian::termcount sum_squares = 0;
1428  CheckStatsWeight wt(db, term, sum, sum_squares);
1429  enquire.set_weighting_scheme(wt);
      // Ask for as many results as there are documents so that no match is
      // cut off.
1430  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1431 
1432  // The document order in the multi-db case isn't the same as the
1433  // postlist order on the combined DB, so it's hard to compare the
1434  // wdf for each document in the Weight objects, but we can sum
1435  // the wdfs and the squares of the wdfs which provides a decent
1436  // check that we're not getting the wrong wdf values (it ensures
1437  // they have the right mean and standard deviation).
1438  Xapian::termcount expected_sum = 0;
1439  Xapian::termcount expected_sum_squares = 0;
1441  for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1442  Xapian::termcount wdf = i.get_wdf();
1443  expected_sum += wdf;
1444  expected_sum_squares += wdf * wdf;
1445  }
1446  TEST_EQUAL(sum, expected_sum);
1447  TEST_EQUAL(sum_squares, expected_sum_squares);
1448  }
1449 }
1450 
1452 // Regression test for bugs fixed in 1.4.1.
// Takes the database's terms two at a time, combines each pair in one query
// and checks the wdf sums seen by CheckStatsWeight against sums obtained by
// merging the two posting lists by hand.
1453 DEFINE_TESTCASE(checkstatsweight2, backend && !remote) {
1454  Xapian::Database db = get_database("apitest_simpledata");
1455  Xapian::Enquire enquire(db);
1457  for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1458  const string & term1 = *a;
      // Advance to the second term of the pair; stop if there is none.
1459  if (++a == db.allterms_end()) break;
1460  const string & term2 = *a;
1462  Xapian::Query(term1), Xapian::Query(term2));
1463  tout << q.get_description() << '\n';
1464  enquire.set_query(q);
1465  Xapian::termcount sum = 0;
1466  Xapian::termcount sum_squares = 0;
1467  CheckStatsWeight wt(db, term1, term2, sum, sum_squares);
1468  enquire.set_weighting_scheme(wt);
1469  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1470 
1471  // The document order in the multi-db case isn't the same as the
1472  // postlist order on the combined DB, so it's hard to compare the
1473  // wdf for each document in the Weight objects, but we can sum
1474  // the wdfs and the squares of the wdfs which provides a decent
1475  // check that we're not getting the wrong wdf values (it ensures
1476  // they have the right mean and standard deviation).
1477  Xapian::termcount expected_sum = 0;
1478  Xapian::termcount expected_sum_squares = 0;
      // did1/did2 track the current document of each posting list.  Two
      // special values are used below: 0 means "entry consumed, advance
      // this list" and docid(-1) means "this list is exhausted".
1481  Xapian::docid did1 = *i, did2 = *j;
1482  while (true) {
1483  // To calculate expected_sum_squares correctly we need to square
1484  // the sum per document.
1485  Xapian::termcount wdf;
1486  if (did1 == did2) {
      // Both terms index this document: add their wdfs together.
1487  wdf = i.get_wdf() + j.get_wdf();
1488  did1 = did2 = 0;
1489  } else if (did1 < did2) {
1490  wdf = i.get_wdf();
1491  did1 = 0;
1492  } else {
1493  wdf = j.get_wdf();
1494  did2 = 0;
1495  }
1496  expected_sum += wdf;
1497  expected_sum_squares += wdf * wdf;
1498 
      // Advance whichever list(s) were consumed; finish once both lists
      // are exhausted.
1499  if (did1 == 0) {
1500  if (++i != db.postlist_end(term1)) {
1501  did1 = *i;
1502  } else {
1503  if (did2 == Xapian::docid(-1)) break;
1504  did1 = Xapian::docid(-1);
1505  }
1506  }
1507  if (did2 == 0) {
1508  if (++j != db.postlist_end(term2)) {
1509  did2 = *j;
1510  } else {
1511  if (did1 == Xapian::docid(-1)) break;
1512  did2 = Xapian::docid(-1);
1513  }
1514  }
1515  }
1516  // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1517  // the individual terms.
1518  TEST_EQUAL(sum, expected_sum);
1519  TEST_EQUAL(sum_squares, expected_sum_squares);
1520  }
1521 }
1522 
1524 // Test the case where we need to clamp wdf to <= doclen.
// The generated database holds one document whose term wdfs add up to 6
// ("book" 1, "radio" 4, "tv" 1), so twice the wdf of "radio" (8) is larger
// than the document length.
1525 DEFINE_TESTCASE(checkstatsweight6, backend && !remote) {
1526  Xapian::Database db = get_database("checkstatsweight6",
1527  [](Xapian::WritableDatabase& wdb,
1528  const string&) {
1529  Xapian::Document doc;
1530  doc.add_term("book");
1531  doc.add_term("radio", 4);
1532  doc.add_term("tv");
1533  wdb.add_document(doc);
1534  });
1535  Xapian::Enquire enquire(db);
1537  // Check the case where a term is repeated in the synonym.
1538  string term{"radio"};
1540  tout << q.get_description() << '\n';
1541  enquire.set_query(q);
1542  Xapian::termcount sum = 0;
1543  Xapian::termcount sum_squares = 0;
      // The same term is passed as both term1 and term2.
1544  CheckStatsWeight wt(db, term, term, sum, sum_squares);
1545  enquire.set_weighting_scheme(wt);
1546  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1547 
1548  // The document order in the multi-db case isn't the same as the
1549  // postlist order on the combined DB, so it's hard to compare the
1550  // wdf for each document in the Weight objects, but we can sum
1551  // the wdfs and the squares of the wdfs which provides a decent
1552  // check that we're not getting the wrong wdf values (it ensures
1553  // they have the right mean and standard deviation).
1554  Xapian::termcount expected_sum = 0;
1555  Xapian::termcount expected_sum_squares = 0;
1556  for (auto i = db.postlist_begin(term);
1557  i != db.postlist_end(term);
1558  ++i) {
      // Expected wdf: the term counted twice, but never more than the
      // document length.
1559  auto wdf = std::min(i.get_wdf() * 2, db.get_doclength(*i));
1560  expected_sum += wdf;
1561  expected_sum_squares += wdf * wdf;
1562  }
1563  TEST_EQUAL(sum, expected_sum);
1564  TEST_EQUAL(sum_squares, expected_sum_squares);
1565 }
1566 
1568 // Regression test for bug fixed in 1.4.1.
// For each prefix in testcases[], runs a query with CheckStatsWeight in its
// wildcard mode (term2 == "*") and compares the accumulated wdf sums with
// sums obtained by merging the posting lists of all terms starting with that
// prefix.
1569 DEFINE_TESTCASE(checkstatsweight3, backend && !remote) {
1570  // The most correct thing to do would be to collate termfreqs across shards
1571  // for this, but if that's too hard to do efficiently we could at least
1572  // scale up the termfreqs proportional to the size of the shard.
1573  XFAIL_FOR_BACKEND("multi", "OP_WILDCARD+OP_SYNONYM use shard termfreqs");
1574 
      // Orders posting iterators by the docid they currently point at; used
      // with the Heap helpers below.
1575  struct PlCmp {
1576  bool operator()(const Xapian::PostingIterator& a,
1577  const Xapian::PostingIterator& b) {
1578  return *a < *b;
1579  }
1580  };
1581 
1582  Xapian::Database db = get_database("apitest_simpledata");
1583  Xapian::Enquire enquire(db);
1585  static const char * const testcases[] = {
1586  "a", // a* matches all documents, but no term matches all.
1587  "pa", // Expands to only "paragraph", matching 5.
1588  "zulu", // No matches.
1589  "th", // Term "this" matches all documents.
1590  };
1591  for (auto pattern : testcases) {
      // Reset tout's contents before logging this pattern.
1593  tout.str(string{});
1594  tout << q.get_description() << '\n';
1595  enquire.set_query(q);
1596  Xapian::termcount sum = 0;
1597  Xapian::termcount sum_squares = 0;
1598  CheckStatsWeight wt(db, pattern, "*", sum, sum_squares);
1599  enquire.set_weighting_scheme(wt);
1600  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1601 
1602  // The document order in the multi-db case isn't the same as the
1603  // postlist order on the combined DB, so it's hard to compare the
1604  // wdf for each document in the Weight objects, but we can sum
1605  // the wdfs and the squares of the wdfs which provides a decent
1606  // check that we're not getting the wrong wdf values (it ensures
1607  // they have the right mean and standard deviation).
1608  Xapian::termcount expected_sum = 0;
1609  Xapian::termcount expected_sum_squares = 0;
      // One posting iterator per term starting with the pattern.
1610  vector<Xapian::PostingIterator> postlists;
1611  for (auto&& t = db.allterms_begin(pattern);
1612  t != db.allterms_end(pattern); ++t) {
1613  postlists.emplace_back(db.postlist_begin(*t));
1614  }
1615  Heap::make(postlists.begin(), postlists.end(), PlCmp());
      // did is the document currently being accumulated (0 before the first
      // one) and wdf the running total of wdfs seen for it.
1616  Xapian::docid did = 0;
1617  Xapian::termcount wdf = 0;
1618  while (!postlists.empty()) {
1619  Xapian::docid did_new = *postlists.front();
1620  Xapian::termcount wdf_new = postlists.front().get_wdf();
      // Advance the front iterator: drop it once it reaches the end,
      // otherwise restore the heap ordering.
1621  if (++(postlists.front()) == Xapian::PostingIterator()) {
1622  Heap::pop(postlists.begin(), postlists.end(), PlCmp());
1623  postlists.pop_back();
1624  } else {
1625  Heap::replace(postlists.begin(), postlists.end(), PlCmp());
1626  }
      // A new docid means the previous document is complete, so add its
      // total to the expected sums and start again from zero.
1627  if (did_new != did) {
1628  expected_sum += wdf;
1629  expected_sum_squares += wdf * wdf;
1630  wdf = 0;
1631  did = did_new;
1632  }
1633  wdf += wdf_new;
1634  }
      // Account for the last document (adds 0 if there were no postings).
1635  expected_sum += wdf;
1636  expected_sum_squares += wdf * wdf;
1637  // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1638  // the individual terms.
1639  TEST_EQUAL(sum, expected_sum);
      // Note that sum_squares is only checked as a lower bound here.
1640  TEST_REL(sum_squares, >=, expected_sum_squares);
1641  }
1642 }
1643 
1645 // Regression test for bug fixed in 1.4.6. Doesn't work with
1646 // multi as the weight object is cloned more times.
// Each query ORs the same term with itself (at positions 1 and 2), and
// CheckStatsWeight is put into its repeated-term mode by passing "=" as term2.
1647 DEFINE_TESTCASE(checkstatsweight4, backend && !remote && !multi) {
1648  Xapian::Database db = get_database("apitest_simpledata");
1649  Xapian::Enquire enquire(db);
1651  for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1652  const string & term = *a;
1653  enquire.set_query(Xapian::Query(term, 1, 1) |
1654  Xapian::Query(term, 1, 2));
1655  Xapian::termcount sum = 0;
1656  Xapian::termcount sum_squares = 0;
1657  CheckStatsWeight wt(db, term, "=", sum, sum_squares);
1658  enquire.set_weighting_scheme(wt);
1659  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1660 
1661  // The document order in the multi-db case isn't the same as the
1662  // postlist order on the combined DB, so it's hard to compare the
1663  // wdf for each document in the Weight objects, but we can sum
1664  // the wdfs and the squares of the wdfs which provides a decent
1665  // check that we're not getting the wrong wdf values (it ensures
1666  // they have the right mean and standard deviation).
1667  Xapian::termcount expected_sum = 0;
1668  Xapian::termcount expected_sum_squares = 0;
      // The expected values count each posting of the term once, even
      // though the term appears twice in the query.
1670  for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1671  Xapian::termcount wdf = i.get_wdf();
1672  expected_sum += wdf;
1673  expected_sum_squares += wdf * wdf;
1674  }
1675  TEST_EQUAL(sum, expected_sum);
1676  TEST_EQUAL(sum_squares, expected_sum_squares);
1677  }
1678 }
1679 
// Weight subclass which checks the wdfdocmax value passed to get_sumpart()
// against the document length read from the database.
1681  public:
      // Docid get_sumpart() expects to be called for; incremented on every
      // call, so it is mutable.
1682  mutable Xapian::docid did = 0;
1683 
      // Last factor passed to init(); -1.0 until init() has been called.
1684  double factor;
1685 
1687 
1689 
      // stat_code selects one extra statistic to request: 'w' for WDF, 'd'
      // for DOC_LENGTH, anything else for neither.  WDF_DOC_MAX is always
      // requested.
1690  explicit
1691  CheckStatsWeight5(const Xapian::Database& db_, char stat_code_ = '\0')
1692  : factor(-1.0), db(db_), stat_code(stat_code_)
1693  {
1694  switch (stat_code) {
1695  case 'w':
1696  need_stat(WDF);
1697  break;
1698  case 'd':
1699  need_stat(DOC_LENGTH);
1700  break;
1701  }
1702  need_stat(WDF_DOC_MAX);
1703  }
1704 
1705  void init(double factor_) override {
1706  factor = factor_;
1707  }
1708 
      // The clone starts again with did == 0.
1709  Weight* clone() const override {
1710  return new CheckStatsWeight5(db, stat_code);
1711  }
1712 
1716  Xapian::termcount wdfdocmax) const override {
1717  // The query is a synonym of all terms, so should match all documents.
      // NOTE(review): this relies on get_sumpart() being called once per
      // document in ascending docid order starting from 1 - confirm this is
      // why the testcase excludes multi and remote.
1718  ++did;
1719  TEST_REL(wdfdocmax,==,db.get_doclength(did));
1720  return 1.0 / wdfdocmax;
1721  }
1722 
1723  double get_maxpart() const override {
1724  return 1.0;
1725  }
1726 };
1727 
// Run a query built from every term in the database with CheckStatsWeight5
// under different stat_code settings.  The checks themselves are made inside
// CheckStatsWeight5::get_sumpart(); the msets are not inspected here.
1729 DEFINE_TESTCASE(checkstatsweight5, backend && !multi && !remote) {
1730  Xapian::Database db = get_database("apitest_simpledata");
1731  Xapian::Enquire enquire(db);
1733  db.allterms_begin(),
1734  db.allterms_end()};
1735  enquire.set_query(q);
      // NOTE(review): the weighting scheme used for mset1 is set on a line
      // not shown here - presumably CheckStatsWeight5 with the default
      // stat_code; confirm.
1737  Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
      // Repeat with WDF also requested...
1738  enquire.set_weighting_scheme(CheckStatsWeight5(db, 'w'));
1739  Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
      // ...and with DOC_LENGTH also requested.
1740  enquire.set_weighting_scheme(CheckStatsWeight5(db, 'd'));
1741  Xapian::MSet mset3 = enquire.get_mset(0, db.get_doccount());
1742 }
1743 
1744 // Feature test for Dir+ weighting.
// Runs the same query through two Enquire objects with different weighting
// schemes and compares the top five weights pairwise.
1745 DEFINE_TESTCASE(lmdirichletweight1, backend) {
1746  Xapian::Database db = get_database("apitest_simpledata");
1747  Xapian::Enquire enquire1(db);
1748  Xapian::Enquire enquire2(db);
1749  enquire1.set_query(Xapian::Query("paragraph"));
1750  enquire2.set_query(Xapian::Query("paragraph"));
1751  Xapian::MSet mset1;
1752  Xapian::MSet mset2;
1753 
      // NOTE(review): the scheme for enquire1 is set on a line not shown
      // here; going by the comment further down, enquire1 is the plain
      // Dirichlet case and enquire2 the Dir+ case - confirm.
1755  enquire2.set_weighting_scheme(Xapian::LMDirichletWeight(2000, 0.05));
1756 
1757  mset1 = enquire1.get_mset(0, 10);
1758  mset2 = enquire2.get_mset(0, 10);
1759 
1760  // mset size should be 5
1761  TEST_EQUAL(mset1.size(), 5);
1762  TEST_EQUAL(mset2.size(), 5);
1763 
1764  // Expect mset weights from Dir+ to be less than mset weights from
1765  // Dirichlet for this testcase.
1766  TEST_REL(mset2[0].get_weight(),<,mset1[0].get_weight());
1767  TEST_REL(mset2[1].get_weight(),<,mset1[1].get_weight());
1768  TEST_REL(mset2[2].get_weight(),<,mset1[2].get_weight());
1769  TEST_REL(mset2[3].get_weight(),<,mset1[3].get_weight());
1770  TEST_REL(mset2[4].get_weight(),<,mset1[4].get_weight());
1771 }
1772 
1773 // Feature test for CoordWeight.
// Runs an OP_OR query over four terms and checks that each result's weight
// equals the number of query terms matching that document.
1774 DEFINE_TESTCASE(coordweight1, backend) {
1775  Xapian::Enquire enquire(get_database("apitest_simpledata"));
1777  static const char * const terms[] = {
1778  "this", "line", "paragraph", "rubbish"
1779  };
1780  Xapian::Query query(Xapian::Query::OP_OR, terms, std::end(terms));
1781  enquire.set_query(query);
1782  Xapian::MSet mymset1 = enquire.get_mset(0, 100);
1783  // CoordWeight scores 1 for each matching term, so the weight should equal
1784  // the number of matching terms.
1785  for (Xapian::MSetIterator i = mymset1.begin(); i != mymset1.end(); ++i) {
1786  Xapian::termcount matching_terms = 0;
      // Count the matching terms for this result by walking the iterator
      // to its end.
1788  while (t != enquire.get_matching_terms_end(i)) {
1789  ++matching_terms;
1790  ++t;
1791  }
1792  TEST_EQUAL(i.get_weight(), matching_terms);
1793  }
1794 }
1795 
1796 // Feature test.
// Runs an OP_OR query for "one" and "three" against "apitest_simpledata3"
// and compares the four resulting weights with precomputed values.
1797 DEFINE_TESTCASE(diceweight2, backend) {
1798  Xapian::Database db = get_database("apitest_simpledata3");
1799  Xapian::Enquire enquire(db);
1800  static const char * const terms[] = {
1801  "one", "three"
1802  };
1803  Xapian::Query query(Xapian::Query::OP_OR, terms, std::end(terms));
1804  enquire.set_query(query);
      // NOTE(review): the weighting scheme is set on a line not shown here -
      // presumably Xapian::DiceWeight given the testcase name; confirm.
1806 
1807  Xapian::MSet mset1;
1808  mset1 = enquire.get_mset(0, 10);
1809  TEST_EQUAL(mset1.size(), 4);
1810 
1811  /* The weight value has been manually calculated by using the statistics
1812  * of the test database. */
1813  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.571428571428571);
1814  TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.5);
1815  TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.2);
1816  TEST_EQUAL_DOUBLE(mset1[3].get_weight(), 0.181818181818182);
1817 }
1818 
1819 // Test handling of a term with zero wdf.
// The generated database holds a single document with three weighted terms
// and two boolean terms; the query is built from the two boolean terms.
1820 DEFINE_TESTCASE(diceweight3, backend) {
1821  Xapian::Database db = get_database("diceweight3",
1822  [](Xapian::WritableDatabase& wdb,
1823  const string&) {
1824  Xapian::Document doc;
1825  doc.add_term("radio", 2);
1826  doc.add_term("seahorse");
1827  doc.add_term("zebra");
1828  doc.add_boolean_term("false");
1829  doc.add_boolean_term("true");
1830  wdb.add_document(doc);
1831  });
1832  Xapian::Enquire enquire(db);
1833  enquire.set_weighting_scheme(Xapian::DiceWeight());
1834 
1835  // OP_SYNONYM gives wdf zero if need_stat(WDF) isn't specified (and
1836  // it isn't by DiceWeight).
1838  Xapian::Query("false"), Xapian::Query("true"));
1840  q, 6.0), 2);
1841  Xapian::MSet mset = enquire.get_mset(0, 10);
1842  TEST_EQUAL(mset.size(), 1);
1843 
1844  // factor * 2.0 * wqf / (query_length + unique_term_count)
1845  // = 6.0 * 2.0 * 1 / (2 + 4) = 2.0
1846  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 2.0);
1847 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
static const testcase testcases[]
Definition: api_unicode.cc:40
static void test_weight_class_no_params(const char *classname, const char *name)
Definition: api_weight.cc:40
#define TEST_WEIGHT_CLASS_NO_PARAMS(W, N)
Definition: api_weight.cc:66
#define TEST_WEIGHTING_SCHEME(W,...)
DEFINE_TESTCASE(weightserialisation1, !backend)
Test serialisation and introspection of built-in weighting schemes.
Definition: api_weight.cc:120
static void test_weight_class(const char *classname, const char *name, const W &obj_default, const W &obj_other)
Definition: api_weight.cc:70
static void gen_wdf_eq_doclen_db(Xapian::WritableDatabase &db, const string &)
Definition: api_weight.cc:535
#define TEST_WEIGHT_CLASS(W, N, DEFAULT, OTHER)
Definition: api_weight.cc:116
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:47
void XFAIL_FOR_BACKEND(const std::string &backend_prefix, const char *msg)
Definition: apitest.cc:154
test functionality of the Xapian API
char name[9]
Definition: dbcheck.cc:57
Weight * clone() const override
Clone this object.
Definition: api_weight.cc:1182
void init(double factor_) override
Allow the subclass to perform any initialisation it needs to.
Definition: api_weight.cc:1174
CheckInitWeight(unsigned &z, unsigned &n)
Definition: api_weight.cc:1169
double get_maxextra() const override
Return an upper bound on what get_sumextra() can return for any document.
Definition: api_weight.cc:1199
unsigned & non_zero_inits
Definition: api_weight.cc:1167
double get_sumextra(Xapian::termcount doclen, Xapian::termcount, Xapian::termcount) const override
Calculate the term-independent weight component for a document.
Definition: api_weight.cc:1193
double get_sumpart(Xapian::termcount, Xapian::termcount, Xapian::termcount, Xapian::termcount) const override
Calculate the weight contribution for this object's term to a document.
Definition: api_weight.cc:1186
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
Definition: api_weight.cc:1191
Xapian::Database db
Definition: api_weight.cc:1686
double get_sumpart(Xapian::termcount, Xapian::termcount, Xapian::termcount, Xapian::termcount wdfdocmax) const override
Calculate the weight contribution for this object's term to a document.
Definition: api_weight.cc:1713
CheckStatsWeight5(const Xapian::Database &db_, char stat_code_='\0')
Definition: api_weight.cc:1691
void init(double factor_) override
Allow the subclass to perform any initialisation it needs to.
Definition: api_weight.cc:1705
Weight * clone() const override
Clone this object.
Definition: api_weight.cc:1709
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
Definition: api_weight.cc:1723
CheckStatsWeight(const Xapian::Database &db_, const string &term_, Xapian::termcount &sum_, Xapian::termcount &sum_squares_)
Definition: api_weight.cc:1272
double get_maxextra() const override
Return an upper bound on what get_sumextra() can return for any document.
Definition: api_weight.cc:1415
Weight * clone() const override
Clone this object.
Definition: api_weight.cc:1282
void init(double factor_) override
Allow the subclass to perform any initialisation it needs to.
Definition: api_weight.cc:1278
double get_sumextra(Xapian::termcount doclen, Xapian::termcount, Xapian::termcount) const override
Calculate the term-independent weight component for a document.
Definition: api_weight.cc:1409
Xapian::termcount & sum
Definition: api_weight.cc:1231
Xapian::Database db
Definition: api_weight.cc:1221
CheckStatsWeight(const Xapian::Database &db_, const string &term1_, const string &term2_, Xapian::termcount &sum_, Xapian::termcount &sum_squares_)
Definition: api_weight.cc:1240
Xapian::termcount & sum_squares
Definition: api_weight.cc:1232
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
Definition: api_weight.cc:1398
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqueterms, Xapian::termcount wdfdocmax) const override
Calculate the weight contribution for this object's term to a document.
Definition: api_weight.cc:1297
This class implements the BB2 weighting scheme.
Definition: weight.h:1540
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:1161
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:1050
Class implementing a "boolean" weighting scheme.
Definition: weight.h:678
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:2163
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence from Randomness framework.
Definition: weight.h:1615
This class implements the DPH weighting scheme.
Definition: weight.h:1826
An indexed database of documents.
Definition: database.h:75
Xapian::doccount get_termfreq(std::string_view term) const
Get the number of documents indexed by a specified term.
Definition: database.cc:262
Xapian::termcount get_unique_terms_lower_bound() const
Get a lower bound on the unique terms size of a document in this DB.
Definition: database.cc:323
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
Definition: database.cc:256
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: database.cc:302
PostingIterator postlist_begin(std::string_view term) const
Start iterating the postings of a term.
Definition: database.cc:192
double get_avlength() const
Old name for get_average_length() for backward compatibility.
Definition: database.h:322
size_t size() const
Return number of shards in this Database object.
Definition: database.cc:105
Xapian::termcount get_doclength(Xapian::docid did) const
Get the length of a specified document.
Definition: database.cc:341
Xapian::termcount get_unique_terms_upper_bound() const
Get an upper bound on the unique terms size of a document in this DB.
Definition: database.cc:329
TermIterator allterms_end(std::string_view={}) const noexcept
End iterator corresponding to allterms_begin(prefix).
Definition: database.h:307
Xapian::termcount get_collection_freq(std::string_view term) const
Get the total number of occurrences of a specified term.
Definition: database.cc:273
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: database.cc:233
PostingIterator postlist_end(std::string_view) const noexcept
End iterator corresponding to postlist_begin().
Definition: database.h:258
TermIterator allterms_begin(std::string_view prefix={}) const
Start iterating all terms in the database with a given prefix.
Definition: database.cc:209
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: database.cc:308
Xapian::Weight subclass implementing Dice Coefficient.
Definition: weight.h:2207
Class representing a document.
Definition: document.h:64
void add_boolean_term(std::string_view term)
Add a boolean filter term to the document.
Definition: document.h:145
void add_term(std::string_view term, Xapian::termcount wdf_inc=1)
Add a term to this document.
Definition: document.cc:87
Querying session.
Definition: enquire.h:57
void set_weighting_scheme(const Weight &weight)
Set the weighting scheme to use.
Definition: enquire.cc:85
MSet get_mset(doccount first, doccount maxitems, doccount checkatleast=0, const RSet *rset=NULL, const MatchDecider *mdecider=NULL) const
Run the query.
Definition: enquire.cc:200
TermIterator get_matching_terms_begin(docid did) const
Iterate query terms matching a document.
Definition: enquire.cc:210
void set_query(const Query &query, termcount query_length=0)
Set the query.
Definition: enquire.cc:72
TermIterator get_matching_terms_end(docid) const noexcept
End iterator corresponding to get_matching_terms_begin().
Definition: enquire.h:435
const std::string & get_msg() const noexcept
Message giving details of the error, intended for human consumption.
Definition: error.h:111
This class implements the IfB2 weighting scheme.
Definition: weight.h:1397
This class implements the InL2 weighting scheme.
Definition: weight.h:1327
This class implements the IneB2 weighting scheme.
Definition: weight.h:1469
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Language Model weighting with Two Stage smoothing.
Definition: weight.h:2093
Language Model weighting with Absolute Discount smoothing.
Definition: weight.h:2024
Language Model weighting with Dirichlet or Dir+ smoothing.
Definition: weight.h:1948
Language Model weighting with Jelinek-Mercer smoothing.
Definition: weight.h:1875
Iterator over a Xapian::MSet.
Definition: mset.h:535
double get_weight() const
Get the weight for the current position.
Definition: msetiterator.cc:55
Class representing a list of search results.
Definition: mset.h:46
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: mset.cc:374
MSetIterator back() const
Return iterator pointing to the last object in this MSet.
Definition: mset.h:803
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:786
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:791
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1731
This class implements the PL2 weighting scheme.
Definition: weight.h:1671
Class for iterating over a list of terms.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Class representing a query.
Definition: query.h:45
std::string get_description() const
Return a string describing this object.
Definition: query.cc:307
@ OP_SCALE_WEIGHT
Scale the weight contributed by a subquery.
Definition: query.h:166
@ OP_WILDCARD
Wildcard expansion.
Definition: query.h:255
@ OP_AND
Match only documents which all subqueries match.
Definition: query.h:84
@ OP_OR
Match documents which at least one subquery matches.
Definition: query.h:92
@ OP_SYNONYM
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
Indicates an error in the std::string serialisation of an object.
Definition: error.h:917
Class for iterating over a list of terms.
Definition: termiterator.h:41
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:704
@ GLOBAL_FREQ
Global frequency IDF.
@ LOG_GLOBAL_FREQ
Log global frequency IDF.
@ SQRT_GLOBAL_FREQ
Square root global frequency IDF.
@ INCREMENTED_GLOBAL_FREQ
Incremented global frequency IDF.
@ AUG
Augmented max wdf.
@ AUG_AVERAGE
Augmented average term frequency.
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:1297
Abstract base class for weighting schemes.
Definition: weight.h:38
static const Weight * create(const std::string &scheme, const Registry &reg=Registry())
Return the appropriate weighting scheme object.
Definition: weight.cc:225
This class provides read/write access to a database.
Definition: database.h:964
Xapian::docid add_document(const Xapian::Document &doc)
Add a document to the database.
Definition: database.cc:561
string term
C++ STL heap implementation with extensions.
void pop(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:213
void replace(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:230
void make(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:259
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
#define CONST_STRLEN(S)
Returns the length of a string constant.
Definition: stringutils.h:48
Definition: header.h:215
#define TEST_REL(A, REL, B)
Test that a relation holds, e.g. TEST_REL(a,>,b);
Definition: testmacros.h:35
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
#define FAIL_TEST(MSG)
Fail the current testcase with message MSG.
Definition: testsuite.h:65
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:276
#define TEST_NOT_EQUAL_DOUBLE(a, b)
Test two doubles for non-near-equality.
Definition: testsuite.h:298
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:293
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:273
#define TEST_NOT_EQUAL(a, b)
Test for non-equality of two things.
Definition: testsuite.h:303
void mset_expect_order(const Xapian::MSet &A, Xapian::docid d1, Xapian::docid d2, Xapian::docid d3, Xapian::docid d4, Xapian::docid d5, Xapian::docid d6, Xapian::docid d7, Xapian::docid d8, Xapian::docid d9, Xapian::docid d10, Xapian::docid d11, Xapian::docid d12)
Definition: testutils.cc:224
Xapian-specific test helper functions and macros.
#define TEST_EXCEPTION(TYPE, CODE)
Check that CODE throws exactly Xapian exception TYPE.
Definition: testutils.h:112
Public interfaces for the Xapian library.