xapian-core  1.4.26
api_weight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2024 Olly Betts
5  * Copyright (C) 2013 Aarsh Shah
6  * Copyright (C) 2016 Vivek Pal
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "api_weight.h"
26 #include <cmath>
27 #include <memory>
28 
29 #include <xapian.h>
30 
31 #include "apitest.h"
32 #include "testutils.h"
33 
34 using namespace std;
35 
36 template<class W>
37 static inline void
39 {
40  tout << name << '\n';
41  W obj;
42  // Check name() returns the class name.
43  TEST_EQUAL(obj.name(), name);
44  // If there are no parameters, there's nothing to serialise.
45  string obj_serialised = obj.serialise();
46  TEST_EQUAL(obj_serialised.size(), 0);
47  // Check serialising and unserialising gives object with same serialisation.
48  unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
49  TEST_EQUAL(obj_serialised, wt->serialise());
50  // Check that unserialise() throws suitable error for bad serialisation.
51  // The easy case to test is extra junk after the serialised weight.
52  try {
53  unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
54  FAIL_TEST(name << " did not throw for unserialise with junk appended");
55  } catch (const Xapian::SerialisationError& e) {
56  // Check the exception message contains the weighting scheme name
57  // (regression test for TradWeight's exception saying "BM25").
58  string target = name + CONST_STRLEN("Xapian::");
59  TEST(e.get_msg().find(target) != string::npos);
60  }
61 }
62 
63 #define TEST_WEIGHT_CLASS_NO_PARAMS(W) test_weight_class_no_params<W>(#W)
64 
65 template<class W>
66 static inline void
67 test_weight_class(const char* name, const W& obj_default, const W& obj_other)
68 {
69  tout << name << '\n';
70  W obj;
71  // Check name() returns the class name.
72  TEST_EQUAL(obj.name(), name);
73  TEST_EQUAL(obj_default.name(), name);
74  TEST_EQUAL(obj_other.name(), name);
75  // Check serialisation matches that of object constructed with explicit
76  // parameter values of what the defaults are meant to be.
77  string obj_serialised = obj.serialise();
78  TEST_EQUAL(obj_serialised, obj_default.serialise());
79  // Check serialisation is different to object with different parameters.
80  string obj_other_serialised = obj_other.serialise();
81  TEST_NOT_EQUAL(obj_serialised, obj_other_serialised);
82  // Check serialising and unserialising gives object with same serialisation.
83  unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
84  TEST_EQUAL(obj_serialised, wt->serialise());
85  // Check serialising and unserialising of object with different parameters.
86  unique_ptr<Xapian::Weight> wt2(W().unserialise(obj_other_serialised));
87  TEST_EQUAL(obj_other_serialised, wt2->serialise());
88  // Check that unserialise() throws suitable error for bad serialisation.
89  // The easy case to test is extra junk after the serialised weight.
90  try {
91  unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
92  FAIL_TEST(name << " did not throw for unserialise with junk appended");
93  } catch (const Xapian::SerialisationError& e) {
94  // Check the exception message contains the weighting scheme name
95  // (regression test for TradWeight's exception saying "BM25").
96  string target = name + CONST_STRLEN("Xapian::");
97  TEST(e.get_msg().find(target) != string::npos);
98  }
99 }
100 
// W should be the class name.
102 //
103 // DEFAULT should be a parenthesised parameter list to explicitly construct
104 // an object of class W with the documented default parameters.
105 //
106 // OTHER should be a parenthesised parameter list to construct an object with
107 // non-default parameters.
108 #define TEST_WEIGHT_CLASS(W, DEFAULT, OTHER) \
109  test_weight_class<W>(#W, W DEFAULT, W OTHER)
110 
112 DEFINE_TESTCASE(weightserialisation1, !backend) {
113  // Parameter-free weighting schemes.
118 
119  // Parameterised weighting schemes.
120  TEST_WEIGHT_CLASS(Xapian::TradWeight, (1.0), (2.0));
122  (1, 0, 1, 0.5, 0.5),
123  (1, 0.5, 1, 0.5, 0.5));
125  (1, 0, 1, 0.5, 0.5, 1.0),
126  (1, 0, 1, 0.5, 0.5, 2.0));
127  TEST_WEIGHT_CLASS(Xapian::TfIdfWeight, ("ntn"), ("bpn"));
128  TEST_WEIGHT_CLASS(Xapian::InL2Weight, (1.0), (2.0));
129  TEST_WEIGHT_CLASS(Xapian::IfB2Weight, (1.0), (2.0));
130  TEST_WEIGHT_CLASS(Xapian::IneB2Weight, (1.0), (2.0));
131  TEST_WEIGHT_CLASS(Xapian::BB2Weight, (1.0), (2.0));
132  TEST_WEIGHT_CLASS(Xapian::PL2Weight, (1.0), (2.0));
134  (1.0, 0.8),
135  (2.0, 0.9));
137  (0.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0.7, 2000.0),
139 }
140 
142 DEFINE_TESTCASE(weight1, backend) {
143  Xapian::Database db(get_database("etext"));
144  Xapian::Enquire enquire(db);
145  Xapian::Enquire enquire_scaled(db);
146  auto term = "robinson";
147  Xapian::Query q{term};
148  enquire.set_query(q);
149  enquire_scaled.set_query(q * 15.0);
150  auto expected_matches = db.get_termfreq(term);
151  auto helper = [&](const Xapian::Weight& weight,
152  const string& name,
153  const string& params) {
154  tout << name << '(' << params << ")\n";
155  enquire.set_weighting_scheme(weight);
156  enquire_scaled.set_weighting_scheme(weight);
157  Xapian::MSet mset = enquire.get_mset(0, expected_matches + 1);
158  TEST_EQUAL(mset.size(), expected_matches);
159  if (name == "Xapian::BoolWeight") {
160  /* All weights should be zero. */
161  TEST_EQUAL(mset[0].get_weight(), 0.0);
162  TEST_EQUAL(mset.back().get_weight(), 0.0);
163  } else if (name == "Xapian::CoordWeight") {
164  /* All weights should be 1 for a single term query. */
165  TEST_EQUAL(mset[0].get_weight(), 1.0);
166  TEST_EQUAL(mset.back().get_weight(), 1.0);
167  } else if (!params.empty()) {
168  /* All weights should be equal with these particular parameters. */
169  TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
170  TEST_EQUAL(mset[0].get_weight(), mset.back().get_weight());
171  } else {
172  TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
173  TEST_NOT_EQUAL(mset[0].get_weight(), mset.back().get_weight());
174  }
175  Xapian::MSet mset_scaled = enquire_scaled.get_mset(0, expected_matches);
176  TEST_EQUAL(mset_scaled.size(), expected_matches);
177  for (Xapian::doccount i = 0; i < expected_matches; ++i) {
178  TEST_EQUAL_DOUBLE(mset_scaled[i].get_weight(),
179  mset[i].get_weight() * 15.0);
180  }
181  };
182 
183  // MSVC gives nothing for #__VA_ARGS__ when there are no varargs.
184 #define TEST_WEIGHTING_SCHEME(W, ...) \
185  helper(W(__VA_ARGS__), #W, "" #__VA_ARGS__)
186 
202  // Regression test for bug fixed in 1.2.4.
204  /* As mentioned in the documentation, when parameter k is 0, wdf and
205  * document length don't affect the weights. Regression test for bug fixed
206  * in 1.2.4.
207  */
209 #undef TEST_WEIGHTING_SCHEME
210 }
211 
216 DEFINE_TESTCASE(bm25weight1, backend) {
217  Xapian::Enquire enquire(get_database("apitest_simpledata"));
218  enquire.set_weighting_scheme(Xapian::BM25Weight(1, 25, 1, 0.01, 0.5));
219  enquire.set_query(Xapian::Query("word"));
220 
221  Xapian::MSet mset = enquire.get_mset(0, 25);
222 }
223 
224 // Test parameter combinations which should be unaffected by doclength.
225 DEFINE_TESTCASE(bm25weight4, backend) {
226  Xapian::Database db = get_database("apitest_simpledata");
227  Xapian::Enquire enquire(db);
228  enquire.set_query(Xapian::Query("paragraph"));
229  Xapian::MSet mset;
230 
231  enquire.set_weighting_scheme(Xapian::BM25Weight(1, 0, 1, 0, 0.5));
232  mset = enquire.get_mset(0, 10);
233  TEST_EQUAL(mset.size(), 5);
234  // Expect: wdf has an effect on weight, but doclen doesn't.
235  TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
236  TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
237  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
238  TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
239 
240  enquire.set_weighting_scheme(Xapian::BM25Weight(0, 0, 1, 1, 0.5));
241  mset = enquire.get_mset(0, 10);
242  TEST_EQUAL(mset.size(), 5);
243  // Expect: neither wdf nor doclen affects weight.
244  TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
245 }
246 
248 // Regression test for bug fixed in 1.2.17 and 1.3.2.
249 DEFINE_TESTCASE(bm25weight5, backend) {
250  Xapian::Database db = get_database("apitest_simpledata");
251  Xapian::Enquire enquire(db);
252  enquire.set_query(Xapian::Query("paragraph"));
253  Xapian::MSet mset;
254 
255  enquire.set_weighting_scheme(Xapian::BM25Weight(0, 1, 1, 0.5, 0.5));
256  mset = enquire.get_mset(0, 10);
257  TEST_EQUAL(mset.size(), 5);
258  // Expect: wdf has no effect on weight; shorter docs rank higher.
259  mset_expect_order(mset, 3, 5, 1, 4, 2);
260  TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[1].get_weight());
261  TEST_REL(mset[1].get_weight(),>,mset[2].get_weight());
262  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
263  TEST_REL(mset[3].get_weight(),>,mset[4].get_weight());
264 }
265 
266 // Test parameter combinations which should be unaffected by doclength.
267 DEFINE_TESTCASE(bm25plusweight2, backend) {
268  Xapian::Database db = get_database("apitest_simpledata");
269  Xapian::Enquire enquire(db);
270  enquire.set_query(Xapian::Query("paragraph"));
271  Xapian::MSet mset;
272 
273  enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0, 0.5, 1));
274  mset = enquire.get_mset(0, 10);
275  TEST_EQUAL(mset.size(), 5);
276  // Expect: wdf has an effect on weight, but doclen doesn't.
277  TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
278  TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
279  TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
280  TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
281 
282  enquire.set_weighting_scheme(Xapian::BM25PlusWeight(0, 0, 1, 1, 0.5, 1));
283  mset = enquire.get_mset(0, 10);
284  TEST_EQUAL(mset.size(), 5);
285  // Expect: neither wdf nor doclen affects weight.
286  TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
287 }
288 
289 // Regression test for a mistake corrected in the BM25+ implementation.
290 DEFINE_TESTCASE(bm25plusweight3, backend) {
291  Xapian::Database db = get_database("apitest_simpledata");
292  Xapian::Enquire enquire(db);
293  enquire.set_query(Xapian::Query("paragraph"));
294  Xapian::MSet mset;
295 
296  enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0.5, 0.5, 1));
297  mset = enquire.get_mset(0, 10);
298  TEST_EQUAL(mset.size(), 5);
299 
300  // The value of each doc weight calculated manually from the BM25+ formulae
301  // by using the respective document statistics.
302  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.7920796567487473);
303  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.7846980783848447);
304  TEST_EQUAL_DOUBLE(mset[2].get_weight(), 0.7558817623365934);
305  TEST_EQUAL_DOUBLE(mset[3].get_weight(), 0.7210119356168847);
306  TEST_EQUAL_DOUBLE(mset[4].get_weight(), 0.7210119356168847);
307 }
308 
309 
310 // Test for invalid values of c.
311 DEFINE_TESTCASE(inl2weight2, !backend) {
312  // InvalidArgumentError should be thrown if the parameter c is invalid.
314  Xapian::InL2Weight wt(-2.0));
315 
317  Xapian::InL2Weight wt2(0.0));
318 }
319 
320 // Feature tests for Inl2Weight
321 DEFINE_TESTCASE(inl2weight3, backend) {
322  Xapian::Database db = get_database("apitest_simpledata");
323  Xapian::Enquire enquire(db);
324  Xapian::Query query("banana");
325 
326  enquire.set_query(query);
328 
329  Xapian::MSet mset1;
330  mset1 = enquire.get_mset(0, 10);
331  TEST_EQUAL(mset1.size(), 1);
332  mset_expect_order(mset1, 6);
333 
334  /* The value has been calculated in the python interpreter by looking at the
335  * database statistics. */
336  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.559711143842063);
337 }
338 
339 // Test for invalid values of c.
340 DEFINE_TESTCASE(ifb2weight2, !backend) {
341  // InvalidArgumentError should be thrown if the parameter c is invalid.
343  Xapian::IfB2Weight wt(-2.0));
344 
346  Xapian::IfB2Weight wt2(0.0));
347 }
348 
349 // Feature test
350 DEFINE_TESTCASE(ifb2weight3, backend) {
351  Xapian::Database db = get_database("apitest_simpledata");
352  Xapian::Enquire enquire(db);
353  Xapian::Query query("banana");
354 
355  enquire.set_query(query);
357 
358  Xapian::MSet mset1;
359  mset1 = enquire.get_mset(0, 10);
360  TEST_EQUAL(mset1.size(), 1);
361 
362  /* The value of the weight has been manually calculated using the statistics
363  * of the test database. */
364  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.119422287684126);
365 }
366 
367 // Test for invalid values of c.
368 DEFINE_TESTCASE(ineb2weight2, !backend) {
369  // InvalidArgumentError should be thrown if parameter c is invalid.
371  Xapian::IneB2Weight wt(-2.0));
372 
374  Xapian::IneB2Weight wt2(0.0));
375 }
376 
377 // Feature test.
378 DEFINE_TESTCASE(ineb2weight3, backend) {
379  Xapian::Database db = get_database("apitest_simpledata");
380  Xapian::Enquire enquire(db);
381  Xapian::Query query("paragraph");
382  enquire.set_query(query);
384 
385  Xapian::MSet mset1;
386  mset1 = enquire.get_mset(0, 10);
387  TEST_EQUAL(mset1.size(), 5);
388 
389  // The third document in the database is 4th in the ranking.
390  /* The weight value has been manually calculated by using the statistics
391  * of the test database. */
392  TEST_EQUAL_DOUBLE(mset1[4].get_weight(), 0.61709730297692400036);
393 }
394 
395 // Test for invalid values of c.
396 DEFINE_TESTCASE(bb2weight2, !backend) {
397  // InvalidArgumentError should be thrown if the parameter c is invalid.
399  Xapian::BB2Weight wt(-2.0));
400 
402  Xapian::BB2Weight wt2(0.0));
403 }
404 
405 // Feature test
406 DEFINE_TESTCASE(bb2weight3, backend) {
407  Xapian::Database db = get_database("apitest_simpledata");
408  Xapian::Enquire enquire(db);
409  Xapian::Query query("paragraph");
410 
411  enquire.set_query(query);
413 
414  Xapian::MSet mset1;
415  mset1 = enquire.get_mset(0, 10);
416  TEST_EQUAL(mset1.size(), 5);
417  /* The third document in the database has the highest weight and is the
418  * first in the mset. */
419  // Value calculated manually by using the statistics of the test database.
420  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.6823696969784483);
421 
422  // Test with OP_SCALE_WEIGHT and a small factor (regression test, as we
423  // were applying the factor to the upper bound twice).
424  enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 1.0 / 1024));
426 
427  Xapian::MSet mset3;
428  mset3 = enquire.get_mset(0, 10);
429  TEST_EQUAL(mset3.size(), 5);
430 
431  for (int i = 0; i < 5; ++i) {
432  TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight() * 1024);
433  }
434 }
435 
436 // Regression test: we used to calculate log2(0) when there was only one doc.
437 DEFINE_TESTCASE(bb2weight4, backend) {
438  Xapian::Database db = get_database("apitest_onedoc");
439  Xapian::Enquire enquire(db);
440  Xapian::Query query("word");
441 
442  enquire.set_query(query);
444 
445  Xapian::MSet mset1;
446  mset1 = enquire.get_mset(0, 10);
447  TEST_EQUAL(mset1.size(), 1);
448  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.431020621347435);
449 }
450 
451 // Feature test.
452 DEFINE_TESTCASE(dlhweight1, backend) {
453  Xapian::Database db = get_database("apitest_simpledata");
454  Xapian::Enquire enquire(db);
455  Xapian::Query query("a");
456 
457  enquire.set_query(query);
459 
460  Xapian::MSet mset1;
461  mset1 = enquire.get_mset(0, 10);
462  TEST_EQUAL(mset1.size(), 3);
463  mset_expect_order(mset1, 3, 1, 2);
464  // Weights calculated manually using stats from the database.
465  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.0046477754371292362);
466  TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.97621929514640352757);
467  // The following weight would be negative but gets clamped to 0.
468  TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.0);
469 }
470 
471 static void
473 {
474  Xapian::Document doc;
475  doc.add_term("solo", 37);
476  db.add_document(doc);
477 }
478 
479 // Test wdf == doclen.
480 DEFINE_TESTCASE(dlhweight3, backend) {
481  Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
482  Xapian::Enquire enquire(db);
483  Xapian::Query query("solo");
484 
485  enquire.set_query(query);
487 
488  Xapian::MSet mset1;
489  mset1 = enquire.get_mset(0, 10);
490  TEST_EQUAL(mset1.size(), 1);
491  // Weight gets clamped to zero.
492  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
493 }
494 
495 // Test for invalid values of c.
496 DEFINE_TESTCASE(pl2weight2, !backend) {
497  // InvalidArgumentError should be thrown if parameter c is invalid.
499  Xapian::PL2Weight wt(-2.0));
500 }
501 
502 // Feature Test.
503 DEFINE_TESTCASE(pl2weight3, backend) {
504  Xapian::Database db = get_database("apitest_simpledata");
505  Xapian::Enquire enquire(db);
506  Xapian::Query query("paragraph");
507  enquire.set_query(query);
508  Xapian::MSet mset;
509 
511  mset = enquire.get_mset(0, 10);
512  TEST_EQUAL(mset.size(), 5);
513  // Expected weight difference calculated in extended precision using stats
514  // from the test database.
515  TEST_EQUAL_DOUBLE(mset[2].get_weight(),
516  mset[3].get_weight() + 0.0086861771701328694);
517 }
518 
519 // Test for invalid values of parameters, c and delta.
520 DEFINE_TESTCASE(pl2plusweight2, !backend) {
521  // InvalidArgumentError should be thrown if parameter c is invalid.
523  Xapian::PL2PlusWeight wt(-2.0, 0.9));
524 
525  // InvalidArgumentError should be thrown if parameter delta is invalid.
527  Xapian::PL2PlusWeight wt(1.0, -1.9));
528 }
529 
530 // Feature Test 1 for PL2PlusWeight.
531 DEFINE_TESTCASE(pl2plusweight4, backend) {
532  Xapian::Database db = get_database("apitest_simpledata");
533  Xapian::Enquire enquire(db);
534  enquire.set_query(Xapian::Query("to"));
535  Xapian::MSet mset;
536 
537  enquire.set_weighting_scheme(Xapian::PL2PlusWeight(2.0, 0.8));
538  mset = enquire.get_mset(0, 10);
539  TEST_EQUAL(mset.size(), 3);
540  // Expected weight difference calculated in Python using stats from the
541  // test database.
542  TEST_EQUAL_DOUBLE(mset[1].get_weight(),
543  mset[2].get_weight() + 0.016760925252262027);
544 }
545 
546 // Feature Test 2 for PL2PlusWeight
547 DEFINE_TESTCASE(pl2plusweight5, backend) {
548  Xapian::Database db = get_database("apitest_simpledata");
549  Xapian::Enquire enquire(db);
550  Xapian::Query query("word");
551  enquire.set_query(query);
552  Xapian::MSet mset;
553 
554  enquire.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
555  mset = enquire.get_mset(0, 10);
556  // Expect MSet contains two documents having query "word".
557  TEST_EQUAL(mset.size(), 2);
558  // Expect Document 2 has higher weight than document 4 because
559  // "word" appears more no. of times in document 2 than document 4.
560  mset_expect_order(mset, 2, 4);
561 }
562 
563 // Feature test
564 DEFINE_TESTCASE(dphweight1, backend) {
565  Xapian::Database db = get_database("apitest_simpledata");
566  Xapian::Enquire enquire(db);
567  Xapian::Query query("paragraph");
568 
569  enquire.set_query(query);
571 
572  Xapian::MSet mset1;
573  mset1 = enquire.get_mset(0, 10);
574  TEST_EQUAL(mset1.size(), 5);
575  /* The weight has been calculated manually by using the statistics of the
576  * test database. */
577  TEST_EQUAL_DOUBLE(mset1[2].get_weight() - mset1[4].get_weight(), 0.542623617687990167);
578 }
579 
580 // Test wdf == doclen.
581 DEFINE_TESTCASE(dphweight3, backend) {
582  Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
583  Xapian::Enquire enquire(db);
584  Xapian::Query query("solo");
585 
586  enquire.set_query(query);
588 
589  Xapian::MSet mset1;
590  mset1 = enquire.get_mset(0, 10);
591  TEST_EQUAL(mset1.size(), 1);
592  // Weight gets clamped to zero.
593  TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
594 }
595 
596 // Test for various cases of normalization string.
597 DEFINE_TESTCASE(tfidfweight1, !backend) {
598  // InvalidArgumentError should be thrown if normalization string is invalid
600  Xapian::TfIdfWeight b("JOHN_LENNON"));
601 
603  Xapian::TfIdfWeight b("LOL"));
604 }
605 
606 // Feature tests for various normalization functions.
607 DEFINE_TESTCASE(tfidfweight3, backend) {
608  Xapian::Database db = get_database("apitest_simpledata");
609  Xapian::Enquire enquire(db);
610  Xapian::Query query("word");
611  Xapian::MSet mset;
612 
613  // Check for "ntn" when termfreq != N
614  enquire.set_query(query);
616  mset = enquire.get_mset(0, 10);
617  TEST_EQUAL(mset.size(), 2);
618  // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
619  mset_expect_order(mset, 2, 4);
620  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
621 
622  // Check that wqf is taken into account.
623  enquire.set_query(Xapian::Query("word", 2));
625  Xapian::MSet mset2 = enquire.get_mset(0, 10);
626  TEST_EQUAL(mset2.size(), 2);
627  // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
628  mset_expect_order(mset2, 2, 4);
629  // wqf is 2, so weights should be doubled.
630  TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
631  TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
632 
633  // check for "nfn" when termfreq != N
634  enquire.set_query(query);
636  mset = enquire.get_mset(0, 10);
637  TEST_EQUAL(mset.size(), 2);
638  mset_expect_order(mset, 2, 4);
639  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
640 
641  // check for "nsn" when termfreq != N
642  enquire.set_query(query);
644  mset = enquire.get_mset(0, 10);
645  TEST_EQUAL(mset.size(), 2);
646  mset_expect_order(mset, 2, 4);
647  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
648 
649  // Check for "bnn" and for both branches of 'b'.
650  enquire.set_query(Xapian::Query("test"));
652  mset = enquire.get_mset(0, 10);
653  TEST_EQUAL(mset.size(), 1);
654  mset_expect_order(mset, 1);
655  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
656 
657  // Check for "lnn" and for both branches of 'l'.
658  enquire.set_query(Xapian::Query("word"));
660  mset = enquire.get_mset(0, 10);
661  TEST_EQUAL(mset.size(), 2);
662  mset_expect_order(mset, 2, 4);
663  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0)); // idfn=1 and so wt=tfn=1+log(tf)
664  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0); // idfn=1 and wt=tfn=1+log(tf)=1+log(1)=1
665 
666  // Check for "snn"
667  enquire.set_query(Xapian::Query("paragraph"));
668  enquire.set_weighting_scheme(Xapian::TfIdfWeight("snn")); // idf=1 and tfn=tf*tf
669  mset = enquire.get_mset(0, 10);
670  TEST_EQUAL(mset.size(), 5);
671  mset_expect_order(mset, 2, 1, 4, 3, 5);
672  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
673  TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
674 
675  // Check for "ntn" when termfreq=N
676  enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "t"
678  mset = enquire.get_mset(0, 10);
679  TEST_EQUAL(mset.size(), 6);
680  mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
681  for (int i = 0; i < 6; ++i) {
682  TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
683  }
684 
685  // Check for "npn" and for both branches of 'p'
686  enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "p"
688  mset = enquire.get_mset(0, 10);
689  TEST_EQUAL(mset.size(), 6);
690  mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
691  for (int i = 0; i < 6; ++i) {
692  TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
693  }
694 
695  // Check for "Lnn".
696  enquire.set_query(Xapian::Query("word"));
698  mset = enquire.get_mset(0, 10);
699  TEST_EQUAL(mset.size(), 2);
700  mset_expect_order(mset, 2, 4);
701  TEST_EQUAL_DOUBLE(mset[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
702  TEST_EQUAL_DOUBLE(mset[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
703 
704  enquire.set_query(Xapian::Query("word"));
706  mset = enquire.get_mset(0, 10);
707  TEST_EQUAL(mset.size(), 2);
708  mset_expect_order(mset, 2, 4);
709  TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
710  TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
711 }
712 
714  public:
715  double factor;
716 
717  unsigned & zero_inits, & non_zero_inits;
718 
719  CheckInitWeight(unsigned &z, unsigned &n)
720  : factor(-1.0), zero_inits(z), non_zero_inits(n) { }
721 
722  void init(double factor_) override {
723  factor = factor_;
724  if (factor == 0.0)
725  ++zero_inits;
726  else
727  ++non_zero_inits;
728  }
729 
730  Weight* clone() const override {
731  return new CheckInitWeight(zero_inits, non_zero_inits);
732  }
733 
735  Xapian::termcount) const override {
736  return 1.0;
737  }
738 
739  double get_maxpart() const override { return 1.0; }
740 
742  Xapian::termcount) const override {
743  return 1.0 / doclen;
744  }
745 
746  double get_maxextra() const override { return 1.0; }
747 };
748 
750 DEFINE_TESTCASE(checkinitweight1, backend && !multi && !remote) {
751  Xapian::Database db = get_database("apitest_simpledata");
752  Xapian::Enquire enquire(db);
754  Xapian::Query("this"), Xapian::Query("paragraph"));
755  enquire.set_query(q);
756  unsigned zero_inits = 0, non_zero_inits = 0;
757  CheckInitWeight wt(zero_inits, non_zero_inits);
758  enquire.set_weighting_scheme(wt);
759  Xapian::MSet mset = enquire.get_mset(0, 3);
760  TEST_EQUAL(zero_inits, 1);
761  TEST_EQUAL(non_zero_inits, 2);
762 }
763 
765  public:
766  double factor;
767 
769 
770  string term1;
771 
772  // When testing OP_SYNONYM, term2 is also set.
773  // When testing OP_WILDCARD, term2 == "*".
774  // When testing a repeated term, term2 == "=" for the first occurrence and
775  // "_" for subsequent occurrences.
776  mutable string term2;
777 
780 
784 
786  const string & term1_,
787  const string & term2_,
788  Xapian::termcount & sum_,
789  Xapian::termcount & sum_squares_)
790  : factor(-1.0), db(db_), term1(term1_), term2(term2_),
791  sum(sum_), sum_squares(sum_squares_),
792  len_upper(0), len_lower(Xapian::termcount(-1)), wdf_upper(0)
793  {
794  need_stat(COLLECTION_SIZE);
795  need_stat(RSET_SIZE);
796  need_stat(AVERAGE_LENGTH);
797  need_stat(TERMFREQ);
798  need_stat(RELTERMFREQ);
799  need_stat(QUERY_LENGTH);
800  need_stat(WQF);
801  need_stat(WDF);
802  need_stat(DOC_LENGTH);
803  need_stat(DOC_LENGTH_MIN);
804  need_stat(DOC_LENGTH_MAX);
805  need_stat(WDF_MAX);
806  need_stat(COLLECTION_FREQ);
807  need_stat(UNIQUE_TERMS);
808  need_stat(TOTAL_LENGTH);
809  }
810 
812  const string & term_,
813  Xapian::termcount & sum_,
814  Xapian::termcount & sum_squares_)
815  : CheckStatsWeight(db_, term_, string(), sum_, sum_squares_) { }
816 
817  void init(double factor_) override {
818  factor = factor_;
819  }
820 
821  Weight* clone() const override {
822  auto res = new CheckStatsWeight(db, term1, term2, sum, sum_squares);
823  if (term2 == "=") {
824  // The object passed to Enquire::set_weighting_scheme() is cloned
825  // right away, and then cloned again for each term, and then
826  // potentially once more for the term-independent weight
827  // contribution. In the repeated case, we want to handle the first
828  // actual term specially, so we arrange for that to have "=" for
829  // term2, and subsequent clones to have "_", so that we accumulate
830  // sum and sum_squares on the first occurrence only.
831  term2 = "_";
832  }
833  return res;
834  }
835 
837  Xapian::termcount doclen,
838  Xapian::termcount uniqueterms) const override {
839  Xapian::doccount num_docs = db.get_doccount();
840  TEST_EQUAL(get_collection_size(), num_docs);
841  TEST_EQUAL(get_rset_size(), 0);
842  TEST_EQUAL(get_average_length(), db.get_avlength());
843  Xapian::totallength totlen = get_total_length();
844  TEST_EQUAL(totlen, db.get_total_length());
845  double total_term_occurences = get_average_length() * num_docs;
846  TEST_EQUAL(Xapian::totallength(total_term_occurences + 0.5), totlen);
847  if (term2.empty() || term2 == "=" || term2 == "_") {
848  TEST_EQUAL(get_termfreq(), db.get_termfreq(term1));
849  TEST_EQUAL(get_collection_freq(), db.get_collection_freq(term1));
850  if (term2.empty()) {
851  TEST_EQUAL(get_query_length(), 1);
852  } else {
853  TEST_EQUAL(get_query_length(), 2);
854  }
855  } else {
856  Xapian::doccount tfmax = 0, tfsum = 0;
857  Xapian::termcount cfmax = 0, cfsum = 0;
858  if (term2 == "*") {
859  // OP_WILDCARD case.
860  for (auto&& t = db.allterms_begin(term1);
861  t != db.allterms_end(term1); ++t) {
862  Xapian::doccount tf = t.get_termfreq();
863  tout << "->" << *t << " " << tf << '\n';
864  tfsum += tf;
865  tfmax = max(tfmax, tf);
867  cfsum += cf;
868  cfmax = max(cfmax, cf);
869  }
870  TEST_EQUAL(get_query_length(), 1);
871  } else {
872  // OP_SYNONYM case.
873  Xapian::doccount tf1 = db.get_termfreq(term1);
874  Xapian::doccount tf2 = db.get_termfreq(term2);
875  tfsum = tf1 + tf2;
876  tfmax = max(tf1, tf2);
877  Xapian::termcount cf1 = db.get_collection_freq(term1);
878  Xapian::termcount cf2 = db.get_collection_freq(term2);
879  cfsum = cf1 + cf2;
880  cfmax = max(cf1, cf2);
881  TEST_EQUAL(get_query_length(), 2);
882  }
883  // Synonym occurs at least as many times as any term.
884  TEST_REL(get_termfreq(), >=, tfmax);
885  TEST_REL(get_collection_freq(), >=, cfmax);
886  // Synonym can't occur more times than the terms do.
887  TEST_REL(get_termfreq(), <=, tfsum);
888  TEST_REL(get_collection_freq(), <=, cfsum);
889  // Synonym can't occur more times than there are documents/terms.
890  TEST_REL(get_termfreq(), <=, num_docs);
891  TEST_REL(get_collection_freq(), <=, totlen);
892  }
893  TEST_EQUAL(get_reltermfreq(), 0);
894  TEST_EQUAL(get_wqf(), 1);
895  TEST_REL(doclen,>=,len_lower);
896  TEST_REL(doclen,<=,len_upper);
897  TEST_REL(uniqueterms,>=,1);
898  TEST_REL(uniqueterms,<=,doclen);
899  TEST_REL(wdf,<=,wdf_upper);
900  if (term2 != "_") {
901  sum += wdf;
902  sum_squares += wdf * wdf;
903  }
904  return 1.0;
905  }
906 
907  double get_maxpart() const override {
908  if (len_upper == 0) {
909  len_lower = get_doclength_lower_bound();
910  len_upper = get_doclength_upper_bound();
911  wdf_upper = get_wdf_upper_bound();
912  }
913  return 1.0;
914  }
915 
917  Xapian::termcount) const override {
918  return 1.0 / doclen;
919  }
920 
921  double get_maxextra() const override { return 1.0; }
922 };
923 
// Check the wdf statistics that CheckStatsWeight sees for single-term
// queries.  For each term in the database, a query for just that term is run
// with a CheckStatsWeight object which is given references to `sum` and
// `sum_squares`; the totals found there afterwards are compared with totals
// computed by walking the term's posting list directly.
// NOTE(review): `a` and `i` are used below without a visible declaration -
// listing lines 928 and 946 are absent from this listing, presumably the
// iterator declarations.  Confirm against the real source file.
925 DEFINE_TESTCASE(checkstatsweight1, backend && !remote) {
926  Xapian::Database db = get_database("apitest_simpledata");
927  Xapian::Enquire enquire(db);
929  for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
930  const string & term = *a;
931  enquire.set_query(Xapian::Query(term));
     // Fresh accumulators for each term.
932  Xapian::termcount sum = 0;
933  Xapian::termcount sum_squares = 0;
934  CheckStatsWeight wt(db, term, sum, sum_squares);
935  enquire.set_weighting_scheme(wt);
     // The returned MSet is not inspected; the match is run for its effect
     // on sum and sum_squares.
936  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
937 
938  // The document order in the multi-db case isn't the same as the
939  // postlist order on the combined DB, so it's hard to compare the
940  // wdf for each document in the Weight objects, but we can sum
941  // the wdfs and the squares of the wdfs which provides a decent
942  // check that we're not getting the wrong wdf values (it ensures
943  // they have the right mean and standard deviation).
944  Xapian::termcount expected_sum = 0;
945  Xapian::termcount expected_sum_squares = 0;
947  for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
948  Xapian::termcount wdf = i.get_wdf();
949  expected_sum += wdf;
950  expected_sum_squares += wdf * wdf;
951  }
952  TEST_EQUAL(sum, expected_sum);
953  TEST_EQUAL(sum_squares, expected_sum_squares);
954  }
955 }
956 
958 // Regression test for bugs fixed in 1.4.1.
// Checks the wdf statistics CheckStatsWeight sees for a query built from two
// terms: the expected totals are computed by merging the two terms' posting
// lists by docid and summing the wdfs per document.
// NOTE(review): listing lines 962, 967, 985 and 986 are absent from this
// listing, so the declarations of `a`, `q`, `i` and `j` are not visible here.
// The comment near the end of the test indicates the query is an OP_SYNONYM,
// and `i`/`j` are compared against the ends of the posting lists for term1
// and term2 respectively.  Confirm against the real source file.
959 DEFINE_TESTCASE(checkstatsweight2, backend && !remote) {
960  Xapian::Database db = get_database("apitest_simpledata");
961  Xapian::Enquire enquire(db);
963  for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
     // Take terms in pairs: term1 is the current term, then `a` is advanced
     // and term2 is the next one.  Stop if there is no second term.
964  const string & term1 = *a;
965  if (++a == db.allterms_end()) break;
966  const string & term2 = *a;
968  Xapian::Query(term1), Xapian::Query(term2));
969  tout << q.get_description() << '\n';
970  enquire.set_query(q);
971  Xapian::termcount sum = 0;
972  Xapian::termcount sum_squares = 0;
973  CheckStatsWeight wt(db, term1, term2, sum, sum_squares);
974  enquire.set_weighting_scheme(wt);
     // The returned MSet is not inspected; the match is run for its effect
     // on sum and sum_squares.
975  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
976 
977  // The document order in the multi-db case isn't the same as the
978  // postlist order on the combined DB, so it's hard to compare the
979  // wdf for each document in the Weight objects, but we can sum
980  // the wdfs and the squares of the wdfs which provides a decent
981  // check that we're not getting the wrong wdf values (it ensures
982  // they have the right mean and standard deviation).
983  Xapian::termcount expected_sum = 0;
984  Xapian::termcount expected_sum_squares = 0;
     // did1/did2 track the current docid of each posting list.  Within the
     // loop, 0 means "entry consumed, advance this list" and
     // Xapian::docid(-1) means "list exhausted" (docid is unsigned, so this
     // compares greater than any docid still to come from the other list).
987  Xapian::docid did1 = *i, did2 = *j;
988  while (true) {
989  // To calculate expected_sum_squares correctly we need to square
990  // the sum per document.
991  Xapian::termcount wdf;
992  if (did1 == did2) {
     // Both terms index this document: combine their wdfs.
993  wdf = i.get_wdf() + j.get_wdf();
994  did1 = did2 = 0;
995  } else if (did1 < did2) {
996  wdf = i.get_wdf();
997  did1 = 0;
998  } else {
999  wdf = j.get_wdf();
1000  did2 = 0;
1001  }
1002  expected_sum += wdf;
1003  expected_sum_squares += wdf * wdf;
1004 
     // Advance whichever list(s) were consumed above; finish once both
     // lists have been exhausted.
1005  if (did1 == 0) {
1006  if (++i != db.postlist_end(term1)) {
1007  did1 = *i;
1008  } else {
1009  if (did2 == Xapian::docid(-1)) break;
1010  did1 = Xapian::docid(-1);
1011  }
1012  }
1013  if (did2 == 0) {
1014  if (++j != db.postlist_end(term2)) {
1015  did2 = *j;
1016  } else {
1017  if (did1 == Xapian::docid(-1)) break;
1018  did2 = Xapian::docid(-1);
1019  }
1020  }
1021  }
1022  // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1023  // the individual terms.
1024  TEST_EQUAL(sum, expected_sum);
1025  TEST_EQUAL(sum_squares, expected_sum_squares);
1026  }
1027 }
1028 
1030 // Regression test for bug fixed in 1.4.1.
// For each prefix in `testcases`, runs a query `q` with CheckStatsWeight and
// compares the accumulated wdf totals with totals computed from the posting
// lists of every term starting with that prefix.
// NOTE(review): listing lines 1046 and 1054 are absent from this listing, so
// the construction of `q` is not visible here (the XFAIL message and the
// testcase comments suggest a wildcard query on `pattern`).  Confirm against
// the real source file.
1031 DEFINE_TESTCASE(checkstatsweight3, backend && !remote) {
1032  // The most correct thing to do would be to collate termfreqs across shards
1033  // for this, but if that's too hard to do efficiently we could at least
1034  // scale up the termfreqs proportional to the size of the shard.
1035  XFAIL_FOR_BACKEND("multi", "OP_WILDCARD+OP_SYNONYM use shard termfreqs");
1036 
     // Heap ordering for posting iterators, by current docid.
     // NOTE(review): the std heap algorithms keep the element which compares
     // greatest under the comparator at the front, so with `*a < *b`
     // pop_heap() yields the iterator with the HIGHEST current docid.  The
     // grouping loop below only merges consecutive equal docids, so entries
     // for the same document may be counted separately, which can only
     // lower expected_sum_squares (consistent with the `>=` check at the
     // end).  Confirm this ordering is intended.
1037  struct PlCmp {
1038  bool operator()(const Xapian::PostingIterator& a,
1039  const Xapian::PostingIterator& b) {
1040  return *a < *b;
1041  }
1042  };
1043 
1044  Xapian::Database db = get_database("apitest_simpledata");
1045  Xapian::Enquire enquire(db);
1047  static const char * const testcases[] = {
1048  "a", // a* matches all documents, but no term matches all.
1049  "pa", // Expands to only "paragraph", matching 5.
1050  "zulu", // No matches.
1051  "th", // Term "this" matches all documents.
1052  };
1053  for (auto pattern : testcases) {
     // Reset the debug output stream so it only holds output for this
     // pattern.
1055  tout.str(string{});
1056  tout << q.get_description() << '\n';
1057  enquire.set_query(q);
1058  Xapian::termcount sum = 0;
1059  Xapian::termcount sum_squares = 0;
1060  CheckStatsWeight wt(db, pattern, "*", sum, sum_squares);
1061  enquire.set_weighting_scheme(wt);
     // The returned MSet is not inspected; the match is run for its effect
     // on sum and sum_squares.
1062  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1063 
1064  // The document order in the multi-db case isn't the same as the
1065  // postlist order on the combined DB, so it's hard to compare the
1066  // wdf for each document in the Weight objects, but we can sum
1067  // the wdfs and the squares of the wdfs which provides a decent
1068  // check that we're not getting the wrong wdf values (it ensures
1069  // they have the right mean and standard deviation).
1070  Xapian::termcount expected_sum = 0;
1071  Xapian::termcount expected_sum_squares = 0;
     // Collect one posting iterator per term which starts with `pattern`.
1072  vector<Xapian::PostingIterator> postlists;
1073  for (auto&& t = db.allterms_begin(pattern);
1074  t != db.allterms_end(pattern); ++t) {
1075  postlists.emplace_back(db.postlist_begin(*t));
1076  }
1077  make_heap(postlists.begin(), postlists.end(), PlCmp());
     // `did` is the docid currently being accumulated and `wdf` is its
     // running wdf total; did == 0 with wdf == 0 is the initial state.
1078  Xapian::docid did = 0;
1079  Xapian::termcount wdf = 0;
1080  while (!postlists.empty()) {
     // Move the front-of-heap iterator to the back and read its current
     // entry before advancing it.
1081  pop_heap(postlists.begin(), postlists.end(), PlCmp());
1082  Xapian::docid did_new = *postlists.back();
1083  Xapian::termcount wdf_new = postlists.back().get_wdf();
     // A default-constructed PostingIterator is used as the end marker:
     // drop exhausted iterators, otherwise restore the heap.
1084  if (++(postlists.back()) == Xapian::PostingIterator()) {
1085  postlists.pop_back();
1086  } else {
1087  push_heap(postlists.begin(), postlists.end(), PlCmp());
1088  }
     // On a change of docid, flush the total for the previous docid.
1089  if (did_new != did) {
1090  expected_sum += wdf;
1091  expected_sum_squares += wdf * wdf;
1092  wdf = 0;
1093  did = did_new;
1094  }
1095  wdf += wdf_new;
1096  }
     // Flush the total for the last docid processed.
1097  expected_sum += wdf;
1098  expected_sum_squares += wdf * wdf;
1099  // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1100  // the individual terms.
1101  TEST_EQUAL(sum, expected_sum);
1102  TEST_REL(sum_squares, >=, expected_sum_squares);
1103  }
1104 }
1105 
1107 // Regression test for bug fixed in 1.4.6. Doesn't work with
1108 // multi as the weight object is cloned more times.
// For each term, runs a query which ORs together two Query objects for the
// same term (differing only in their third constructor argument) and checks
// the accumulated wdf totals against the term's posting list.
// NOTE(review): `a` and `i` are used below without a visible declaration -
// listing lines 1112 and 1131 are absent from this listing, presumably the
// iterator declarations.  Confirm against the real source file.
1109 DEFINE_TESTCASE(checkstatsweight4, backend && !remote && !multi) {
1110  Xapian::Database db = get_database("apitest_simpledata");
1111  Xapian::Enquire enquire(db);
1113  for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1114  const string & term = *a;
     // NOTE(review): presumably the arguments are (term, wqf, position) -
     // confirm against the Xapian::Query constructor documentation.
1115  enquire.set_query(Xapian::Query(term, 1, 1) |
1116  Xapian::Query(term, 1, 2));
1117  Xapian::termcount sum = 0;
1118  Xapian::termcount sum_squares = 0;
1119  CheckStatsWeight wt(db, term, "=", sum, sum_squares);
1120  enquire.set_weighting_scheme(wt);
     // The returned MSet is not inspected; the match is run for its effect
     // on sum and sum_squares.
1121  Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1122 
1123  // The document order in the multi-db case isn't the same as the
1124  // postlist order on the combined DB, so it's hard to compare the
1125  // wdf for each document in the Weight objects, but we can sum
1126  // the wdfs and the squares of the wdfs which provides a decent
1127  // check that we're not getting the wrong wdf values (it ensures
1128  // they have the right mean and standard deviation).
1129  Xapian::termcount expected_sum = 0;
1130  Xapian::termcount expected_sum_squares = 0;
1132  for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1133  Xapian::termcount wdf = i.get_wdf();
1134  expected_sum += wdf;
1135  expected_sum_squares += wdf * wdf;
1136  }
1137  TEST_EQUAL(sum, expected_sum);
1138  TEST_EQUAL(sum_squares, expected_sum_squares);
1139  }
1140 }
1141 
1142 // Two-stage smoothing should perform the same as Jelinek-Mercer smoothing when the Jelinek-Mercer smoothing parameter is kept at 1 in both.
// Runs the same single-term query through two Enquire objects and checks
// that the second-ranked results have equal weights.
// NOTE(review): no set_weighting_scheme() calls are visible here - listing
// lines 1152-1153 are absent from this listing, so the two weighting schemes
// being compared cannot be confirmed from it.  Confirm against the real
// source file.
1143 DEFINE_TESTCASE(unigramlmweight4, backend) {
1144  Xapian::Database db = get_database("apitest_simpledata");
1145  Xapian::Enquire enquire1(db);
1146  Xapian::Enquire enquire2(db);
1147  enquire1.set_query(Xapian::Query("paragraph"));
1148  Xapian::MSet mset1;
1149  enquire2.set_query(Xapian::Query("paragraph"));
1150  Xapian::MSet mset2;
1151  // 5 documents available with term paragraph so mset size should be 5
1154  mset1 = enquire1.get_mset(0, 10);
1155  mset2 = enquire2.get_mset(0, 10);
1156 
1157  TEST_EQUAL(mset1.size(), 5);
     // Only the entry at index 1 is compared.
1158  TEST_EQUAL_DOUBLE(mset1[1].get_weight(), mset2[1].get_weight());
1159 }
1160 
1161 /* Test that when no smoothing is used, all four searches
1162  * give the same result, i.e. wdf_double/len_double. */
// NOTE(review): no set_weighting_scheme() calls are visible here - listing
// lines 1178-1181 are absent from this listing, so the four weighting
// schemes being compared cannot be confirmed from it.  Confirm against the
// real source file.
1163 DEFINE_TESTCASE(unigramlmweight5, backend) {
1164  Xapian::Database db = get_database("apitest_simpledata");
1165  Xapian::Enquire enquire1(db);
1166  Xapian::Enquire enquire2(db);
1167  Xapian::Enquire enquire3(db);
1168  Xapian::Enquire enquire4(db);
1169  enquire1.set_query(Xapian::Query("paragraph"));
1170  Xapian::MSet mset1;
1171  enquire2.set_query(Xapian::Query("paragraph"));
1172  Xapian::MSet mset2;
1173  enquire3.set_query(Xapian::Query("paragraph"));
1174  Xapian::MSet mset3;
1175  enquire4.set_query(Xapian::Query("paragraph"));
1176  Xapian::MSet mset4;
1177  // 5 documents available with term paragraph so mset size should be 5
1182 
1183  mset1 = enquire1.get_mset(0, 10);
1184  mset2 = enquire2.get_mset(0, 10);
1185  mset3 = enquire3.get_mset(0, 10);
1186  mset4 = enquire4.get_mset(0, 10);
1187 
1188  TEST_EQUAL(mset1.size(), 5);
1189  TEST_EQUAL(mset2.size(), 5);
1190  TEST_EQUAL(mset3.size(), 5);
1191  TEST_EQUAL(mset4.size(), 5);
     // Compare every pair of result sets (all six pairings) at each rank.
1192  for (Xapian::doccount i = 0; i < 5; ++i) {
1193  TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset4[i].get_weight());
1194  TEST_EQUAL_DOUBLE(mset2[i].get_weight(), mset4[i].get_weight());
1195  TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset2[i].get_weight());
1196  TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset2[i].get_weight());
1197  TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset4[i].get_weight());
1198  TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight());
1199  }
1200 }
1201 
1202 // Feature test for Dir+ function.
// Runs the same single-term query through two Enquire objects and checks
// that every weight in mset2 is strictly greater than the weight at the same
// rank in mset1.
// NOTE(review): no set_weighting_scheme() calls are visible here - listing
// lines 1212-1213 are absent from this listing.  Going by the comment before
// the final checks, enquire1 presumably uses Dir smoothing and enquire2
// Dir+ smoothing; confirm against the real source file.
1203 DEFINE_TESTCASE(unigramlmweight7, backend) {
1204  Xapian::Database db = get_database("apitest_simpledata");
1205  Xapian::Enquire enquire1(db);
1206  Xapian::Enquire enquire2(db);
1207  enquire1.set_query(Xapian::Query("paragraph"));
1208  enquire2.set_query(Xapian::Query("paragraph"));
1209  Xapian::MSet mset1;
1210  Xapian::MSet mset2;
1211 
1214 
1215  mset1 = enquire1.get_mset(0, 10);
1216  mset2 = enquire2.get_mset(0, 10);
1217 
1218  // mset size should be 5
1219  TEST_EQUAL(mset1.size(), 5);
1220  TEST_EQUAL(mset2.size(), 5);
1221 
1222  // Expect mset weights associated with Dir+ more than mset weights by Dir
1223  // because of the presence of extra weight component in Dir+ function.
1224  TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight());
1225  TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight());
1226  TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight());
1227  TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight());
1228  TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight());
1229 }
1230 
1231 // Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1).
// Runs a query twice on the same Enquire object and checks that each weight
// in the second result set is 15.0 times the weight at the same rank in the
// first.
// NOTE(review): listing lines 1238 and 1244-1245 are absent from this
// listing, so the weighting scheme setup and whatever changes the query
// between the two get_mset() calls (presumably wrapping it in
// OP_SCALE_WEIGHT with factor 15.0) are not visible here.  Confirm against
// the real source file.
1232 DEFINE_TESTCASE(unigramlmweight8, backend) {
1233  Xapian::Database db = get_database("apitest_simpledata");
1234  Xapian::Enquire enquire(db);
1235  Xapian::Query query("paragraph");
1236 
1237  enquire.set_query(query);
1239 
1240  Xapian::MSet mset1;
1241  mset1 = enquire.get_mset(0, 10);
1242  TEST_EQUAL(mset1.size(), 5);
1243 
1246 
1247  Xapian::MSet mset2;
1248  mset2 = enquire.get_mset(0, 10);
1249  TEST_EQUAL(mset2.size(), mset1.size());
     // Presumably guards against the scaled comparison below passing
     // trivially because the weights are all zero.
1250  TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
1251  for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
1252  TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
1253  }
1254 }
1255 
1256 // Feature test for CoordWeight.
// Runs a query built from the four entries of `terms` and checks that each
// result's weight equals the number of terms which matched that document.
// NOTE(review): listing lines 1259, 1263 and 1271 are absent from this
// listing, so the weighting scheme setup, the first line of the `query`
// construction and the declaration of `t` are not visible here.  Confirm
// against the real source file.
1257 DEFINE_TESTCASE(coordweight1, backend) {
1258  Xapian::Enquire enquire(get_database("apitest_simpledata"));
1260  static const char * const terms[] = {
1261  "this", "line", "paragraph", "rubbish"
1262  };
     // Iterator range covering the whole of the terms array.
1264  terms, terms + sizeof(terms) / sizeof(terms[0]));
1265  enquire.set_query(query);
1266  Xapian::MSet mymset1 = enquire.get_mset(0, 100);
1267  // CoordWeight scores 1 for each matching term, so the weight should equal
1268  // the number of matching terms.
1269  for (Xapian::MSetIterator i = mymset1.begin(); i != mymset1.end(); ++i) {
1270  Xapian::termcount matching_terms = 0;
     // Count the matching terms for this result by walking the iterator
     // range up to get_matching_terms_end(i).
1272  while (t != enquire.get_matching_terms_end(i)) {
1273  ++matching_terms;
1274  ++t;
1275  }
1276  TEST_EQUAL(i.get_weight(), matching_terms);
1277  }
1278 }
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
Definition: api_weight.cc:739
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
Xapian::doccount size() const
Return number of items in this MSet object.
Definition: omenquire.cc:318
Weight * clone() const override
Clone this object.
Definition: api_weight.cc:730
Xapian::docid add_document(const Xapian::Document &document)
Add a new document to the database.
Definition: omdatabase.cc:902
Wildcard expansion.
Definition: query.h:255
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const override
Calculate the term-independent weight component for a document.
Definition: api_weight.cc:916
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
This class is used to access a database, or a group of databases.
Definition: database.h:68
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
TermIterator get_matching_terms_end(Xapian::docid) const
End iterator corresponding to get_matching_terms_begin()
Definition: enquire.h:717
double weight
The weight of a document or term.
Definition: types.h:122
double get_maxextra() const override
Return an upper bound on what get_sumextra() can return for any document.
Definition: api_weight.cc:921
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
TermIterator allterms_end(const std::string &=std::string()) const
Corresponding end iterator to allterms_begin(prefix).
Definition: database.h:269
#define TEST_NOT_EQUAL_DOUBLE(a, b)
Test two doubles for non-near-equality.
Definition: testsuite.h:300
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
Definition: error.h:122
Weight * clone() const override
Clone this object.
Definition: api_weight.cc:821
Class representing a list of search results.
Definition: mset.h:44
This class implements the InL2 weighting scheme.
Definition: weight.h:844
STL namespace.
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
Definition: omenquire.cc:938
CheckInitWeight(unsigned &z, unsigned &n)
Definition: api_weight.cc:719
virtual std::string serialise() const
Return this object's parameters serialised as a single string.
Definition: weight.cc:141
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Definition: weight.h:1263
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
Definition: omdatabase.cc:312
TermIterator get_matching_terms_begin(Xapian::docid did) const
Get terms which match a given document, by document id.
Definition: omenquire.cc:962
test functionality of the Xapian API
Xapian::doclength get_avlength() const
Get the average length of the documents in the database.
Definition: omdatabase.cc:293
This class implements the BB2 weighting scheme.
Definition: weight.h:1060
Class for iterating over a list of terms.
Definition: termiterator.h:41
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
#define TEST_REL(A, REL, B)
Test that a relation holds, e.g. TEST_REL(a,>,b);
Definition: testmacros.h:32
Class for iterating over a list of terms.
#define TEST_NOT_EQUAL(a, b)
Test for non-equality of two things.
Definition: testsuite.h:305
Xapian::Weight subclass implementing Coordinate Matching.
Definition: weight.h:1516
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:241
Xapian::termcount & sum_squares
Definition: api_weight.cc:779
Xapian::termcount wdf_upper
Definition: api_weight.cc:783
Class implementing a "boolean" weighting scheme.
Definition: weight.h:433
This class provides read/write access to a database.
Definition: database.h:789
Indicates an error in the std::string serialisation of an object.
Definition: error.h:929
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
Iterator over a Xapian::MSet.
Definition: mset.h:368
Scale the weight contributed by a subquery.
Definition: query.h:166
Public interfaces for the Xapian library.
CheckStatsWeight(const Xapian::Database &db_, const string &term1_, const string &term2_, Xapian::termcount &sum_, Xapian::termcount &sum_squares_)
Definition: api_weight.cc:785
#define TEST_WEIGHT_CLASS_NO_PARAMS(W)
Definition: api_weight.cc:63
void init(double factor_) override
Allow the subclass to perform any initialisation it needs to.
Definition: api_weight.cc:817
#define TEST_EXCEPTION(TYPE, CODE)
Check that CODE throws exactly Xapian exception TYPE.
Definition: testutils.h:109
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
Definition: mset.h:624
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Definition: mset.h:629
Xapian::termcount & sum
Definition: api_weight.cc:778
Xapian::Weight subclass implementing the traditional probabilistic formula.
Definition: weight.h:774
#define CONST_STRLEN(S)
Returns the length of a string constant.
Definition: stringutils.h:43
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
Definition: weight.h:1136
This class implements the PL2 weighting scheme.
Definition: weight.h:1196
This class implements the IneB2 weighting scheme.
Definition: weight.h:988
static void test_weight_class(const char *name, const W &obj_default, const W &obj_other)
Definition: api_weight.cc:67
TermIterator allterms_begin(const std::string &prefix=std::string()) const
An iterator which runs across all terms with a given prefix.
Definition: omdatabase.cc:223
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
Definition: testsuite.h:295
static void test_weight_class_no_params(const char *name)
Definition: api_weight.cc:38
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
Definition: omenquire.cc:793
Match like OP_OR but weighting as if a single term.
Definition: query.h:239
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
Definition: api_weight.cc:907
double get_weight() const
Get the weight for the current position.
Definition: omenquire.cc:460
This class implements the IfB2 weighting scheme.
Definition: weight.h:915
#define FAIL_TEST(MSG)
Fail the current testcase with message MSG.
Definition: testsuite.h:68
Match only documents which all subqueries match.
Definition: query.h:84
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
double get_maxextra() const override
Return an upper bound on what get_sumextra() can return for any document.
Definition: api_weight.cc:746
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const override
Calculate the term-independent weight component for a document.
Definition: api_weight.cc:741
CheckStatsWeight(const Xapian::Database &db_, const string &term_, Xapian::termcount &sum_, Xapian::termcount &sum_squares_)
Definition: api_weight.cc:811
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:48
double get_sumpart(Xapian::termcount, Xapian::termcount, Xapian::termcount) const override
Calculate the weight contribution for this object's term to a document.
Definition: api_weight.cc:734
void XFAIL_FOR_BACKEND(const std::string &backend_prefix, const char *msg)
Definition: apitest.cc:147
char name[9]
Definition: dbcheck.cc:55
std::string get_description() const
Return a string describing this object.
Definition: query.cc:232
This class provides an interface to the information retrieval system for the purpose of searching...
Definition: enquire.h:152
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount len_upper
Definition: api_weight.cc:781
This class implements the DPH weighting scheme.
Definition: weight.h:1359
Match documents which at least one subquery matches.
Definition: query.h:92
Xapian-specific test helper functions and macros.
void mset_expect_order(const Xapian::MSet &A, Xapian::docid d1, Xapian::docid d2, Xapian::docid d3, Xapian::docid d4, Xapian::docid d5, Xapian::docid d6, Xapian::docid d7, Xapian::docid d8, Xapian::docid d9, Xapian::docid d10, Xapian::docid d11, Xapian::docid d12)
Definition: testutils.cc:225
Definition: header.h:151
#define TEST_WEIGHT_CLASS(W, DEFAULT, OTHER)
Definition: api_weight.cc:108
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
Definition: omenquire.cc:819
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Class representing a query.
Definition: query.h:46
void init(double factor_) override
Allow the subclass to perform any initialisation it needs to.
Definition: api_weight.cc:722
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
static void gen_wdf_eq_doclen_db(Xapian::WritableDatabase &db, const string &)
Definition: api_weight.cc:472
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Definition: database.h:225
MSetIterator back() const
Return iterator pointing to the last object in this MSet.
Definition: mset.h:641
Xapian::Weight subclass implementing the Language Model formula.
Definition: weight.h:1413
Xapian::Database db
Definition: api_weight.cc:768
Xapian::doccount get_termfreq(const std::string &tname) const
Get the number of documents in the database indexed by a given term.
Definition: omdatabase.cc:323
A handle representing a document in a Xapian database.
Definition: document.h:61
DEFINE_TESTCASE(weightserialisation1, !backend)
Test serialisation and introspection of built-in weighting schemes.
Definition: api_weight.cc:112
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
Definition: weight.h:650
#define TEST_WEIGHTING_SCHEME(W,...)
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqueterms) const override
Calculate the weight contribution for this object's term to a document.
Definition: api_weight.cc:836
Xapian::Weight subclass implementing the BM25 probabilistic formula.
Definition: weight.h:546
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Definition: omdatabase.cc:162
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Definition: weight.h:458
Xapian::termcount len_lower
Definition: api_weight.cc:782
void add_term(const std::string &tname, Xapian::termcount wdfinc=1)
Add a term to the document, without positional information.
Definition: omdocument.cc:140
Abstract base class for weighting schemes.
Definition: weight.h:35
Xapian::termcount get_collection_freq(const std::string &tname) const
Return the total number of occurrences of the given term.
Definition: omdatabase.cc:339
unsigned & zero_inits
Definition: api_weight.cc:717
static const testcase testcases[]
Definition: api_unicode.cc:39