42 bool empty = t2->
name().empty();
45 FAIL_TEST(
"Serialised TradWeight with junk appended unserialised to empty name!");
46 FAIL_TEST(
"Serialised TradWeight with junk appended unserialised OK");
62 bool empty = t2->
name().empty();
65 FAIL_TEST(
"Serialised LMWeight with junk appended unserialised to empty name!");
66 FAIL_TEST(
"Serialised LMWeight with junk appended unserialised OK");
79 bool empty = b2->
name().empty();
82 FAIL_TEST(
"Serialised BM25Weight with junk appended unserialised to empty name!");
83 FAIL_TEST(
"Serialised BM25Weight with junk appended unserialised OK");
100 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
102 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
126 TEST_REL(mset[1].get_weight(),>,mset[2].get_weight());
127 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
128 TEST_REL(mset[3].get_weight(),>,mset[4].get_weight());
138 bool empty = b2->
name().empty();
141 FAIL_TEST(
"Serialised BM25PlusWeight with junk appended unserialised to empty name!");
142 FAIL_TEST(
"Serialised BM25PlusWeight with junk appended unserialised OK");
144 TEST(e.
get_msg().find(
"BM25Plus") != string::npos);
159 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
161 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
198 bool empty = b2->
name().empty();
201 FAIL_TEST(
"Serialised inl2weight with junk appended unserialised to empty name!");
202 FAIL_TEST(
"Serialised inl2weight with junk appended unserialised OK");
258 bool empty = b2->
name().empty();
261 FAIL_TEST(
"Serialised IfB2Weight with junk appended unserialised to empty name!");
262 FAIL_TEST(
"Serialised IfB2Weight with junk appended unserialised OK");
317 bool empty = b2->
name().empty();
320 FAIL_TEST(
"Serialised ineb2weight with junk appended unserialised to empty name!");
321 FAIL_TEST(
"Serialised ineb2weight with junk appended unserialised OK");
367 for (
int i = 0; i < 5; ++i) {
379 bool empty = b2->
name().empty();
382 FAIL_TEST(
"Serialised BB2Weight with junk appended unserialised to empty name!");
383 FAIL_TEST(
"Serialised BB2Weight with junk appended unserialised OK");
429 for (
int i = 0; i < 5; ++i) {
442 for (
int i = 0; i < 5; ++i) {
502 bool empty = t2->
name().empty();
505 FAIL_TEST(
"Serialised DLHWeight with junk appended unserialised to empty name!");
506 FAIL_TEST(
"Serialised DLHWeight with junk appended unserialised OK");
543 bool empty = b2->
name().empty();
546 FAIL_TEST(
"Serialised PL2Weight with junk appended unserialised to empty name!");
547 FAIL_TEST(
"Serialised PL2Weight with junk appended unserialised OK");
578 mset[3].get_weight() + 0.0086861771701328694);
588 for (
int i = 0; i < 5; ++i) {
600 bool empty = b2->
name().empty();
603 FAIL_TEST(
"Serialised PL2PlusWeight with junk appended unserialised to empty name!");
604 FAIL_TEST(
"Serialised PL2PlusWeight with junk appended unserialised OK");
645 mset[3].get_weight() + 0.0086861771701328694);
691 TEST_EQUAL_DOUBLE(mset1[2].get_weight() - mset1[4].get_weight(), 0.542623617687990167);
701 for (
int i = 0; i < 5; ++i) {
713 bool empty = t2->
name().empty();
716 FAIL_TEST(
"Serialised DPHWeight with junk appended unserialised to empty name!");
717 FAIL_TEST(
"Serialised DPHWeight with junk appended unserialised OK");
761 bool empty = b2->
name().empty();
764 FAIL_TEST(
"Serialised TfIdfWeight with junk appended unserialised to empty name!");
765 FAIL_TEST(
"Serialised TfIdfWeight with junk appended unserialised OK");
854 for (
int i = 0; i < 6; ++i) {
864 for (
int i = 0; i < 6; ++i) {
874 TEST_EQUAL_DOUBLE(mset[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
875 TEST_EQUAL_DOUBLE(mset[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
893 : factor(-1.0), zero_inits(z), non_zero_inits(n) { }
928 unsigned zero_inits = 0, non_zero_inits = 0;
958 const string & term1_,
959 const string & term2_,
962 : factor(-1.0), db(db_), term1(term1_), term2(term2_),
963 sum(sum_), sum_squares(sum_squares_),
966 need_stat(COLLECTION_SIZE);
967 need_stat(RSET_SIZE);
968 need_stat(AVERAGE_LENGTH);
970 need_stat(RELTERMFREQ);
971 need_stat(QUERY_LENGTH);
974 need_stat(DOC_LENGTH);
975 need_stat(DOC_LENGTH_MIN);
976 need_stat(DOC_LENGTH_MAX);
978 need_stat(COLLECTION_FREQ);
979 need_stat(UNIQUE_TERMS);
980 need_stat(TOTAL_LENGTH);
984 const string & term_,
1016 double total_term_occurences = get_average_length() * num_docs;
1018 if (term2.empty() || term2 ==
"=" || term2 ==
"_") {
1021 if (term2.empty()) {
1034 tout <<
"->" << *t <<
" " << tf << endl;
1036 tfmax = max(tfmax, tf);
1039 cfmax = max(cfmax, cf);
1047 tfmax = max(tf1, tf2);
1051 cfmax = max(cf1, cf2);
1055 TEST_REL(get_termfreq(), >=, tfmax);
1056 TEST_REL(get_collection_freq(), >=, cfmax);
1058 TEST_REL(get_termfreq(), <=, tfsum);
1059 TEST_REL(get_collection_freq(), <=, cfsum);
1061 TEST_REL(get_termfreq(), <=, num_docs);
1062 TEST_REL(get_collection_freq(), <=, totlen);
1073 sum_squares += wdf * wdf;
1079 if (len_upper == 0) {
1080 len_lower = get_doclength_lower_bound();
1081 len_upper = get_doclength_upper_bound();
1082 wdf_upper = get_wdf_upper_bound();
1088 return 1.0 / doclen;
1100 const string & term = *a;
1119 expected_sum += wdf;
1120 expected_sum_squares += wdf * wdf;
1123 TEST_EQUAL(sum_squares, expected_sum_squares);
1134 const string & term1 = *a;
1136 const string & term2 = *a;
1165 }
else if (did1 < did2) {
1172 expected_sum += wdf;
1173 expected_sum_squares += wdf * wdf;
1195 TEST_REL(sum_squares, >=, expected_sum_squares);
1214 static const char *
const testcases[] = {
1220 for (
auto pattern : testcases) {
1238 vector<Xapian::PostingIterator> postlists;
1243 make_heap(postlists.begin(), postlists.end(), PlCmp());
1246 while (!postlists.empty()) {
1247 pop_heap(postlists.begin(), postlists.end(), PlCmp());
1251 postlists.pop_back();
1253 push_heap(postlists.begin(), postlists.end(), PlCmp());
1255 if (did_new != did) {
1256 expected_sum += wdf;
1257 expected_sum_squares += wdf * wdf;
1263 expected_sum += wdf;
1264 expected_sum_squares += wdf * wdf;
1268 TEST_REL(sum_squares, >=, expected_sum_squares);
1280 const string & term = *a;
1300 expected_sum += wdf;
1301 expected_sum_squares += wdf * wdf;
1304 TEST_EQUAL(sum_squares, expected_sum_squares);
1358 for (
size_t i = 0; i < 5; ++i) {
1375 bool empty = d2->
name().empty();
1378 FAIL_TEST(
"Serialised LMWeight with junk appended unserialised to empty name!");
1379 FAIL_TEST(
"Serialised LMWeight with junk appended unserialised OK");
1407 TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight());
1408 TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight());
1409 TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight());
1410 TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight());
1411 TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight());
1447 bool empty = t2->
name().empty();
1450 FAIL_TEST(
"Serialised BoolWeight with junk appended unserialised to empty name!");
1451 FAIL_TEST(
"Serialised BoolWeight with junk appended unserialised OK");
1461 static const char *
const terms[] = {
1462 "this",
"line",
"paragraph",
"rubbish" 1465 terms, terms +
sizeof(terms) /
sizeof(terms[0]));
1485 TEST_EQUAL(15.0 * mymset1[i].get_weight(), mymset2[i].get_weight());
1496 bool empty = t2->
name().empty();
1499 FAIL_TEST(
"Serialised CoordWeight with junk appended unserialised to empty name!");
1500 FAIL_TEST(
"Serialised CoordWeight with junk appended unserialised OK");
The Xapian namespace contains public interfaces for the Xapian library.
Xapian::doccount size() const
Return number of items in this MSet object.
Xapian::docid add_document(const Xapian::Document &document)
Add a new document to the database.
void init(double factor_)
Allow the subclass to perform any initialisation it needs to.
#define TEST(a)
Test a condition, without an additional explanation for failure.
This class is used to access a database, or a group of databases.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
TermIterator get_matching_terms_end(Xapian::docid) const
End iterator corresponding to get_matching_terms_begin()
std::string serialise() const
Return this object's parameters serialised as a single string.
InL2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
std::string serialise() const
Return this object's parameters serialised as a single string.
PL2PlusWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
std::string serialise() const
Return this object's parameters serialised as a single string.
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
TermIterator allterms_end(const std::string &=std::string()) const
Corresponding end iterator to allterms_begin(prefix).
#define TEST_NOT_EQUAL_DOUBLE(a, b)
Test two doubles for non-near-equality.
const std::string & get_msg() const
Message giving details of the error, intended for human consumption.
double get_sumpart(Xapian::termcount, Xapian::termcount, Xapian::termcount) const
Calculate the weight contribution for this object's term to a document.
Class representing a list of search results.
This class implements the InL2 weighting scheme.
std::string serialise() const
Return this object's parameters serialised as a single string.
MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount checkatleast=0, const RSet *omrset=0, const MatchDecider *mdecider=0) const
Get (a portion of) the match set for the current query.
std::string serialise() const
Return this object's parameters serialised as a single string.
std::string serialise() const
Return this object's parameters serialised as a single string.
CheckInitWeight(unsigned &z, unsigned &n)
BM25PlusWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
DEFINE_TESTCASE(tradweight3, !backend)
Xapian::Weight subclass implementing the PL2+ probabilistic formula.
Xapian::doccount get_doccount() const
Get the number of documents in the database.
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
std::string serialise() const
Return this object's parameters serialised as a single string.
std::string serialise() const
Return this object's parameters serialised as a single string.
TermIterator get_matching_terms_begin(Xapian::docid did) const
Get terms which match a given document, by document id.
Weight * clone() const
Clone this object.
test functionality of the Xapian API
std::string name() const
Return the name of this weighting scheme.
std::string name() const
Return the name of this weighting scheme.
std::string name() const
Return the name of this weighting scheme.
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Xapian::doclength get_avlength() const
Get the average length of the documents in the database.
This class implements the BB2 weighting scheme.
Class for iterating over a list of terms.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
#define TEST_REL(A, REL, B)
Test that a relation holds, e.g. TEST_REL(a,>,b);
Class for iterating over a list of terms.
IfB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
Xapian::Weight subclass implementing Coordinate Matching.
BM25Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
InvalidArgumentError indicates an invalid parameter value was passed to the API.
std::string name() const
Return the name of this weighting scheme.
Xapian::termcount & sum_squares
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const
Calculate the term-independent weight component for a document.
Xapian::termcount wdf_upper
TfIdfWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Class implementing a "boolean" weighting scheme.
This class provides read/write access to a database.
Indicates an error in the std::string serialisation of an object.
std::ostringstream tout
The debug printing stream.
Iterator over a Xapian::MSet.
Scale the weight contributed by a subquery.
Public interfaces for the Xapian library.
CheckStatsWeight(const Xapian::Database &db_, const string &term1_, const string &term2_, Xapian::termcount &sum_, Xapian::termcount &sum_squares_)
std::string serialise() const
Return this object's parameters serialised as a single string.
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
Weight * clone() const
Clone this object.
DPHWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
#define TEST_EXCEPTION(TYPE, CODE)
Check that CODE throws exactly Xapian exception TYPE.
std::string name() const
Return the name of this weighting scheme.
IneB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
std::string name() const
Return the name of this weighting scheme.
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
std::string name() const
Return the name of this weighting scheme.
Xapian::Weight subclass implementing the traditional probabilistic formula.
std::string serialise() const
Return this object's parameters serialised as a single string.
This class implements the DLH weighting scheme, which is a representative scheme of the Divergence fr...
std::string name() const
Return the name of this weighting scheme.
This class implements the PL2 weighting scheme.
std::string serialise() const
Return this object's parameters serialised as a single string.
This class implements the IneB2 weighting scheme.
BoolWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
std::string name() const
Return the name of this weighting scheme.
TermIterator allterms_begin(const std::string &prefix=std::string()) const
An iterator which runs across all terms with a given prefix.
#define TEST_EQUAL_DOUBLE(a, b)
Test two doubles for near equality.
std::string serialise() const
Return this object's parameters serialised as a single string.
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const
Calculate the term-independent weight component for a document.
void set_query(const Xapian::Query &query, Xapian::termcount qlen=0)
Set the query to run.
std::string serialise() const
Return this object's parameters serialised as a single string.
Match like OP_OR but weighting as if a single term.
This class implements the IfB2 weighting scheme.
#define FAIL_TEST(MSG)
Fail the current testcase with message MSG.
Match only documents which all subqueries match.
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
CheckStatsWeight(const Xapian::Database &db_, const string &term_, Xapian::termcount &sum_, Xapian::termcount &sum_squares_)
Xapian::Database get_database(const string &dbname)
std::string name() const
Return the name of this weighting scheme.
std::string get_description() const
Return a string describing this object.
This class provides an interface to the information retrieval system for the purpose of searching...
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
CoordWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
std::string name() const
Return the name of this weighting scheme.
std::string serialise() const
Return this object's parameters serialised as a single string.
Xapian::termcount len_upper
PL2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
void init(double factor_)
Allow the subclass to perform any initialisation it needs to.
This class implements the DPH weighting scheme.
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqueterms) const
Calculate the weight contribution for this object's term to a document.
Match documents which at least one subquery matches.
Xapian-specific test helper functions and macros.
std::string name() const
Return the name of this weighting scheme.
LMWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
void mset_expect_order(const Xapian::MSet &A, Xapian::docid d1, Xapian::docid d2, Xapian::docid d3, Xapian::docid d4, Xapian::docid d5, Xapian::docid d6, Xapian::docid d7, Xapian::docid d8, Xapian::docid d9, Xapian::docid d10, Xapian::docid d11, Xapian::docid d12)
void set_weighting_scheme(const Weight &weight_)
Set the weighting scheme to use for queries.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
std::string name() const
Return the name of this weighting scheme.
Class representing a query.
#define TEST_EQUAL(a, b)
Test for equality of two things.
static void gen_wdf_eq_doclen_db(Xapian::WritableDatabase &db, const string &)
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Xapian::Weight subclass implementing the Language Model formula.
std::string name() const
Return the name of this weighting scheme.
BB2Weight * unserialise(const std::string &serialised) const
Unserialise parameters.
std::string serialise() const
Return this object's parameters serialised as a single string.
Xapian::doccount get_termfreq(const std::string &tname) const
Get the number of documents in the database indexed by a given term.
A handle representing a document in a Xapian database.
Xapian::Weight subclass implementing the BM25+ probabilistic formula.
std::string name() const
Return the name of this weighting scheme.
Xapian::Weight subclass implementing the BM25 probabilistic formula.
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Xapian::Weight subclass implementing the tf-idf weighting scheme.
Xapian::termcount len_lower
void add_term(const std::string &tname, Xapian::termcount wdfinc=1)
Add a term to the document, without positional information.
Abstract base class for weighting schemes.
TradWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Xapian::termcount get_collection_freq(const std::string &tname) const
Return the total number of occurrences of the given term.
DLHWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
static const testcase testcases[]