42 #include <string_view>
43 #include <unordered_set>
49 MSet::MSet(
const MSet&) =
default;
52 MSet::operator=(
const MSet&) =
default;
54 MSet::MSet(MSet&&) =
default;
57 MSet::operator=(MSet&&) =
default;
68 internal->fetch(first, last);
74 internal->set_item_weight(i, weight);
87 evaluate_dmset(
const vector<Xapian::docid>& dmset,
92 const vector<double>& dissimilarity)
94 double score_1 = 0, score_2 = 0;
99 for (
auto mset_index : dmset)
100 score_1 += mset[mset_index].get_weight();
102 auto cset_size = cset.
size();
104 double min_dist = numeric_limits<double>::max();
105 unsigned int pos = 1;
106 for (
auto mset_index : dmset) {
108 double weight = dissimilarity[mset_index * cset_size + c];
109 weight /= log(1.0 +
pos);
110 min_dist = min(min_dist, weight);
116 return factor2 * score_2 - factor1 * score_1;
128 auto mset_size =
size();
129 if (mset_size <= k) {
140 std::vector<Xapian::doccount> main_dmset;
141 main_dmset.reserve(k);
144 TermListGroup tlg(*
this);
145 std::vector<Xapian::Point> points;
146 points.reserve(mset_size);
155 main_dmset.push_back(count);
176 auto cset_size = cset.
size();
177 std::vector<double> dissimilarity;
178 dissimilarity.reserve(cset_size * points.size());
181 for (
const auto& point : points) {
182 for (
unsigned int c = 0; c < cset_size; ++c) {
183 double dist = d.
similarity(point, cset[c].get_centroid());
184 dissimilarity.push_back(1.0 - dist);
191 vector<Xapian::docid> topc;
195 auto documents = cset[c].get_documents();
196 auto limit = std::min(r, documents.size());
198 auto mset_index = documents[d].internal->get_index();
199 topc.push_back(mset_index);
203 vector<Xapian::doccount> curr_dmset = main_dmset;
206 bool found_better_dmset =
false;
207 for (
unsigned int i = 0; i < main_dmset.size(); ++i) {
208 auto curr_doc = main_dmset[i];
209 double best_score = evaluate_dmset(curr_dmset, cset,
211 *
this, dissimilarity);
212 bool found_better_doc =
false;
214 for (
unsigned int j = 0; j < topc.size(); ++j) {
217 auto candidate_doc = find(curr_dmset.begin(), curr_dmset.end(),
219 if (candidate_doc != curr_dmset.end()) {
223 auto temp_doc = curr_dmset[i];
224 curr_dmset[i] = topc[j];
225 double score = evaluate_dmset(curr_dmset, cset,
227 *
this, dissimilarity);
229 if (score < best_score) {
230 curr_doc = curr_dmset[i];
232 found_better_doc =
true;
235 curr_dmset[i] = temp_doc;
237 if (found_better_doc) {
238 curr_dmset[i] = curr_doc;
239 found_better_dmset =
true;
245 if (!found_better_dmset)
248 main_dmset = curr_dmset;
255 unordered_set<Xapian::docid> promoted{k};
256 for (
auto mset_index : main_dmset) {
261 [&](
const Result& result) {
262 return promoted.count(result.get_docid());
277 return internal->convert_to_percent(weight);
296 return internal->enquire->get_termfreq(
term);
314 return internal->first;
320 return internal->matches_lower_bound;
336 return internal->matches_upper_bound;
342 return internal->uncollapsed_lower_bound;
358 return internal->uncollapsed_upper_bound;
364 return internal->max_attained;
370 return internal->max_possible;
376 return internal->items.size();
384 std::string_view hi_start,
385 std::string_view hi_end,
386 std::string_view omit)
const
389 return internal->snippet(text, length,
stemmer, flags,
390 hi_start, hi_end, omit);
396 return internal->get_description();
402 if (index >=
items.size()) {
403 string msg =
"Requested index ";
405 msg +=
" in MSet of size ";
416 if (items.empty() || !enquire) {
419 if (last > items.size() - 1) {
420 last = items.size() - 1;
422 if (first_ <= last) {
425 enquire->request_document(items[i].get_docid());
437 max_attained = weight;
439 max_attained = max(max_attained, weight);
444 max_possible = max(max_possible, max_attained);
445 items[i].set_weight(weight);
452 if (percent_scale_factor == 0.0) {
455 }
else if (weight <= 0.0) {
467 percent = int(weight * percent_scale_factor + 100.0 * DBL_EPSILON);
471 }
else if (percent > 100) {
485 for (
auto& result : items) {
486 result.unshard_docid(shard, n_shards);
493 if (snippet_bg_relevance.empty()) {
536 pack_uint(result, uncollapsed_lower_bound);
537 pack_uint(result, uncollapsed_estimated);
538 pack_uint(result, uncollapsed_upper_bound);
541 for (
auto&& item : items) {
546 pack_uint(result, item.get_collapse_count());
576 for ( ; msize; --msize) {
579 string sort_key, key;
587 items.emplace_back(wt, did, std::move(key), collapse_cnt,
588 std::move(sort_key));
600 string desc =
"MSet(matches_lower_bound=";
601 desc +=
str(matches_lower_bound);
602 desc +=
", matches_estimated=";
603 desc +=
str(matches_estimated);
604 desc +=
", matches_upper_bound=";
605 desc +=
str(matches_upper_bound);
606 if (uncollapsed_lower_bound != matches_lower_bound) {
607 desc +=
", uncollapsed_lower_bound=";
608 desc +=
str(uncollapsed_lower_bound);
610 if (uncollapsed_estimated != matches_estimated) {
611 desc +=
", uncollapsed_estimated=";
612 desc +=
str(uncollapsed_estimated);
614 if (uncollapsed_upper_bound != matches_upper_bound) {
615 desc +=
", uncollapsed_upper_bound=";
616 desc +=
str(uncollapsed_upper_bound);
622 if (max_possible > 0) {
623 desc +=
", max_possible=";
624 desc +=
str(max_possible);
626 if (max_attained > 0) {
627 desc +=
", max_attained=";
628 desc +=
str(max_attained);
632 for (
auto&& item : items) {
638 desc += item.get_description();
Class for storing the results returned by the Clusterer.
Xapian::doccount size() const
Return the number of clusters.
Class for calculating the cosine distance between two documents.
double similarity(const PointType &a, const PointType &b) const override
Calculates and returns the cosine similarity using the formula cos(theta) = a.b/(|a|*|b|)
Class representing a document.
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
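LCD clusterer: This clusterer implements the LCD clustering algorithm adapted from "Modelling efficient novelty-based search result diversification in metric spaces" (Gil-Costa, Santos, Macdonald and Ounis, 2013).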
ClusterSet cluster(const MSet &mset) override
Implements the LCD clustering algorithm.
Xapian::doccount uncollapsed_upper_bound
std::string serialise() const
Serialise this object.
int convert_to_percent(double weight) const
Xapian::Internal::intrusive_ptr< const Enquire::Internal > enquire
std::unordered_map< std::string, double > snippet_bg_relevance
Relevance weights for non-query terms for generating snippets.
std::string get_description() const
Return a string describing this object.
std::unique_ptr< Xapian::Weight::Internal > stats
For looking up query term frequencies and weights.
std::vector< Result > items
The items in the MSet.
Xapian::doccount uncollapsed_lower_bound
Xapian::doccount matches_estimated
void unshard_docids(Xapian::doccount shard, Xapian::doccount n_shards)
void unserialise(const char *p, const char *p_end)
Unserialise a serialised Xapian::MSet::Internal object.
Xapian::Document get_document(Xapian::doccount index) const
void merge_stats(const Internal *o, bool collapsing)
void fetch(Xapian::doccount first, Xapian::doccount last) const
void set_item_weight(Xapian::doccount i, double weight)
Xapian::doccount matches_lower_bound
double percent_scale_factor
Scale factor to convert weights to percentages.
Xapian::doccount matches_upper_bound
Xapian::doccount uncollapsed_estimated
Class representing a list of search results.
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Xapian::doccount get_termfreq(std::string_view term) const
Get the termfreq of a term.
void sort_by_relevance()
Sorts the list of documents in MSet according to their weights.
void set_item_weight(Xapian::doccount i, double wt)
Update the weight corresponding to the document indexed at position i with wt.
Xapian::doccount size() const
Return number of items in this MSet object.
MSet()
Default constructor.
double get_max_possible() const
The maximum possible weight any document could achieve.
void fetch_(Xapian::doccount first, Xapian::doccount last) const
Xapian::doccount get_uncollapsed_matches_upper_bound() const
Upper bound on the total number of matching documents before collapsing.
friend class MSetIterator
Xapian::doccount get_uncollapsed_matches_estimated() const
Estimate of the total number of matching documents before collapsing.
Xapian::doccount get_uncollapsed_matches_lower_bound() const
Lower bound on the total number of matching documents before collapsing.
int convert_to_percent(double weight) const
Convert a weight to a percentage.
std::string get_description() const
Return a string describing this object.
Xapian::doccount get_firstitem() const
Rank of first item in this MSet.
double get_termweight(std::string_view term) const
Get the term weight of a term.
Xapian::doccount get_matches_upper_bound() const
Upper bound on the total number of matching documents.
MSetIterator begin() const
Return iterator pointing to the first item in this MSet.
std::string snippet(std::string_view text, size_t length=500, const Xapian::Stem &stemmer=Xapian::Stem(), unsigned flags=SNIPPET_BACKGROUND_MODEL|SNIPPET_EXHAUSTIVE, std::string_view hi_start="<b>", std::string_view hi_end="</b>", std::string_view omit="...") const
Generate a snippet.
double get_max_attained() const
The maximum weight attained by any document.
Xapian::doccount get_matches_lower_bound() const
Lower bound on the total number of matching documents.
MSetIterator end() const
Return iterator pointing to just after the last item in this MSet.
Xapian::doccount get_matches_estimated() const
Estimate of the total number of matching documents.
Class to represent a document as a point in the Vector Space Model.
RangeError indicates an attempt to access outside the bounds of a container.
Class representing a stemming algorithm.
Class to hold statistics for a given collection.
Append a string to an object description, escaping invalid UTF-8.
Abstract base class for a document.
Class representing a list of search results.
MSetCmp get_msetcmp_function(Xapian::Enquire::Internal::sort_setting sort_by, bool sort_forward, bool sort_val_reverse)
Select the appropriate msetcmp function.
Result comparison functions.
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
string str(int value)
Convert int to std::string.
The Xapian namespace contains public interfaces for the Xapian library.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Various assertion macros.
#define AssertRel(A, REL, B)
void unpack_throw_serialisation_error(const char *p)
Throw appropriate SerialisationError.
Pack types into strings and unpack them again.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
void pack_string(std::string &s, std::string_view value)
Append an encoded std::string to a string.
Round a bounded estimate to an appropriate number of significant figures.
Xapian::doccount round_estimate(T lb, T ub, T est)
Round a bounded estimate to an appropriate number of significant figures.
string serialise_double(double v)
Serialise a double to a string.
double unserialise_double(const char **p, const char *end)
Unserialise a double serialised by serialise_double.
functions to serialise and unserialise a double
string serialise_stats(const Xapian::Weight::Internal &stats)
Serialise a stats object.
void unserialise_stats(const char *p, const char *p_end, Xapian::Weight::Internal &stat)
Unserialise a serialised stats object.
functions to convert classes to strings and back
static Xapian::Stem stemmer
Convert types to std::string.