xapian-core  2.0.0
localsubmatch.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2026 Olly Betts
5  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "localsubmatch.h"
25 
27 #include "backends/leafpostlist.h"
28 #include "debuglog.h"
29 #include "extraweightpostlist.h"
30 #include "omassert.h"
31 #include "queryoptimiser.h"
32 #include "synonympostlist.h"
33 #include "api/termlist.h"
34 #include "weight/weightinternal.h"
35 
36 #include "xapian/error.h"
37 
38 #include <memory>
39 #include <string>
40 
41 using namespace std;
42 
49 class LazyWeight : public Xapian::Weight {
51 
53 
55 
57 
59 
60  double factor;
61 
63 
64  LazyWeight* clone() const override;
65 
66  void init(double factor_) override;
67 
68  public:
70  Xapian::Weight * real_wt_,
71  Xapian::Weight::Internal * stats_,
72  Xapian::termcount qlen_,
73  Xapian::termcount wqf__,
74  double factor_,
75  const Xapian::Database::Internal* shard_)
76  : pl(pl_),
77  real_wt(real_wt_),
78  stats(stats_),
79  qlen(qlen_),
80  wqf(wqf__),
81  factor(factor_),
82  shard(shard_)
83  { }
84 
85  std::string name() const override;
86 
87  std::string serialise() const override;
88  LazyWeight* unserialise(const std::string& serialised) const override;
89 
90  double get_sumpart(Xapian::termcount wdf,
91  Xapian::termcount doclen,
92  Xapian::termcount uniqterms,
93  Xapian::termcount wdfdocmax) const override;
94  double get_maxpart() const override;
95 
96  double get_sumextra(Xapian::termcount doclen,
97  Xapian::termcount uniqterms,
98  Xapian::termcount wdfdocmax) const override;
99  double get_maxextra() const override;
100 };
101 
102 LazyWeight *
104 {
105  throw Xapian::InvalidOperationError("LazyWeight::clone()");
106 }
107 
108 void
109 LazyWeight::init(double factor_)
110 {
111  (void)factor_;
112  throw Xapian::InvalidOperationError("LazyWeight::init()");
113 }
114 
115 string
117 {
118  string desc = "LazyWeight(";
119  desc += real_wt->name();
120  desc += ")";
121  return desc;
122 }
123 
124 string
126 {
127  throw Xapian::InvalidOperationError("LazyWeight::serialise()");
128 }
129 
130 LazyWeight *
131 LazyWeight::unserialise(const string &) const
132 {
133  throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
134 }
135 
136 double
138  Xapian::termcount doclen,
139  Xapian::termcount uniqterms,
140  Xapian::termcount wdfdocmax) const
141 {
142  (void)wdf;
143  (void)doclen;
144  (void)uniqterms;
145  (void)wdfdocmax;
146  throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
147 }
148 
149 double
151  Xapian::termcount uniqterms,
152  Xapian::termcount wdfdocmax) const
153 {
154  (void)doclen;
155  (void)uniqterms;
156  (void)wdfdocmax;
157  throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
158 }
159 
160 double
162 {
163  // This gets called first for the case we care about.
164  return pl->resolve_lazy_termweight(real_wt, stats, qlen, wqf, factor, shard);
165 }
166 
167 double
169 {
170  throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
171 }
172 
175  Xapian::termcount * total_subqs_ptr)
176 {
177  LOGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist", matcher | total_subqs_ptr);
178 
179  if (query.empty() || db->get_doccount() == 0)
180  return {nullptr, nullptr}; // MatchNothing
181 
182  // Build the postlist tree for the query. This calls
183  // LocalSubMatch::open_post_list() for each term in the query.
184  PostListAndEstimate plest;
185  {
186  QueryOptimiser opt(*db, *this, matcher, shard_index);
187  double factor = wt_factory.is_bool_weight_() ? 0.0 : 1.0;
188  plest = query.internal->postlist(&opt, factor, NULL);
189  *total_subqs_ptr = opt.get_total_subqs();
190  }
191 
192  if (plest.pl) {
193  unique_ptr<Xapian::Weight> extra_wt(wt_factory.clone());
194  // Only uses term-independent stats.
195  extra_wt->init_(*total_stats, qlen, db);
196  if (extra_wt->get_maxextra() != 0.0) {
197  // There's a term-independent weight contribution, so we combine
198  // the postlist tree with an ExtraWeightPostList which adds in this
199  // contribution.
200  plest.pl = new ExtraWeightPostList(plest.pl, extra_wt.release(),
201  matcher);
202  }
203  }
204 
205  return plest;
206 }
207 
210  PostListAndEstimate or_pl,
211  double factor,
212  const TermFreqs& termfreqs)
213 {
214  LOGCALL(MATCH, PostListAndEstimate, "LocalSubMatch::make_synonym_postlist", pltree | or_pl | factor | termfreqs);
215  bool needs_doclen = wt_factory.get_sumpart_needs_doclength_();
216  unique_ptr<SynonymPostList> res(new SynonymPostList(or_pl.pl, pltree,
217  needs_doclen));
218  unique_ptr<Xapian::Weight> wt(wt_factory.clone());
219 
220  // We shortcut an empty shard and avoid creating a postlist tree for it,
221  // and all shards must be empty for collection_size to be zero.
222  Assert(total_stats->collection_size);
223  wt->init_(*total_stats, qlen, factor,
224  termfreqs.termfreq, termfreqs.reltermfreq, termfreqs.collfreq,
225  db);
226 
227  res->set_weight(wt.release());
228  RETURN({res.release(), std::move(or_pl.est)});
229 }
230 
233  Xapian::termcount wqf,
234  double factor,
235  bool need_positions,
236  bool compound_weight,
237  QueryOptimiser* qopt,
238  bool lazy_weight,
239  TermFreqs* termfreqs)
240 {
241  LOGCALL(MATCH, PostListAndEstimate, "LocalSubMatch::open_post_list", term | wqf | factor | need_positions | qopt | lazy_weight | termfreqs);
242 
243  bool weighted = false;
244 
245  LeafPostList * pl = NULL;
246  if (term.empty()) {
247  Assert(!need_positions);
248  pl = db->open_leaf_post_list(term, false);
249  } else {
250  weighted = (factor != 0.0);
251  const LeafPostList* hint = qopt->get_hint_postlist();
252  if (!hint || !hint->open_nearby_postlist(term, need_positions, pl)) {
253  pl = db->open_leaf_post_list(term, need_positions);
254  }
255  if (pl) qopt->set_hint_postlist(pl);
256  if (pl && !need_positions) {
257  bool need_wdf = (weighted || compound_weight) &&
258  wt_factory.get_sumpart_needs_wdf_();
259  if (!need_wdf && pl->get_termfreq() == qopt->db_size) {
260  // If we're not going to use the wdf or term positions, and the
261  // term indexes all documents, we can replace it with the
262  // MatchAll postlist, which is especially efficient if there
263  // are no gaps in the docids.
264  //
265  // We opened the real PostList already as that's more efficient
266  // than asking the Database for the termfreq in the common case
267  // when the term doesn't index all documents, and is similar
268  // work in the case where it does (for glass, there's the extra
269  // overhead of creating a cursor but we still need to read and
270  // decode the same data).
271  //
272  // The real PostList got set as the QueryOptimiser's hint above
273  // so we can just hand ownership of it to the QueryOptimiser.
274  qopt->own_hint_postlist();
275  pl = db->open_leaf_post_list(string(), false);
276  // We shortcut an empty shard and avoid creating a postlist
277  // tree for it, so an alldocs postlist can't be NULL here.
278  Assert(pl);
279 
280  // Set the term name so the postlist looks up the correct term
281  // frequencies - this is necessary if the weighting scheme
282  // needs collection frequency or reltermfreq (termfreq would be
283  // correct anyway since it's just the collection size in this
284  // case).
285  pl->set_term(term);
286  }
287  }
288  }
289 
290  if (pl && weighted) {
291  Xapian::Weight * wt = wt_factory.clone();
292  if (!lazy_weight) {
293  wt->init_(*total_stats, qlen, term, wqf, factor, db, pl);
294  if (pl->get_termfreq() > 0)
295  total_stats->set_max_part(term, wt->get_maxpart());
296  } else {
297  // Delay initialising the actual weight object, so that we can
298  // gather stats for the terms lazily expanded from a wildcard
299  // (needed for the remote database case).
300  wt = new LazyWeight(pl, wt, total_stats, qlen, wqf, factor, db);
301  }
302  pl->set_termweight(wt);
303  }
304 
305  if (termfreqs) {
306  if (term.empty()) {
307  *termfreqs = TermFreqs(total_stats->collection_size,
308  total_stats->rset_size,
309  total_stats->total_length);
310  } else if (!lazy_weight) {
311  auto i = total_stats->termfreqs.find(term);
312  Assert(i != total_stats->termfreqs.end());
313  *termfreqs = i->second;
314  }
315  }
316 
317  if (!pl) {
318  RETURN({nullptr, nullptr});
319  }
320 
321  Xapian::docid first = 1, last = Xapian::docid(-1);
322  pl->get_docid_range(first, last);
323 
324  EstimateOp* est = nullptr;
325  if (!qopt->get_no_estimates())
326  est = new EstimateOp(pl->get_termfreq(), first, last);
327  RETURN({pl, est});
328 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
char name[9]
Definition: dbcheck.cc:57
Class for estimating the total number of matching documents.
Definition: estimateop.h:64
PostList which adds on a term-independent weight contribution.
Xapian::Weight subclass which adds laziness.
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax) const override
Calculate the term-independent weight component for a document.
double factor
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms, Xapian::termcount wdfdocmax) const override
Calculate the weight contribution for this object's term to a document.
std::string name() const override
Return the name of this weighting scheme.
double get_maxextra() const override
Return an upper bound on what get_sumextra() can return for any document.
std::string serialise() const override
Return this object's parameters serialised as a single string.
const Xapian::Database::Internal * shard
LazyWeight * unserialise(const std::string &serialised) const override
Unserialise parameters.
LazyWeight(LeafPostList *pl_, Xapian::Weight *real_wt_, Xapian::Weight::Internal *stats_, Xapian::termcount qlen_, Xapian::termcount wqf__, double factor_, const Xapian::Database::Internal *shard_)
LazyWeight * clone() const override
Clone this object.
Xapian::Weight::Internal * stats
Xapian::termcount qlen
LeafPostList * pl
double get_maxpart() const override
Return an upper bound on what get_sumpart() can return for any document.
Xapian::termcount wqf
Xapian::Weight * real_wt
void init(double factor_) override
Allow the subclass to perform any initialisation it needs to.
Abstract base class for leaf postlists.
Definition: leafpostlist.h:40
void set_term(std::string_view term_)
Set the term name.
Definition: leafpostlist.h:155
void set_termweight(const Xapian::Weight *weight_)
Set the weighting scheme to use during matching.
Definition: leafpostlist.h:81
virtual bool open_nearby_postlist(std::string_view term_, bool need_read_pos, LeafPostList *&pl) const
Open another postlist from the same database.
Definition: leafpostlist.cc:69
PostListAndEstimate open_post_list(const std::string &term, Xapian::termcount wqf, double factor, bool need_positions, bool compound_weight, Xapian::Internal::QueryOptimiser *qopt, bool lazy_weight, TermFreqs *termfreqs)
PostListAndEstimate get_postlist(PostListTree *matcher, Xapian::termcount *total_subqs_ptr)
Get PostList.
PostListAndEstimate make_synonym_postlist(PostListTree *pltree, PostListAndEstimate or_pl, double factor, const TermFreqs &termfreqs)
Convert a postlist into a synonym postlist.
A postlist comprising several postlists SYNONYMed together.
Virtual base class for Database internals.
Abstract base class for postlists.
Definition: postlist.h:40
Xapian::doccount get_termfreq() const
Get an estimate of the number of documents this PostList will return.
Definition: postlist.h:67
virtual void get_docid_range(docid &first, docid &last) const
Get the bounds on the range of docids this PostList can return.
Definition: postlist.cc:72
Xapian::termcount get_total_subqs() const
void set_hint_postlist(LeafPostList *new_hint)
const LeafPostList * get_hint_postlist() const
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
bool empty() const noexcept
Check if this query is Xapian::Query::MatchNothing.
Definition: query.h:661
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: query.h:48
Class to hold statistics for a given collection.
Abstract base class for weighting schemes.
Definition: weight.h:38
virtual Weight * clone() const =0
Clone this object.
virtual double get_maxpart() const =0
Return an upper bound on what get_sumpart() can return for any document.
void init_(const Internal &stats, Xapian::termcount query_len_, const std::string &term, Xapian::termcount wqf_, double factor, const Xapian::Database::Internal *shard, void *postlist)
Definition: weight.cc:76
string term
Virtual base class for Database internals.
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
Hierarchy of classes which Xapian can throw as exceptions.
PostList which adds on a term-independent weight contribution.
Abstract base class for leaf postlists.
SubMatch class for a local database.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Details passed around while building PostList tree from Query tree.
std::unique_ptr< EstimateOp > est
Definition: estimateop.h:219
The frequencies for a term.
Xapian::doccount reltermfreq
Xapian::termcount collfreq
Combine subqueries, weighting as if they are synonyms.
Abstract base class for termlists.
Xapian::Weight::Internal class, holding database and term statistics.