xapian-core  1.4.25
localsubmatch.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2013,2014,2015,2016,2018,2020 Olly Betts
5  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "localsubmatch.h"
25 
26 #include "backends/database.h"
27 #include "debuglog.h"
28 #include "api/emptypostlist.h"
29 #include "extraweightpostlist.h"
30 #include "api/leafpostlist.h"
31 #include "omassert.h"
32 #include "queryoptimiser.h"
33 #include "synonympostlist.h"
34 #include "api/termlist.h"
35 #include "weight/weightinternal.h"
36 
37 #include "xapian/error.h"
38 
39 #include "autoptr.h"
40 #include <string>
41 
42 using namespace std;
43 
50 class LazyWeight : public Xapian::Weight {
52 
54 
56 
58 
60 
61  double factor;
62 
63  LazyWeight * clone() const;
64 
65  void init(double factor_);
66 
67  public:
69  Xapian::Weight * real_wt_,
70  Xapian::Weight::Internal * stats_,
71  Xapian::termcount qlen_,
72  Xapian::termcount wqf__,
73  double factor_)
74  : pl(pl_),
75  real_wt(real_wt_),
76  stats(stats_),
77  qlen(qlen_),
78  wqf(wqf__),
79  factor(factor_)
80  { }
81 
82  std::string name() const;
83 
84  std::string serialise() const;
85  LazyWeight * unserialise(const std::string & serialised) const;
86 
87  double get_sumpart(Xapian::termcount wdf,
88  Xapian::termcount doclen,
89  Xapian::termcount uniqterms) const;
90  double get_maxpart() const;
91 
92  double get_sumextra(Xapian::termcount doclen,
93  Xapian::termcount uniqterms) const;
94  double get_maxextra() const;
95 };
96 
97 LazyWeight *
99 {
100  throw Xapian::InvalidOperationError("LazyWeight::clone()");
101 }
102 
103 void
104 LazyWeight::init(double factor_)
105 {
106  (void)factor_;
107  throw Xapian::InvalidOperationError("LazyWeight::init()");
108 }
109 
110 string
112 {
113  string desc = "LazyWeight(";
114  desc += real_wt->name();
115  desc += ")";
116  return desc;
117 }
118 
119 string
121 {
122  throw Xapian::InvalidOperationError("LazyWeight::serialise()");
123 }
124 
125 LazyWeight *
126 LazyWeight::unserialise(const string &) const
127 {
128  throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
129 }
130 
131 double
133  Xapian::termcount doclen,
134  Xapian::termcount uniqterms) const
135 {
136  (void)wdf;
137  (void)doclen;
138  (void)uniqterms;
139  throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
140 }
141 
142 double
144  Xapian::termcount uniqterms) const
145 {
146  (void)doclen;
147  (void)uniqterms;
148  throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
149 }
150 
151 double
153 {
154  // This gets called first for the case we care about.
155  return pl->resolve_lazy_termweight(real_wt, stats, qlen, wqf, factor);
156 }
157 
158 double
160 {
161  throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
162 }
163 
164 bool
166  Xapian::Weight::Internal & total_stats)
167 {
168  LOGCALL(MATCH, bool, "LocalSubMatch::prepare_match", nowait | total_stats);
169  (void)nowait;
170  Assert(db);
171  total_stats.accumulate_stats(*db, rset);
172  RETURN(true);
173 }
174 
175 void
177  Xapian::doccount maxitems,
178  Xapian::doccount check_at_least,
179  Xapian::Weight::Internal & total_stats)
180 {
181  LOGCALL_VOID(MATCH, "LocalSubMatch::start_match", first | maxitems | check_at_least | total_stats);
182  (void)first;
183  (void)maxitems;
184  (void)check_at_least;
185  // Store a pointer to the total stats to use when building the Query tree.
186  stats = &total_stats;
187 }
188 
189 PostList *
191  Xapian::termcount* total_subqs_ptr,
193 {
194  LOGCALL(MATCH, PostList*, "LocalSubMatch::get_postlist", matcher | total_subqs_ptr | Literal("[total_subqs]"));
195 
196  if (query.empty() || db->get_doccount() == 0)
197  RETURN(new EmptyPostList); // MatchNothing
198 
199  // Build the postlist tree for the query. This calls
200  // LocalSubMatch::open_post_list() for each term in the query.
201  PostList * pl;
202  {
203  QueryOptimiser opt(*db, *this, matcher, shard_index);
204  double factor = wt_factory->is_bool_weight_() ? 0.0 : 1.0;
205  pl = query.internal->postlist(&opt, factor);
206  *total_subqs_ptr = opt.get_total_subqs();
207  }
208 
209  AutoPtr<Xapian::Weight> extra_wt(wt_factory->clone());
210  // Only uses term-independent stats.
211  extra_wt->init_(*stats, qlen);
212  if (extra_wt->get_maxextra() != 0.0) {
213  // There's a term-independent weight contribution, so we combine the
214  // postlist tree with an ExtraWeightPostList which adds in this
215  // contribution.
216  pl = new ExtraWeightPostList(pl, extra_wt.release(), matcher);
217  }
218 
219  RETURN(pl);
220 }
221 
222 PostList *
224  double factor,
225  bool wdf_disjoint)
226 {
227  LOGCALL(MATCH, PostList *, "LocalSubMatch::make_synonym_postlist", or_pl | matcher | factor | wdf_disjoint);
228  if (rare(or_pl->get_termfreq_max() == 0)) {
229  // or_pl is an EmptyPostList or equivalent.
230  return or_pl;
231  }
232  LOGVALUE(MATCH, or_pl->get_termfreq_est());
233  Xapian::termcount len_lb = db->get_doclength_lower_bound();
234  AutoPtr<SynonymPostList> res(new SynonymPostList(or_pl, matcher, len_lb,
235  wdf_disjoint));
236  AutoPtr<Xapian::Weight> wt(wt_factory->clone());
237 
238  TermFreqs freqs;
239  // Avoid calling get_termfreq_est_using_stats() if the database is empty
240  // so we don't need to special case that repeatedly when implementing it.
241  // FIXME: it would be nicer to handle an empty database higher up, though
242  // we need to catch the case where all the non-empty subdatabases have
243  // failed, so we can't just push this right up to the start of get_mset().
244  if (usual(stats->collection_size != 0)) {
245  freqs = or_pl->get_termfreq_est_using_stats(*stats);
246  }
247  wt->init_(*stats, qlen, factor,
248  freqs.termfreq, freqs.reltermfreq, freqs.collfreq);
249 
250  res->set_weight(wt.release());
251  RETURN(res.release());
252 }
253 
254 LeafPostList *
255 LocalSubMatch::open_post_list(const string& term,
256  Xapian::termcount wqf,
257  double factor,
258  bool need_positions,
259  bool in_synonym,
260  QueryOptimiser * qopt,
261  bool lazy_weight)
262 {
263  LOGCALL(MATCH, LeafPostList *, "LocalSubMatch::open_post_list", term | wqf | factor | need_positions | qopt | lazy_weight);
264 
265  bool weighted = (factor != 0.0 && !term.empty());
266 
267  LeafPostList * pl = NULL;
268  if (!term.empty() && !need_positions) {
269  if ((!weighted && !in_synonym) ||
270  !wt_factory->get_sumpart_needs_wdf_()) {
271  Xapian::doccount sub_tf;
272  db->get_freqs(term, &sub_tf, NULL);
273  if (sub_tf == qopt->db_size) {
274  // If we're not going to use the wdf or term positions, and the
275  // term indexes all documents, we can replace it with the
276  // MatchAll postlist, which is especially efficient if there
277  // are no gaps in the docids.
278  pl = db->open_post_list(string());
279  // Set the term name so the postlist looks up the correct term
280  // frequencies - this is necessary if the weighting scheme
281  // needs collection frequency or reltermfreq (termfreq would be
282  // correct anyway since it's just the collection size in this
283  // case).
284  pl->set_term(term);
285  }
286  }
287  }
288 
289  if (!pl) {
290  const LeafPostList * hint = qopt->get_hint_postlist();
291  if (hint)
292  pl = hint->open_nearby_postlist(term);
293  if (!pl)
294  pl = db->open_post_list(term);
295  qopt->set_hint_postlist(pl);
296  }
297 
298  if (lazy_weight) {
299  auto res = stats->termfreqs.emplace(term, TermFreqs());
300  if (res.second) {
301  // Term came from a wildcard, but the same term may be elsewhere
302  // in the query so only accumulate its TermFreqs if emplace()
303  // created a new element.
304  db->get_freqs(term,
305  &res.first->second.termfreq,
306  &res.first->second.collfreq);
307  }
308  }
309 
310  if (weighted) {
311  Xapian::Weight * wt = wt_factory->clone();
312  if (!lazy_weight) {
313  wt->init_(*stats, qlen, term, wqf, factor, pl);
314  if (pl->get_termfreq() > 0)
315  stats->set_max_part(term, wt->get_maxpart());
316  } else {
317  // Delay initialising the actual weight object, so that we can
318  // gather stats for the terms lazily expanded from a wildcard
319  // (needed for the remote database case).
320  wt = new LazyWeight(pl, wt, stats, qlen, wqf, factor);
321  }
322  pl->set_termweight(wt);
323  }
324  RETURN(pl);
325 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
Xapian::doccount db_size
Xapian::termcount wqf
Abstract base class for postlists.
Definition: postlist.h:37
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:283
A PostList which contains no entries.
A PostList which contains no entries.
Definition: emptypostlist.h:27
const LeafPostList * get_hint_postlist() const
#define usual(COND)
Definition: config.h:566
Xapian::Weight * real_wt
Xapian::Weight::Internal * stats
Xapian::Weight subclass which adds laziness.
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the weight contribution for this object's term to a document.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:488
STL namespace.
virtual Xapian::doccount get_termfreq() const =0
Return the exact term frequency.
void set_hint_postlist(LeafPostList *new_hint)
Abstract base class for leaf postlists.
Definition: leafpostlist.h:38
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: query.h:49
A postlist which adds on an extra weight contribution.
virtual Weight * clone() const =0
Clone this object.
Abstract base class for leaf postlists.
LazyWeight(LeafPostList *pl_, Xapian::Weight *real_wt_, Xapian::Weight::Internal *stats_, Xapian::termcount qlen_, Xapian::termcount wqf__, double factor_)
#define rare(COND)
Definition: config.h:565
virtual double get_maxpart() const =0
Return an upper bound on what get_sumpart() can return for any document.
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
bool prepare_match(bool nowait, Xapian::Weight::Internal &total_stats)
Fetch and collate statistics.
Hierarchy of classes which Xapian can throw as exceptions.
std::string name() const
Return the name of this weighting scheme.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
LazyWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: pretty.h:45
void set_termweight(const Xapian::Weight *weight_)
Set the weighting scheme to use during matching.
Definition: leafpostlist.cc:57
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the term-independent weight component for a document.
LazyWeight * clone() const
Clone this object.
Xapian::termcount get_total_subqs() const
PostList * get_postlist(MultiMatch *matcher, Xapian::termcount *total_subqs_ptr, Xapian::Weight::Internal &total_stats)
Get PostList.
void init_(const Internal &stats, Xapian::termcount query_len_, const std::string &term, Xapian::termcount wqf_, double factor)
Definition: weight.cc:93
LeafPostList * pl
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
virtual Xapian::doccount get_termfreq_max() const =0
Get an upper bound on the number of documents indexed by this term.
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
#define LOGVALUE(a, b)
Definition: debuglog.h:495
Xapian::Weight::Internal class, holding database and term statistics.
std::string serialise() const
Return this object's parameters serialised as a single string.
Class to hold statistics for a given collection.
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
A postlist comprising several postlists SYNONYMed together.
void init(double factor_)
Allow the subclass to perform any initialisation it needs to.
Details passed around while building PostList tree from Query tree.
void accumulate_stats(const Xapian::Database::Internal &sub_db, const Xapian::RSet &rset)
Accumulate the rtermfreqs for terms in the query.
SubMatch class for a local database.
Combine subqueries, weighting as if they are synonyms.
Xapian::termcount qlen
virtual LeafPostList * open_nearby_postlist(const std::string &term_) const
Open another postlist from the same database.
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
The frequencies for a term.
void set_term(const std::string &term_)
Set the term name.
Definition: leafpostlist.h:136
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
bool empty() const
Check if this query is Xapian::Query::MatchNothing.
Definition: query.h:524
double factor
LeafPostList * open_post_list(const std::string &term, Xapian::termcount wqf, double factor, bool need_positions, bool in_synonym, QueryOptimiser *qopt, bool lazy_weight)
Abstract base class for termlists.
PostList * make_synonym_postlist(PostList *or_pl, MultiMatch *matcher, double factor, bool wdf_disjoint)
Convert a postlist into a synonym postlist.
Various assertion macros.
Wrapper around standard unique_ptr template.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487
void start_match(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, Xapian::Weight::Internal &total_stats)
Start the match.
add on extra weight contribution
Abstract base class for weighting schemes.
Definition: weight.h:35