xapian-core  1.4.21
localsubmatch.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2013,2014,2015,2016,2018,2020 Olly Betts
5  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "localsubmatch.h"
25 
26 #include "backends/database.h"
27 #include "debuglog.h"
28 #include "api/emptypostlist.h"
29 #include "extraweightpostlist.h"
30 #include "api/leafpostlist.h"
31 #include "omassert.h"
32 #include "queryoptimiser.h"
33 #include "synonympostlist.h"
34 #include "api/termlist.h"
35 #include "weight/weightinternal.h"
36 
37 #include "xapian/error.h"
38 
39 #include "autoptr.h"
40 #include <map>
41 #include <string>
42 
43 using namespace std;
44 
51 class LazyWeight : public Xapian::Weight {
53 
55 
57 
59 
61 
62  double factor;
63 
64  LazyWeight * clone() const;
65 
66  void init(double factor_);
67 
68  public:
70  Xapian::Weight * real_wt_,
71  Xapian::Weight::Internal * stats_,
72  Xapian::termcount qlen_,
73  Xapian::termcount wqf__,
74  double factor_)
75  : pl(pl_),
76  real_wt(real_wt_),
77  stats(stats_),
78  qlen(qlen_),
79  wqf(wqf__),
80  factor(factor_)
81  { }
82 
83  std::string name() const;
84 
85  std::string serialise() const;
86  LazyWeight * unserialise(const std::string & serialised) const;
87 
88  double get_sumpart(Xapian::termcount wdf,
89  Xapian::termcount doclen,
90  Xapian::termcount uniqterms) const;
91  double get_maxpart() const;
92 
93  double get_sumextra(Xapian::termcount doclen,
94  Xapian::termcount uniqterms) const;
95  double get_maxextra() const;
96 };
97 
98 LazyWeight *
100 {
101  throw Xapian::InvalidOperationError("LazyWeight::clone()");
102 }
103 
104 void
105 LazyWeight::init(double factor_)
106 {
107  (void)factor_;
108  throw Xapian::InvalidOperationError("LazyWeight::init()");
109 }
110 
111 string
113 {
114  string desc = "LazyWeight(";
115  desc += real_wt->name();
116  desc += ")";
117  return desc;
118 }
119 
120 string
122 {
123  throw Xapian::InvalidOperationError("LazyWeight::serialise()");
124 }
125 
126 LazyWeight *
127 LazyWeight::unserialise(const string &) const
128 {
129  throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
130 }
131 
132 double
134  Xapian::termcount doclen,
135  Xapian::termcount uniqterms) const
136 {
137  (void)wdf;
138  (void)doclen;
139  (void)uniqterms;
140  throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
141 }
142 
143 double
145  Xapian::termcount uniqterms) const
146 {
147  (void)doclen;
148  (void)uniqterms;
149  throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
150 }
151 
152 double
154 {
155  // This gets called first for the case we care about.
156  return pl->resolve_lazy_termweight(real_wt, stats, qlen, wqf, factor);
157 }
158 
159 double
161 {
162  throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
163 }
164 
165 bool
167  Xapian::Weight::Internal & total_stats)
168 {
169  LOGCALL(MATCH, bool, "LocalSubMatch::prepare_match", nowait | total_stats);
170  (void)nowait;
171  Assert(db);
172  total_stats.accumulate_stats(*db, rset);
173  RETURN(true);
174 }
175 
176 void
178  Xapian::doccount maxitems,
179  Xapian::doccount check_at_least,
180  Xapian::Weight::Internal & total_stats)
181 {
182  LOGCALL_VOID(MATCH, "LocalSubMatch::start_match", first | maxitems | check_at_least | total_stats);
183  (void)first;
184  (void)maxitems;
185  (void)check_at_least;
186  // Store a pointer to the total stats to use when building the Query tree.
187  stats = &total_stats;
188 }
189 
190 PostList *
192  Xapian::termcount* total_subqs_ptr,
194 {
195  LOGCALL(MATCH, PostList*, "LocalSubMatch::get_postlist", matcher | total_subqs_ptr | Literal("[total_subqs]"));
196 
197  if (query.empty() || db->get_doccount() == 0)
198  RETURN(new EmptyPostList); // MatchNothing
199 
200  // Build the postlist tree for the query. This calls
201  // LocalSubMatch::open_post_list() for each term in the query.
202  PostList * pl;
203  {
204  QueryOptimiser opt(*db, *this, matcher, shard_index);
205  double factor = wt_factory->is_bool_weight_() ? 0.0 : 1.0;
206  pl = query.internal->postlist(&opt, factor);
207  *total_subqs_ptr = opt.get_total_subqs();
208  }
209 
210  AutoPtr<Xapian::Weight> extra_wt(wt_factory->clone());
211  // Only uses term-independent stats.
212  extra_wt->init_(*stats, qlen);
213  if (extra_wt->get_maxextra() != 0.0) {
214  // There's a term-independent weight contribution, so we combine the
215  // postlist tree with an ExtraWeightPostList which adds in this
216  // contribution.
217  pl = new ExtraWeightPostList(pl, extra_wt.release(), matcher);
218  }
219 
220  RETURN(pl);
221 }
222 
223 PostList *
225  double factor,
226  bool wdf_disjoint)
227 {
228  LOGCALL(MATCH, PostList *, "LocalSubMatch::make_synonym_postlist", or_pl | matcher | factor | wdf_disjoint);
229  if (rare(or_pl->get_termfreq_max() == 0)) {
230  // or_pl is an EmptyPostList or equivalent.
231  return or_pl;
232  }
233  LOGVALUE(MATCH, or_pl->get_termfreq_est());
234  Xapian::termcount len_lb = db->get_doclength_lower_bound();
235  AutoPtr<SynonymPostList> res(new SynonymPostList(or_pl, matcher, len_lb,
236  wdf_disjoint));
237  AutoPtr<Xapian::Weight> wt(wt_factory->clone());
238 
239  TermFreqs freqs;
240  // Avoid calling get_termfreq_est_using_stats() if the database is empty
241  // so we don't need to special case that repeatedly when implementing it.
242  // FIXME: it would be nicer to handle an empty database higher up, though
243  // we need to catch the case where all the non-empty subdatabases have
244  // failed, so we can't just push this right up to the start of get_mset().
245  if (usual(stats->collection_size != 0)) {
246  freqs = or_pl->get_termfreq_est_using_stats(*stats);
247  }
248  wt->init_(*stats, qlen, factor,
249  freqs.termfreq, freqs.reltermfreq, freqs.collfreq);
250 
251  res->set_weight(wt.release());
252  RETURN(res.release());
253 }
254 
255 LeafPostList *
256 LocalSubMatch::open_post_list(const string& term,
257  Xapian::termcount wqf,
258  double factor,
259  bool need_positions,
260  bool in_synonym,
261  QueryOptimiser * qopt,
262  bool lazy_weight)
263 {
264  LOGCALL(MATCH, LeafPostList *, "LocalSubMatch::open_post_list", term | wqf | factor | need_positions | qopt | lazy_weight);
265 
266  bool weighted = (factor != 0.0 && !term.empty());
267 
268  LeafPostList * pl = NULL;
269  if (!term.empty() && !need_positions) {
270  if ((!weighted && !in_synonym) ||
271  !wt_factory->get_sumpart_needs_wdf_()) {
272  Xapian::doccount sub_tf;
273  db->get_freqs(term, &sub_tf, NULL);
274  if (sub_tf == qopt->db_size) {
275  // If we're not going to use the wdf or term positions, and the
276  // term indexes all documents, we can replace it with the
277  // MatchAll postlist, which is especially efficient if there
278  // are no gaps in the docids.
279  pl = db->open_post_list(string());
280  // Set the term name so the postlist looks up the correct term
281  // frequencies - this is necessary if the weighting scheme
282  // needs collection frequency or reltermfreq (termfreq would be
283  // correct anyway since it's just the collection size in this
284  // case).
285  pl->set_term(term);
286  }
287  }
288  }
289 
290  if (!pl) {
291  const LeafPostList * hint = qopt->get_hint_postlist();
292  if (hint)
293  pl = hint->open_nearby_postlist(term);
294  if (!pl)
295  pl = db->open_post_list(term);
296  qopt->set_hint_postlist(pl);
297  }
298 
299  if (lazy_weight) {
300  auto res = stats->termfreqs.emplace(term, TermFreqs());
301  if (res.second) {
302  // Term came from a wildcard, but the same term may be elsewhere
303  // in the query so only accumulate its TermFreqs if emplace()
304  // created a new element.
305  db->get_freqs(term,
306  &res.first->second.termfreq,
307  &res.first->second.collfreq);
308  }
309  }
310 
311  if (weighted) {
312  Xapian::Weight * wt = wt_factory->clone();
313  if (!lazy_weight) {
314  wt->init_(*stats, qlen, term, wqf, factor, pl);
315  if (pl->get_termfreq() > 0)
316  stats->set_max_part(term, wt->get_maxpart());
317  } else {
318  // Delay initialising the actual weight object, so that we can
319  // gather stats for the terms lazily expanded from a wildcard
320  // (needed for the remote database case).
321  wt = new LazyWeight(pl, wt, stats, qlen, wqf, factor);
322  }
323  pl->set_termweight(wt);
324  }
325  RETURN(pl);
326 }
#define RETURN(A)
Definition: debuglog.h:482
#define Assert(COND)
Definition: omassert.h:122
Xapian::doccount db_size
Xapian::termcount wqf
Abstract base class for postlists.
Definition: postlist.h:37
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:283
A PostList which contains no entries.
A PostList which contains no entries.
Definition: emptypostlist.h:27
const LeafPostList * get_hint_postlist() const
#define usual(COND)
Definition: config.h:574
Xapian::Weight * real_wt
Xapian::Weight::Internal * stats
Xapian::Weight subclass which adds laziness.
double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the weight contribution for this object's term to a document.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:477
STL namespace.
virtual Xapian::doccount get_termfreq() const =0
Return the exact term frequency.
void set_hint_postlist(LeafPostList *new_hint)
Abstract base class for leaf postlists.
Definition: leafpostlist.h:38
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: query.h:49
A postlist which adds on an extra weight contribution.
virtual Weight * clone() const =0
Clone this object.
Abstract base class for leaf postlists.
LazyWeight(LeafPostList *pl_, Xapian::Weight *real_wt_, Xapian::Weight::Internal *stats_, Xapian::termcount qlen_, Xapian::termcount wqf__, double factor_)
#define rare(COND)
Definition: config.h:573
virtual double get_maxpart() const =0
Return an upper bound on what get_sumpart() can return for any document.
double get_maxextra() const
Return an upper bound on what get_sumextra() can return for any document.
bool prepare_match(bool nowait, Xapian::Weight::Internal &total_stats)
Fetch and collate statistics.
Hierarchy of classes which Xapian can throw as exceptions.
std::string name() const
Return the name of this weighting scheme.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
LazyWeight * unserialise(const std::string &serialised) const
Unserialise parameters.
Definition: pretty.h:45
void set_termweight(const Xapian::Weight *weight_)
Set the weighting scheme to use during matching.
Definition: leafpostlist.cc:57
double get_sumextra(Xapian::termcount doclen, Xapian::termcount uniqterms) const
Calculate the term-independent weight component for a document.
LazyWeight * clone() const
Clone this object.
Xapian::termcount get_total_subqs() const
PostList * get_postlist(MultiMatch *matcher, Xapian::termcount *total_subqs_ptr, Xapian::Weight::Internal &total_stats)
Get PostList.
void init_(const Internal &stats, Xapian::termcount query_len_, const std::string &term, Xapian::termcount wqf_, double factor)
Definition: weight.cc:93
LeafPostList * pl
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
virtual Xapian::doccount get_termfreq_max() const =0
Get an upper bound on the number of documents indexed by this term.
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
#define LOGVALUE(a, b)
Definition: debuglog.h:484
Xapian::Weight::Internal class, holding database and term statistics.
std::string serialise() const
Return this object's parameters serialised as a single string.
Class to hold statistics for a given collection.
double get_maxpart() const
Return an upper bound on what get_sumpart() can return for any document.
A postlist comprising several postlists SYNONYMed together.
void init(double factor_)
Allow the subclass to perform any initialisation it needs to.
Details passed around while building PostList tree from Query tree.
void accumulate_stats(const Xapian::Database::Internal &sub_db, const Xapian::RSet &rset)
Accumulate the rtermfreqs for terms in the query.
SubMatch class for a local database.
Combine subqueries, weighting as if they are synonyms.
Xapian::termcount qlen
virtual LeafPostList * open_nearby_postlist(const std::string &term_) const
Open another postlist from the same database.
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63
The frequencies for a term.
void set_term(const std::string &term_)
Set the term name.
Definition: leafpostlist.h:136
char name[9]
Definition: dbcheck.cc:55
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
bool empty() const
Check if this query is Xapian::Query::MatchNothing.
Definition: query.h:524
double factor
LeafPostList * open_post_list(const std::string &term, Xapian::termcount wqf, double factor, bool need_positions, bool in_synonym, QueryOptimiser *qopt, bool lazy_weight)
Abstract base class for termlists.
PostList * make_synonym_postlist(PostList *or_pl, MultiMatch *matcher, double factor, bool wdf_disjoint)
Convert a postlist into a synonym postlist.
Various assertion macros.
Wrapper around standard unique_ptr template.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:476
void start_match(Xapian::doccount first, Xapian::doccount maxitems, Xapian::doccount check_at_least, Xapian::Weight::Internal &total_stats)
Start the match.
add on extra weight contribution
Abstract base class for weighting schemes.
Definition: weight.h:35