xapian-core  2.0.0
maxpostlist.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2009,2010,2011,2012,2013,2017,2018 Olly Betts
5  * Copyright (C) 2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef XAPIAN_INCLUDED_MAXPOSTLIST_H
23 #define XAPIAN_INCLUDED_MAXPOSTLIST_H
24 
25 #include "backends/postlist.h"
26 #include "postlisttree.h"
27 
28 #include <algorithm>
29 
31 class MaxPostList : public PostList {
33  void operator=(const MaxPostList &);
34 
37 
40 
42  size_t n_kids;
43 
45  PostList** plist = nullptr;
46 
49 
51  void erase_sublist(size_t i) {
52  delete plist[i];
53  --n_kids;
54  for (size_t j = i; j < n_kids; ++j) {
55  plist[j] = plist[j + 1];
56  }
58  }
59 
60  public:
64  template<class RandomItor>
65  MaxPostList(RandomItor pl_begin, RandomItor pl_end,
66  PostListTree* matcher_, Xapian::doccount db_size)
67  : n_kids(pl_end - pl_begin), matcher(matcher_)
68  {
69  plist = new PostList * [n_kids];
70  std::copy(pl_begin, pl_end, plist);
71 
72  // We shortcut an empty shard and avoid creating a postlist tree for it.
73  Assert(db_size);
74 
75  // We calculate the estimate assuming independence. The simplest
76  // way to calculate this seems to be a series of (n_kids - 1) pairwise
77  // calculations, which gives the same answer regardless of the order.
78  double scale = 1.0 / db_size;
79  double P_est = plist[0]->get_termfreq() * scale;
80  for (size_t i = 1; i < n_kids; ++i) {
81  double P_i = plist[i]->get_termfreq() * scale;
82  P_est += P_i - P_est * P_i;
83  }
84  termfreq = static_cast<Xapian::doccount>(P_est * db_size + 0.5);
85  }
86 
87  ~MaxPostList();
88 
89  Xapian::docid get_docid() const;
90 
91  double get_weight(Xapian::termcount doclen,
92  Xapian::termcount unique_terms,
93  Xapian::termcount wdfdocmax) const;
94 
95  bool at_end() const;
96 
97  double recalc_maxweight();
98 
100  return NULL;
101  }
102 
103  PostList* next(double w_min);
104 
105  PostList* skip_to(Xapian::docid, double w_min);
106 
107  void get_docid_range(Xapian::docid& first, Xapian::docid& last) const;
108 
109  std::string get_description() const;
110 
118  Xapian::termcount get_wdf() const;
119 
121 };
122 
123 #endif // XAPIAN_INCLUDED_MAXPOSTLIST_H
N-way OR postlist with wt=max(wt_i).
Definition: maxpostlist.h:31
MaxPostList(const MaxPostList &)
Don't allow copying.
void operator=(const MaxPostList &)
Don't allow assignment.
void get_docid_range(Xapian::docid &first, Xapian::docid &last) const
Get the bounds on the range of docids this PostList can return.
Definition: maxpostlist.cc:170
Xapian::termcount count_matching_subqs() const
Count the number of leaf subqueries which match at the current position.
Definition: maxpostlist.cc:206
double recalc_maxweight()
Recalculate the upper bound on what get_weight() can return.
Definition: maxpostlist.cc:70
void erase_sublist(size_t i)
Erase a sub-postlist.
Definition: maxpostlist.h:51
double get_weight(Xapian::termcount doclen, Xapian::termcount unique_terms, Xapian::termcount wdfdocmax) const
Return the weight contribution for the current position.
Definition: maxpostlist.cc:48
Xapian::docid did
The current docid, or zero if we haven't started or are at_end.
Definition: maxpostlist.h:39
bool at_end() const
Return true if the current position is past the last entry in this list.
Definition: maxpostlist.cc:64
PositionList * read_position_list()
Read the position list for the term in the current document and return a pointer to it (owned by the ...
Definition: maxpostlist.h:99
PostList ** plist
Array of pointers to sub-postlists.
Definition: maxpostlist.h:45
Xapian::termcount get_wdf() const
get_wdf() for MaxPostlist returns the sum of the wdfs of the sub postlists which match the current do...
Definition: maxpostlist.cc:195
Xapian::docid get_docid() const
Return the current docid.
Definition: maxpostlist.cc:42
size_t n_kids
The number of sub-postlists.
Definition: maxpostlist.h:42
PostList * skip_to(Xapian::docid, double w_min)
Skip forward to the specified docid.
Definition: maxpostlist.cc:130
std::string get_description() const
Return a string description of this object.
Definition: maxpostlist.cc:182
PostListTree * matcher
Pointer to the matcher object, so we can report pruning.
Definition: maxpostlist.h:48
MaxPostList(RandomItor pl_begin, RandomItor pl_end, PostListTree *matcher_, Xapian::doccount db_size)
Construct from 2 random-access iterators to a container of PostList*, a pointer to the matcher,...
Definition: maxpostlist.h:65
void force_recalc()
Definition: postlisttree.h:125
Abstract base class for postlists.
Definition: postlist.h:40
Xapian::doccount get_termfreq() const
Get an estimate of the number of documents this PostList will return.
Definition: postlist.h:67
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for postlists.
Class for managing a tree of PostList objects.