xapian-core  2.0.0
phrasepostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "phrasepostlist.h"
24 
25 #include "debuglog.h"
26 #include "backends/positionlist.h"
27 #include "omassert.h"
28 #include "str.h"
29 
30 #include <algorithm>
31 #include <vector>
32 
33 using namespace std;
34 
// PhrasePostList constructor.
// NOTE(review): this listing omits source line 35, which opens the
// signature; per the declaration index at the end of the page the first
// parameter is `PostList *source_`.
//
// Passes source_, estimate_op_ and pltree_ on to the SelectPostList base,
// records the window size, and copies the [terms_begin, terms_end) range of
// PostList pointers into `terms`.
36  EstimateOp* estimate_op_,
37  Xapian::termpos window_,
38  const vector<PostList*>::const_iterator &terms_begin,
39  const vector<PostList*>::const_iterator &terms_end,
40  PostListTree* pltree_)
41  : SelectPostList(source_, estimate_op_, pltree_),
42  window(window_),
43  terms(terms_begin, terms_end)
44 {
45  size_t n = terms.size();
 // Fewer than two terms is checked only through the Assert macro.
46  Assert(n > 1);
 // Allocate one PositionList* slot per term.  Nothing is stored in the
 // slots here; the only assignment to poslists[i] in this listing is the
 // read_position_list() call further down.
47  poslists = new PositionList*[n];
48 
49  // It's hard to estimate how many times the phrase will occur as
50  // it depends a lot on the phrase, but usually the phrase will
51  // occur significantly less often than the individual terms.
 // NOTE(review): `pl` is not declared anywhere in this listing - presumably
 // the source postlist held by the SelectPostList base; confirm.
52  termfreq = pl->get_termfreq() / 3;
53 }
54 
// Body that releases the poslists array allocated with new[] in the
// constructor.
// NOTE(review): the signature line (source line 55) is missing from this
// listing; presumably this is the PhrasePostList destructor - confirm.
56 {
 // Only the pointer array itself is freed; the PositionList objects the
 // slots point at are not deleted here.
 // NOTE(review): presumably they are owned by the postlists that returned
 // them - confirm.
57  delete [] poslists;
58 }
59 
// Start reading from the i-th position list: ask term i's postlist for its
// position list and store the result in slot i of poslists.
// NOTE(review): the line carrying the function name (source line 61) is
// missing from this listing; the declaration index gives
// `void start_position_list(unsigned i)`.
60 void
62 {
63  poslists[i] = terms[i]->read_position_list();
64 }
65 
// Test whether the current document contains the terms as a phrase
// (PhrasePostList::test_doc() according to the LOGCALL below; the line
// carrying the function name, source line 67, is missing from this listing).
//
// Returns true and increments `accepted` on a match; otherwise returns false
// and increments `rejected`.
66 bool
68 {
69  LOGCALL(MATCH, bool, "PhrasePostList::test_doc", NO_ARGS);
70 
 // NOTE(review): source line 71 is missing here.  poslists[0] is used on
 // the next line, so presumably the omitted line is start_position_list(0)
 // - confirm against the original file.
72  if (!poslists[0]->next()) {
 // A false return from next() is treated as "no position available for
 // the first term", so the document cannot match.
73  ++rejected;
74  RETURN(false);
75  }
76 
 // Highest term index the inner loop has reached so far in this call.
77  unsigned read_hwm = 0;
 // NOTE(review): source line 78 is missing.  `b` is assigned inside the
 // inner loop and read in both loop conditions without a visible
 // declaration, so it is presumably declared here - confirm.
79  do {
 // NOTE(review): source line 80 is missing.  `base` is read below without
 // a visible declaration - presumably the current position of term 0,
 // i.e. the candidate start of the phrase; confirm.
81  Xapian::termpos pos = base;
82  unsigned i = 0;
83  do {
 // Every term has been placed in order, so the phrase matches.
84  if (++i == terms.size()) {
85  ++accepted;
86  RETURN(true);
87  }
 // First time this call has reached term i.
88  if (i > read_hwm) {
89  read_hwm = i;
 // NOTE(review): source line 90 is missing; presumably
 // start_position_list(i), so that a term's positions are only fetched
 // once the earlier terms have lined up - confirm.
91  }
 // Term i must occur strictly after the previous term's position; if
 // skip_to() reports failure the whole document is rejected.
92  if (!poslists[i]->skip_to(pos + 1)) {
93  goto reject;
94  }
95  pos = poslists[i]->get_position();
 // Each of the terms.size() - 1 - i terms still to be placed needs at
 // least one further position, so b is one past the earliest position
 // the last term could occupy and b - base is the smallest span the
 // phrase can cover with term i at pos.
96  b = pos + (terms.size() - i);
97  } while (b - base <= window);
98  // Advance the start of the window to the first position it could match
99  // in given the current position of term i.
 // Any start before b - window would need a span larger than window.
100  } while (poslists[0]->skip_to(b - window));
 // Reached via the goto above, or by falling out of the outer loop when
 // skip_to() on the first term returns false.
101 reject:
102  ++rejected;
103  RETURN(false);
104 }
105 
// Body of get_wdf() const: returns the smallest wdf reported by any of the
// term postlists.
// NOTE(review): the lines carrying the return type and function name
// (source lines 106-107) are missing from this listing; the declaration
// index gives `Xapian::termcount get_wdf() const`.
108 {
109  // Calculate an estimate for the wdf of a phrase postlist.
110  //
111  // We use the minimum wdf of a sub-postlist as our estimate. See the
112  // comment in NearPostList::get_wdf() for justification of this estimate.
113  vector<PostList *>::const_iterator i = terms.begin();
 // Seed the minimum with the first term's wdf, then fold in the others.
 // Dereferencing begin() relies on terms being non-empty; the only check
 // visible in this listing is the Assert(n > 1) earlier on.
114  Xapian::termcount wdf = (*i)->get_wdf();
115  while (++i != terms.end()) {
116  wdf = min(wdf, (*i)->get_wdf());
117  }
118  return wdf;
119 }
120 
// Return a string description of this object, of the form
// "(Phrase <window> <description of pl>)".
// NOTE(review): the line carrying the function name (source line 122) is
// missing from this listing; the declaration index gives
// `std::string get_description() const`.
121 string
123 {
124  string m = "(Phrase ";
 // str() converts the numeric window size to its string form.
125  m += str(window);
126  m += ' ';
 // NOTE(review): `pl` is not declared in this listing - presumably the
 // source postlist held by the base class; confirm.
127  m += pl->get_description();
128  m += ")";
129  return m;
130 }
Class for estimating the total number of matching documents.
Definition: estimateop.h:64
void start_position_list(unsigned i)
Start reading from the i-th position list.
PhrasePostList(PostList *source_, EstimateOp *estimate_op_, Xapian::termpos window_, const std::vector< PostList * >::const_iterator &terms_begin, const std::vector< PostList * >::const_iterator &terms_end, PostListTree *pltree_)
std::vector< PostList * > terms
PositionList ** poslists
std::string get_description() const
Return a string description of this object.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Xapian::termpos window
bool test_doc()
Test if the current document contains the terms as a phrase.
Base class for classes which filter another PostList.
Xapian::doccount accepted
Number of times test_doc() returned true.
Xapian::doccount rejected
Number of times test_doc() returned false.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Abstract base class for postlists.
Definition: postlist.h:40
Xapian::doccount get_termfreq() const
Get an estimate of the number of documents this PostList will return.
Definition: postlist.h:67
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
virtual std::string get_description() const =0
Return a string description of this object.
Xapian::doccount termfreq
Estimate of the number of documents this PostList will return.
Definition: postlist.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
virtual Xapian::termpos get_position() const =0
Return the current position.
Xapian::termpos pos
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Return docs containing terms forming a particular phrase.
Abstract base class for iterating term positions in a document.
Convert types to std::string.