xapian-core  1.4.27
phrasepostlist.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "phrasepostlist.h"
24 
25 #include "debuglog.h"
26 #include "backends/positionlist.h"
27 #include "omassert.h"
28 #include "str.h"
29 
30 #include <algorithm>
31 #include <vector>
32 
33 using namespace std;
34 
// PhrasePostList constructor.
//
// NOTE(review): the line that opens this definition (and declares the
// source_ parameter) is not present in this listing; the parameter list
// below continues from it.
//
// Passes source_ to the SelectPostList base, stores window_ in `window`
// and copies the iterator range [terms_begin, terms_end) into `terms`.
36  Xapian::termpos window_,
37  const vector<PostList*>::const_iterator &terms_begin,
38  const vector<PostList*>::const_iterator &terms_end)
39  : SelectPostList(source_), window(window_), terms(terms_begin, terms_end)
40 {
41  size_t n = terms.size();
    // Callers must supply at least two terms.
42  Assert(n > 1);
    // One PositionList pointer slot per term.  The slots are not given
    // values here; start_position_list(i) assigns poslists[i].
43  poslists = new PositionList*[n];
44 }
45 
// Destructor body (the signature line is not present in this listing).
//
// Releases only the array of PositionList pointers; the PositionList
// objects the slots point at are not deleted here.
// NOTE(review): presumably those objects are owned by the term postlists
// that returned them - confirm.
47 {
48  delete [] poslists;
49 }
50 
// Start reading from the i-th position list.
//
// Stores the result of read_position_list() on the i-th term postlist in
// poslists[i].  (The line carrying the function name and its `unsigned i`
// parameter is not present in this listing.)
51 void
53 {
54  poslists[i] = terms[i]->read_position_list();
55 }
56 
// Test if the current document contains the terms as a phrase.
//
// Returns true as soon as every term has been found at strictly increasing
// positions with the whole run fitting inside `window`; returns false once
// any position list is exhausted.
//
// NOTE(review): several lines of this function are not present in this
// listing (listing lines 62, 67, 69 and 76).  In particular `base` and `b`
// are used below without a visible declaration or initialisation - consult
// the upstream file before changing anything here.
57 bool
59 {
60  LOGCALL(MATCH, bool, "PhrasePostList::test_doc", NO_ARGS);
61 
    // No position at all for the first term means no match is possible.
63  if (!poslists[0]->next())
64  RETURN(false);
65 
    // Highest term index handled by the `i > read_hwm` branch so far, so
    // that branch runs at most once per term index across all iterations.
66  unsigned read_hwm = 0;
68  do {
    // `pos` is the position matched for the previous term; it starts at
    // `base` (presumably the current position of term 0 - confirm).
70  Xapian::termpos pos = base;
71  unsigned i = 0;
72  do {
    // Every term has been placed inside the window: it is a match.
73  if (++i == terms.size()) RETURN(true);
74  if (i > read_hwm) {
75  read_hwm = i;
    // NOTE(review): a statement is missing from this listing here;
    // presumably it is start_position_list(i) - confirm.
77  }
    // Term i must occur strictly after the previous term.
78  if (!poslists[i]->skip_to(pos + 1))
79  RETURN(false);
80  pos = poslists[i]->get_position();
    // Each of the terms after i needs a position of its own, so adding
    // the count (terms.size() - i) gives the bound compared against the
    // window below.
81  b = pos + (terms.size() - i);
82  } while (b - base <= window);
83  // Advance the start of the window to the first position it could match
84  // in given the current position of term i.
85  } while (poslists[0]->skip_to(b - window));
    // The first term has no position at or after b - window.
86  RETURN(false);
87 }
88 
// Return the wdf for the document at the current position.
//
// The value is the smallest get_wdf() reported by any postlist in `terms`.
// (The return type and function name lines are not present in this
// listing.)
91 {
92  // Calculate an estimate for the wdf of a phrase postlist.
93  //
94  // We use the minimum wdf of a sub-postlist as our estimate. See the
95  // comment in NearPostList::get_wdf() for justification of this estimate.
96  vector<PostList *>::const_iterator i = terms.begin();
    // Seed the running minimum with the first term's wdf, then fold in the
    // remaining terms.
97  Xapian::termcount wdf = (*i)->get_wdf();
98  while (++i != terms.end()) {
99  wdf = min(wdf, (*i)->get_wdf());
100  }
101  return wdf;
102 }
103 
// Get an estimate of the number of documents indexed by this term.
//
// Returns one third of the source postlist's own estimate.  (The return
// type and function name lines are not present in this listing.)
106 {
107  // It's hard to estimate how many times the phrase will occur as
108  // it depends a lot on the phrase, but usually the phrase will
109  // occur significantly less often than the individual terms.
110  return source->get_termfreq_est() / 3;
111 }
112 
// Get an estimate for the termfreq and reltermfreq, given the stats.
//
// Divides the termfreq, reltermfreq and collfreq members of `result` by 3
// and returns it.
//
// NOTE(review): the function name line and the declaration of `result`
// (listing line 120) are not present in this listing; presumably `result`
// is obtained from source->get_termfreq_est_using_stats(stats) - confirm.
113 TermFreqs
115  const Xapian::Weight::Internal & stats) const
116 {
117  LOGCALL(MATCH, TermFreqs, "PhrasePostList::get_termfreq_est_using_stats", stats);
118  // No idea how to estimate this - do the same as get_termfreq_est() for
119  // now.
121  result.termfreq /= 3;
122  result.reltermfreq /= 3;
123  result.collfreq /= 3;
124  RETURN(result);
125 }
126 
// Return a string description of this object.
//
// The result has the form "(Phrase <window> <description of source>)".
// (The function name line is not present in this listing.)
127 string
129 {
130  string m = "(Phrase ";
    // Window size converted to text by str().
131  m += str(window);
132  m += ' ';
133  m += source->get_description();
134  m += ")";
135  return m;
136 }
#define RETURN(A)
Definition: debuglog.h:493
#define Assert(COND)
Definition: omassert.h:122
PostList * source
Abstract base class for postlists.
Definition: postlist.h:37
bool test_doc()
Test if the current document contains the terms as a phrase.
PositionList ** poslists
Abstract base class for iterating term positions in a document.
STL namespace.
Xapian::termpos window
Convert types to std::string.
Xapian::doccount termfreq
Return docs containing terms forming a particular phrase.
Xapian::doccount get_termfreq_est() const
Get an estimate of the number of documents indexed by this term.
std::vector< PostList * > terms
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
virtual TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Definition: postlist.cc:36
virtual Xapian::doccount get_termfreq_est() const =0
Get an estimate of the number of documents indexed by this term.
Class to hold statistics for a given collection.
Internal * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:194
string str(int value)
Convert int to std::string.
Definition: str.cc:90
PhrasePostList(PostList *source_, Xapian::termpos window_, const std::vector< PostList *>::const_iterator &terms_begin, const std::vector< PostList *>::const_iterator &terms_end)
std::string get_description() const
Return a string description of this object.
virtual Xapian::termpos get_position() const =0
Return the current position.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
A postlist parent class for classes which only return selected docs from a source postlist (e...
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
The frequencies for a term.
virtual std::string get_description() const =0
Return a string description of this object.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Various assertion macros.
TermFreqs get_termfreq_est_using_stats(const Xapian::Weight::Internal &stats) const
Get an estimate for the termfreq and reltermfreq, given the stats.
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:31
void start_position_list(unsigned i)
Start reading from the i-th position list.
Debug logging macros.
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:487