xapian-core  1.4.20
expandweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2011,2017 Olly Betts
5  * Copyright (C) 2011 Action Without Borders
6  * Copyright (C) 2013 Aarsh Shah
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "expandweight.h"
26 
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "api/termlist.h"
30 
31 using namespace std;
32 
33 namespace Xapian {
34 namespace Internal {
35 
36 void
37 ExpandWeight::collect_stats(TermList * merger, const std::string & term)
38 {
39  LOGCALL_VOID(API, "ExpandWeight::collect_stats", merger | term);
40 
41  stats.clear_stats();
42 
43  merger->accumulate_stats(stats);
44 
45  collection_freq = db.get_collection_freq(term);
46 
47  LOGVALUE(EXPAND, rsize);
48  LOGVALUE(EXPAND, stats.rtermfreq);
49 
50  LOGVALUE(EXPAND, dbsize);
51  LOGVALUE(EXPAND, stats.dbsize);
52  if (stats.dbsize == dbsize) {
53  // Either we're expanding from just one database, or we got stats from
54  // all the sub-databases (because at least one relevant document from
55  // each sub-database contained this term), so termfreq should already
56  // be exact.
57  AssertEqParanoid(stats.termfreq, db.get_termfreq(term));
58  } else {
59  AssertRel(stats.dbsize,<,dbsize);
60  // We're expanding from more than one database and the stats we've got
61  // only cover some of the sub-databases, so termfreq only includes
62  // those sub-databases.
63  if (use_exact_termfreq) {
64  LOGLINE(EXPAND, "Had to request exact termfreq");
65  stats.termfreq = db.get_termfreq(term);
66  } else {
67  // Approximate the termfreq by scaling it up from the databases we
68  // do have information from.
69  double tf = double(stats.termfreq) * dbsize / stats.dbsize;
70  LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
71  dbsize << " / " << stats.dbsize << " = " <<
72  tf);
73 
74  stats.termfreq = static_cast<Xapian::doccount>(tf + 0.5);
75 
76  // termfreq can't be more than (dbsize - rsize + rtermfreq)
77  // since the number of relevant documents not indexed by this
78  // term can't be more than the number of documents not indexed
79  // by this term, so:
80  //
81  // rsize - rtermfreq <= dbsize - termfreq
82  // <=> termfreq <= dbsize - (rsize - rtermfreq)
83  auto termfreq_upper_bound = dbsize - (rsize - stats.rtermfreq);
84  if (stats.termfreq > termfreq_upper_bound) {
85  LOGLINE(EXPAND, "termfreq can't be more than "
86  "dbsize - (rsize + rtermfreq)");
87  stats.termfreq = termfreq_upper_bound;
88  }
89  }
90  }
91  LOGVALUE(EXPAND, stats.termfreq);
92 }
93 
94 }
95 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:477
virtual void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
Definition: termlist.cc:34
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Collate statistics and calculate the term weights for the ESet.
#define LOGVALUE(a, b)
Definition: debuglog.h:484
#define AssertEqParanoid(A, B)
Definition: omassert.h:131
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Abstract base class for termlists.
Various assertion macros.
#define LOGLINE(a, b)
Definition: debuglog.h:483
Debug logging macros.