xapian-core  1.4.26
expandweight.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2011,2017,2023 Olly Betts
5  * Copyright (C) 2011 Action Without Borders
6  * Copyright (C) 2013 Aarsh Shah
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include <config.h>
24 
25 #include "expandweight.h"
26 
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "api/termlist.h"
30 
31 using namespace std;
32 
33 namespace Xapian {
34 namespace Internal {
35 
36 void
37 ExpandWeight::collect_stats(TermList * merger, const std::string & term)
38 {
39  LOGCALL_VOID(API, "ExpandWeight::collect_stats", merger | term);
40 
41  stats.clear_stats();
42 
43  merger->accumulate_stats(stats);
44 
45  if (want_collection_freq)
46  collection_freq = db.get_collection_freq(term);
47 
48  LOGVALUE(EXPAND, rsize);
49  LOGVALUE(EXPAND, stats.rtermfreq);
50 
51  LOGVALUE(EXPAND, dbsize);
52  LOGVALUE(EXPAND, stats.dbsize);
53  if (stats.dbsize == dbsize) {
54  // Either we're expanding from just one database, or we got stats from
55  // all the sub-databases (because at least one relevant document from
56  // each sub-database contained this term), so termfreq should already
57  // be exact.
58  AssertEqParanoid(stats.termfreq, db.get_termfreq(term));
59  } else {
60  AssertRel(stats.dbsize,<,dbsize);
61  // We're expanding from more than one database and the stats we've got
62  // only cover some of the sub-databases, so termfreq only includes
63  // those sub-databases.
64  if (use_exact_termfreq) {
65  LOGLINE(EXPAND, "Had to request exact termfreq");
66  stats.termfreq = db.get_termfreq(term);
67  } else {
68  // Approximate the termfreq by scaling it up from the databases we
69  // do have information from.
70  double tf = double(stats.termfreq) * dbsize / stats.dbsize;
71  LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
72  dbsize << " / " << stats.dbsize << " = " <<
73  tf);
74 
75  stats.termfreq = static_cast<Xapian::doccount>(tf + 0.5);
76 
77  // termfreq can't be more than (dbsize - rsize + rtermfreq)
78  // since the number of relevant documents not indexed by this
79  // term can't be more than the number of documents not indexed
80  // by this term, so:
81  //
82  // rsize - rtermfreq <= dbsize - termfreq
83  // <=> termfreq <= dbsize - (rsize - rtermfreq)
84  auto termfreq_upper_bound = dbsize - (rsize - stats.rtermfreq);
85  if (stats.termfreq > termfreq_upper_bound) {
86  LOGLINE(EXPAND, "termfreq can't be more than "
87  "dbsize - (rsize + rtermfreq)");
88  stats.termfreq = termfreq_upper_bound;
89  }
90  }
91  }
92  LOGVALUE(EXPAND, stats.termfreq);
93 }
94 
95 }
96 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:488
virtual void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
Definition: termlist.cc:34
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Collate statistics and calculate the term weights for the ESet.
#define LOGVALUE(a, b)
Definition: debuglog.h:495
#define AssertEqParanoid(A, B)
Definition: omassert.h:131
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Abstract base class for termlists.
Various assertion macros.
#define LOGLINE(a, b)
Definition: debuglog.h:494
Debug logging macros.