xapian-core  2.0.0
honey_spelling.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007-2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_HONEY_SPELLING_H
22 #define XAPIAN_INCLUDED_HONEY_SPELLING_H
23 
24 #include <xapian/types.h>
25 
26 #include "honey_lazytable.h"
27 #include "api/termlist.h"
28 
29 #include <map>
30 #include <set>
31 #include <string>
32 #include <string_view>
33 #include <cstring> // For memcpy(), memcmp() and memset().
34 
35 namespace Honey {
36 
/** Leading-byte codes used by the spelling table's keys.
 *
 *  fragment compares its first byte against KEY_PREFIX_MIDDLE, and
 *  make_spelling_wordlist_key() prepends KEY_PREFIX_WORD to any word whose
 *  first byte is less than or equal to KEY_PREFIX_WORD, so the values up to
 *  and including KEY_PREFIX_WORD act as reserved leading bytes.
 *
 *  NOTE(review): judging by the names, BOOKEND/HEAD/MIDDLE/TAIL identify the
 *  kind of word fragment a key describes - confirm against the implementation.
 */
constexpr unsigned KEY_PREFIX_BOOKEND{0x00};
constexpr unsigned KEY_PREFIX_HEAD{0x01};
constexpr unsigned KEY_PREFIX_MIDDLE{0x02};
constexpr unsigned KEY_PREFIX_TAIL{0x03};
/// Largest reserved leading byte; also the escape byte prepended to words.
constexpr unsigned KEY_PREFIX_WORD{0x04};
42 
43 inline std::string
44 make_spelling_wordlist_key(std::string_view word)
45 {
46  if (rare(static_cast<unsigned char>(word[0]) <= KEY_PREFIX_WORD))
47  return std::string(1, KEY_PREFIX_WORD).append(word);
48  return std::string(word);
49 }
50 
51 struct fragment {
52  char data[4];
53 
55  fragment() { }
56 
58  explicit fragment(int) { std::memset(data, 0, 4); }
59 
61  explicit fragment(char data_[4]) { std::memcpy(data, data_, 4); }
62 
63  char& operator[](unsigned i) { return data[i]; }
64  const char& operator[](unsigned i) const { return data[i]; }
65 
66  operator std::string() const {
67  return std::string(data, data[0] == KEY_PREFIX_MIDDLE ? 4 : 3);
68  }
69 
70  bool operator<(const fragment& b) const {
71  return std::memcmp(data, b.data, 4) < 0;
72  }
73 };
74 
75 }
76 
78  void toggle_word(const std::string& word);
79  void toggle_fragment(Honey::fragment frag, const std::string& word);
80 
81  mutable std::map<std::string,
83  std::less<>> wordfreq_changes;
84 
93  mutable std::map<Honey::fragment, std::set<std::string>> termlist_deltas;
94 
97 
98  public:
/** Create a new HoneySpellingTable object.
 *
 *  Forwards to the HoneyLazyTable constructor with the table name
 *  "spelling" and the path prefix dbdir + "/spelling.".
 *
 *  @param dbdir     Directory used to build the table's path prefix.
 *  @param readonly  Passed through unchanged to HoneyLazyTable.
 */
107  HoneySpellingTable(const std::string& dbdir, bool readonly)
108  : HoneyLazyTable("spelling", dbdir + "/spelling.", readonly) { }
109 
/** Create a HoneySpellingTable from a file descriptor.
 *
 *  Forwards the table name "spelling" together with @a fd, @a offset_ and
 *  @a readonly to the HoneyLazyTable constructor.
 *
 *  NOTE(review): offset_ is presumably the position of the table within
 *  fd - confirm against HoneyLazyTable.
 */
110  HoneySpellingTable(int fd, off_t offset_, bool readonly)
111  : HoneyLazyTable("spelling", fd, offset_, readonly) { }
112 
114  void merge_changes();
115 
116  void add_word(const std::string& word, Xapian::termcount freqinc);
117  Xapian::termcount remove_word(const std::string& word,
118  Xapian::termcount freqdec);
119 
120  TermList* open_termlist(std::string_view word);
121 
122  Xapian::doccount get_word_frequency(std::string_view word) const;
123 
126  }
127 
135  bool is_modified() const {
136  return !wordfreq_changes.empty() || HoneyTable::is_modified();
137  }
138 
141  merge_changes();
143  return wordfreq_upper_bound;
144  }
145 
146  void cancel(const Honey::RootInfo& root_info,
148  // Discard batched-up changes.
149  wordfreq_changes.clear();
150  termlist_deltas.clear();
151 
152  HoneyTable::cancel(root_info, rev);
153  }
154 
155  // @}
156 };
157 
161  std::string data;
162 
164  unsigned p = 0;
165 
172  int tail = 0;
173 
176 
179 
180  public:
/** Constructor.
 *
 *  Copies @a data_ into the data member; no other member is set here.
 *
 *  @param data_  The encoded data.
 */
182  explicit HoneySpellingTermList(const std::string& data_)
183  : data(data_) { }
184 
186  HoneySpellingTermList(const std::string& data_,
187  const char* key)
188  : data(data_) {
189  unsigned char first_ch = key[0];
190  AssertRel(first_ch, <, Honey::KEY_PREFIX_WORD);
191  switch (first_ch) {
193  tail = -1;
194  break;
196  tail = -2;
197  break;
199  tail = 2;
200  break;
201  }
202  if (tail != 0)
203  current_term.assign(key + 1, 2);
204  }
205 
207 
208  Xapian::termcount get_wdf() const;
209 
211 
212  TermList* next();
213 
214  TermList* skip_to(std::string_view term);
215 
217 
219 };
220 
221 #endif // XAPIAN_INCLUDED_HONEY_SPELLING_H
void set_wordfreq_upper_bound(Xapian::termcount ub)
void merge_changes()
Merge in batched-up changes.
std::map< Honey::fragment, std::set< std::string > > termlist_deltas
Changes to make to the termlists.
std::map< std::string, Xapian::termcount, std::less<> > wordfreq_changes
void toggle_word(const std::string &word)
Xapian::termcount remove_word(const std::string &word, Xapian::termcount freqdec)
HoneySpellingTable(int fd, off_t offset_, bool readonly)
HoneySpellingTable(const std::string &dbdir, bool readonly)
Create a new HoneySpellingTable object.
void toggle_fragment(Honey::fragment frag, const std::string &word)
bool is_modified() const
Override methods of HoneyTable.
TermList * open_termlist(std::string_view word)
Xapian::termcount flush_db()
Returns updated wordfreq upper bound.
void cancel(const Honey::RootInfo &root_info, honey_revision_number_t rev)
Override methods of HoneyTable.
Xapian::doccount get_word_frequency(std::string_view word) const
void add_word(const std::string &word, Xapian::termcount freqinc)
Xapian::termcount wordfreq_upper_bound
Used to track an upper bound on wordfreq.
The list of words containing a particular trigram.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
HoneySpellingTermList(const HoneySpellingTermList &)
Copying is not allowed.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
int tail
Number of constant characters on the end of the value.
PositionList * positionlist_begin() const
Return PositionList for the current position.
HoneySpellingTermList(const std::string &data_)
Constructor.
unsigned p
Position in the data.
HoneySpellingTermList(const std::string &data_, const char *key)
Constructor for head/bookend/tail terms.
std::string data
The encoded data.
TermList * skip_to(std::string_view term)
Skip forward to the specified term.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
void operator=(const HoneySpellingTermList &)
Assignment is not allowed.
void cancel(const Honey::RootInfo &, honey_revision_number_t)
Definition: honey_table.h:648
bool is_modified() const
Definition: honey_table.h:678
void flush_db()
Definition: honey_table.h:643
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Abstract base class for termlists.
Definition: termlist.h:42
std::string current_term
The current term.
Definition: termlist.h:54
#define rare(COND)
Definition: config.h:607
string term
uint4 honey_revision_number_t
The revision number of a honey database.
Definition: honey_defs.h:104
Subclass of HoneyTable for deriving lazy tables from.
const unsigned KEY_PREFIX_WORD
const unsigned KEY_PREFIX_MIDDLE
const unsigned KEY_PREFIX_TAIL
std::string make_spelling_wordlist_key(std::string_view word)
const unsigned KEY_PREFIX_BOOKEND
const unsigned KEY_PREFIX_HEAD
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
XAPIAN_REVISION_TYPE rev
Revision number of a database.
Definition: types.h:108
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
#define AssertRel(A, REL, B)
Definition: omassert.h:123
char & operator[](unsigned i)
fragment(char data_[4])
Construct from a 4-byte buffer (the constructor is declared explicit, so no implicit conversion is performed).
bool operator<(const fragment &b) const
fragment()
Default constructor.
fragment(int)
Zero-initialising constructor.
const char & operator[](unsigned i) const
Abstract base class for termlists.
typedefs for Xapian