xapian-core  2.0.0
honey_spelling.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/error.h>
24 #include <xapian/types.h>
25 
26 #include "expand/expandweight.h"
27 #include "expand/termlistmerger.h"
28 #include "honey_spelling.h"
29 #include "omassert.h"
30 #include "pack.h"
31 
32 #include "../prefix_compressed_strings.h"
33 
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40 #include <string_view>
41 
42 using namespace Honey;
43 using namespace std;
44 
45 void
47 {
48  for (auto i : termlist_deltas) {
49  const string& key = i.first;
50  const set<string>& changes = i.second;
51 
52  auto d = changes.begin();
53  if (d == changes.end()) continue;
54 
55  string updated;
56  string current;
57  PrefixCompressedStringWriter out(updated);
58  if (get_exact_entry(key, current)) {
59  PrefixCompressedStringItor in(current, key);
60  updated.reserve(current.size()); // FIXME plus some?
61  while (!in.at_end() && d != changes.end()) {
62  const string& word = *in;
63  Assert(d != changes.end());
64  int cmp = word.compare(*d);
65  if (cmp < 0) {
66  out.append(word);
67  ++in;
68  } else if (cmp > 0) {
69  out.append(*d);
70  ++d;
71  } else {
72  // If an existing entry is in the changes list, that means
73  // we should remove it.
74  ++in;
75  ++d;
76  }
77  }
78  if (!in.at_end()) {
79  // FIXME : easy to optimise this to a fix-up and substring copy.
80  while (!in.at_end()) {
81  out.append(*in++);
82  }
83  }
84  }
85  while (d != changes.end()) {
86  out.append(*d++);
87  }
88  if (!updated.empty()) {
89  add(key, updated);
90  } else {
91  del(key);
92  }
93  }
94  termlist_deltas.clear();
95 
96  for (auto j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
97  const string& key = make_spelling_wordlist_key(j->first);
98  Xapian::termcount wordfreq = j->second;
99  if (wordfreq) {
100  string tag;
101  pack_uint_last(tag, wordfreq);
102  add(key, tag);
103  if (wordfreq > wordfreq_upper_bound)
104  wordfreq_upper_bound = wordfreq;
105  } else {
106  del(key);
107  }
108  }
109  wordfreq_changes.clear();
110 }
111 
112 void
114 {
115  auto i = termlist_deltas.find(frag);
116  if (i == termlist_deltas.end()) {
117  i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
118  }
119  // The commonest case is that we're adding lots of words, so try insert
120  // first and if that reports that the word already exists, remove it.
121  auto res = i->second.insert(word);
122  if (!res.second) {
123  // word is already in the set, so remove it.
124  i->second.erase(res.first);
125  }
126 }
127 
128 void
130 {
131  if (word.size() <= 1) return;
132 
133  auto i = wordfreq_changes.find(word);
134  if (i != wordfreq_changes.end()) {
135  // Word "word" already exists and has been modified.
136  if (i->second) {
137  i->second += freqinc;
138  return;
139  }
140  // If "word" is currently modified such that it no longer exists, so
141  // we need to execute the code below to re-add trigrams for it.
142  i->second = freqinc;
143  } else {
144  string data;
145  if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
146  // Word "word" already exists, so increment its count.
147  Xapian::termcount freq;
148  const char* p = data.data();
149  if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
150  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
151  }
152  wordfreq_changes[word] = freq + freqinc;
153  return;
154  }
155  wordfreq_changes[word] = freqinc;
156  }
157 
158  // Add trigrams for word.
159  toggle_word(word);
160 }
161 
164 {
165  if (word.size() <= 1) return freqdec;
166 
167  auto i = wordfreq_changes.find(word);
168  if (i != wordfreq_changes.end()) {
169  if (i->second == 0) {
170  // Word has already been deleted.
171  return freqdec;
172  }
173  // Word "word" exists and has been modified.
174  if (freqdec < i->second) {
175  i->second -= freqdec;
176  return 0;
177  }
178  freqdec -= i->second;
179 
180  // Mark word as deleted.
181  i->second = 0;
182  } else {
183  string data;
184  if (!get_exact_entry(make_spelling_wordlist_key(word), data)) {
185  // This word doesn't exist.
186  return freqdec;
187  }
188 
189  Xapian::termcount freq;
190  const char* p = data.data();
191  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
192  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
193  }
194  if (freqdec < freq) {
195  wordfreq_changes[word] = freq - freqdec;
196  return 0;
197  }
198  freqdec -= freq;
199 
200  // Mark word as deleted.
201  wordfreq_changes[word] = 0;
202  }
203 
204  // Remove trigrams for word.
205  toggle_word(word);
206 
207  return freqdec;
208 }
209 
210 void
212 {
213  fragment buf(0);
214 
215  if (word.size() <= 4) {
216  // We also generate 'bookends' for two, three, and four character
217  // terms so we can handle transposition of the middle two characters
218  // of a four character word, substitution or deletion of the middle
219  // character of a three character word, or insertion in the middle of a
220  // two character word.
221  // 'Bookends':
222  buf[0] = KEY_PREFIX_BOOKEND;
223  buf[1] = word[0];
224  buf[2] = word[word.size() - 1];
225  toggle_fragment(buf, word);
226  }
227 
228  // Head:
229  buf[0] = KEY_PREFIX_HEAD;
230  buf[1] = word[0];
231  buf[2] = word[1];
232  toggle_fragment(buf, word);
233 
234  // Tail:
235  buf[0] = KEY_PREFIX_TAIL;
236  buf[1] = word[word.size() - 2];
237  buf[2] = word[word.size() - 1];
238  toggle_fragment(buf, word);
239 
240  if (word.size() > 2) {
241  set<fragment> done;
242  // Middles:
243  buf[0] = KEY_PREFIX_MIDDLE;
244  for (size_t start = 0; start <= word.size() - 3; ++start) {
245  memcpy(buf.data + 1, word.data() + start, 3);
246  // Don't toggle the same fragment twice or it will cancel out.
247  // Bug fixed in 1.2.6.
248  if (done.insert(buf).second)
249  toggle_fragment(buf, word);
250  }
251  }
252 }
253 
255  bool operator()(const TermList* a, const TermList* b) const {
256  return a->get_approx_size() > b->get_approx_size();
257  }
258 };
259 
260 TermList*
262 {
263  // This should have been handled by Database::get_spelling_suggestion().
264  AssertRel(word.size(),>,1);
265 
266  // Merge any pending changes to disk, but don't call commit() so they
267  // won't be switched live.
268  if (!wordfreq_changes.empty()) merge_changes();
269 
270  vector<TermList*> termlists;
271  try {
272  string data;
273  fragment buf(0);
274 
275  if (word.size() <= 4) {
276  // We also generate 'bookends' for two, three, and four character
277  // terms so we can handle transposition of the middle two
278  // characters of a four character word, substitution or deletion of
279  // the middle character of a three character word, or insertion in
280  // the middle of a two character word.
281  buf[0] = KEY_PREFIX_BOOKEND;
282  buf[1] = word[0];
283  buf[2] = word[word.size() - 1];
284  if (get_exact_entry(string(buf), data))
285  termlists.push_back(new HoneySpellingTermList(data, buf.data));
286  }
287 
288  // Head:
289  buf[0] = KEY_PREFIX_HEAD;
290  buf[1] = word[0];
291  buf[2] = word[1];
292  if (get_exact_entry(string(buf), data))
293  termlists.push_back(new HoneySpellingTermList(data, buf.data));
294 
295  if (word.size() == 2) {
296  // For two letter words, we generate H and T terms for the
297  // transposed form so that we can produce good spelling
298  // suggestions.
299  // AB -> BA
300  buf[1] = word[1];
301  buf[2] = word[0];
302  if (get_exact_entry(string(buf), data))
303  termlists.push_back(new HoneySpellingTermList(data, buf.data));
304  buf[0] = KEY_PREFIX_TAIL;
305  if (get_exact_entry(string(buf), data))
306  termlists.push_back(new HoneySpellingTermList(data, buf.data));
307  }
308 
309  // Tail:
310  buf[0] = KEY_PREFIX_TAIL;
311  buf[1] = word[word.size() - 2];
312  buf[2] = word[word.size() - 1];
313  if (get_exact_entry(string(buf), data))
314  termlists.push_back(new HoneySpellingTermList(data, buf.data));
315 
316  if (word.size() > 2) {
317  // Middles:
318  buf[0] = KEY_PREFIX_MIDDLE;
319  for (size_t start = 0; start <= word.size() - 3; ++start) {
320  memcpy(buf.data + 1, word.data() + start, 3);
321  if (get_exact_entry(string(buf), data))
322  termlists.push_back(new HoneySpellingTermList(data));
323  }
324 
325  if (word.size() == 3) {
326  // For three letter words, we generate the two "single
327  // transposition" forms too, so that we can produce good
328  // spelling suggestions.
329  // ABC -> BAC
330  buf[1] = word[1];
331  buf[2] = word[0];
332  if (get_exact_entry(string(buf), data))
333  termlists.push_back(new HoneySpellingTermList(data));
334  // ABC -> ACB
335  buf[1] = word[0];
336  buf[2] = word[2];
337  buf[3] = word[1];
338  if (get_exact_entry(string(buf), data))
339  termlists.push_back(new HoneySpellingTermList(data));
340  }
341  }
342 
343  return make_termlist_merger(termlists);
344  } catch (...) {
345  // Make sure we delete all the TermList objects to avoid leaking
346  // memory.
347  for (auto& t : termlists) {
348  delete t;
349  }
350  throw;
351  }
352 }
353 
356 {
357  auto i = wordfreq_changes.find(word);
358  if (i != wordfreq_changes.end()) {
359  // Modified frequency for word:
360  return i->second;
361  }
362 
363  string data;
364  if (get_exact_entry(make_spelling_wordlist_key(word), data)) {
365  // Word "word" already exists.
366  Xapian::termcount freq;
367  const char* p = data.data();
368  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
369  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
370  }
371  return freq;
372  }
373 
374  return 0;
375 }
376 
378 
381 {
382  // This is only used to decide how to build a OR-tree of TermList objects
383  // so we just need to return "sizes" which are ordered roughly correctly.
384  return data.size();
385 }
386 
389 {
390  return 1;
391 }
392 
395 {
396  return 1;
397 }
398 
399 TermList*
401 {
402  if (p == data.size()) {
403  return this;
404  }
405 
406  size_t keep = 0;
407  if (rare(tail < 0)) {
408  tail += 2;
409  keep = current_term.size() - tail;
410  } else if (usual(!current_term.empty())) {
411  keep = data[p++] ^ MAGIC_XOR_VALUE;
412  }
413  size_t add;
414  if (p == data.size() ||
415  (add = data[p] ^ MAGIC_XOR_VALUE) >= data.size() - p) {
416  throw Xapian::DatabaseCorruptError("Bad spelling data (too little "
417  "left)");
418  }
419  if (rare(keep + tail > current_term.size())) {
420  // The initial part to keep overlaps with the tail part which is an
421  // unusual case requiring special handling.
422  string tail_string(current_term, current_term.size() - tail);
423  current_term.replace(keep, string::npos, data.data() + p + 1, add);
424  current_term += tail_string;
425  } else {
426  current_term.replace(keep, current_term.size() - tail - keep,
427  data.data() + p + 1, add);
428  }
429  p += add + 1;
430 
431  return NULL;
432 }
433 
434 TermList*
436 {
437  while (current_term < term) {
439  return this;
440  }
441  return NULL;
442 }
443 
446 {
447  throw
448  Xapian::UnimplementedError("HoneySpellingTermList::"
449  "positionlist_count() "
450  "not implemented");
451 }
452 
455 {
456  throw
457  Xapian::UnimplementedError("HoneySpellingTermList::"
458  "positionlist_begin() "
459  "not implemented");
460 }
#define MAGIC_XOR_VALUE
void merge_changes()
Merge in batched-up changes.
void toggle_word(const std::string &word)
Xapian::termcount remove_word(const std::string &word, Xapian::termcount freqdec)
void toggle_fragment(Honey::fragment frag, const std::string &word)
TermList * open_termlist(std::string_view word)
Xapian::doccount get_word_frequency(std::string_view word) const
void add_word(const std::string &word, Xapian::termcount freqinc)
The list of words containing a particular trigram.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
PositionList * positionlist_begin() const
Return PositionList for the current position.
TermList * skip_to(std::string_view term)
Skip forward to the specified term.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
void append(const std::string &word)
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Abstract base class for termlists.
Definition: termlist.h:42
virtual Xapian::termcount get_approx_size() const =0
Return approximate size of this termlist.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
Collate statistics and calculate the term weights for the ESet.
Spelling correction data for a honey database.
const unsigned KEY_PREFIX_MIDDLE
const unsigned KEY_PREFIX_TAIL
std::string make_spelling_wordlist_key(std::string_view word)
const unsigned KEY_PREFIX_BOOKEND
const unsigned KEY_PREFIX_HEAD
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:100
bool operator()(const TermList *a, const TermList *b) const
Build tree to merge TermList objects.
TermList * make_termlist_merger(std::vector< TermList * > &termlists)
typedefs for Xapian