xapian-core  2.0.0
glass_spelling.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/error.h>
24 #include <xapian/types.h>
25 
26 #include "expand/expandweight.h"
27 #include "expand/termlistmerger.h"
28 #include "glass_spelling.h"
29 #include "omassert.h"
30 #include "pack.h"
31 
32 #include "../prefix_compressed_strings.h"
33 
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40 #include <string_view>
41 
42 using namespace Glass;
43 using namespace std;
44 
45 void
47 {
48  for (auto i : termlist_deltas) {
49  const string& key = i.first;
50  const set<string>& changes = i.second;
51 
52  auto d = changes.begin();
53  if (d == changes.end()) continue;
54 
55  string updated;
56  string current;
57  PrefixCompressedStringWriter out(updated);
58  if (get_exact_entry(key, current)) {
59  PrefixCompressedStringItor in(current);
60  updated.reserve(current.size()); // FIXME plus some?
61  while (!in.at_end() && d != changes.end()) {
62  const string & word = *in;
63  Assert(d != changes.end());
64  int cmp = word.compare(*d);
65  if (cmp < 0) {
66  out.append(word);
67  ++in;
68  } else if (cmp > 0) {
69  out.append(*d);
70  ++d;
71  } else {
72  // If an existing entry is in the changes list, that means
73  // we should remove it.
74  ++in;
75  ++d;
76  }
77  }
78  if (!in.at_end()) {
79  // FIXME : easy to optimise this to a fix-up and substring copy.
80  while (!in.at_end()) {
81  out.append(*in++);
82  }
83  }
84  }
85  while (d != changes.end()) {
86  out.append(*d++);
87  }
88  if (!updated.empty()) {
89  add(key, updated);
90  } else {
91  del(key);
92  }
93  }
94  termlist_deltas.clear();
95 
96  for (auto&& j : wordfreq_changes) {
97  string key = "W" + j.first;
98  Xapian::termcount wordfreq = j.second;
99  if (wordfreq) {
100  string tag;
101  pack_uint_last(tag, wordfreq);
102  add(key, tag);
103  if (wordfreq > wordfreq_upper_bound)
104  wordfreq_upper_bound = wordfreq;
105  } else {
106  del(key);
107  }
108  }
109  wordfreq_changes.clear();
110 }
111 
112 void
114 {
115  auto i = termlist_deltas.find(frag);
116  if (i == termlist_deltas.end()) {
117  i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
118  }
119  // The commonest case is that we're adding lots of words, so try insert
120  // first and if that reports that the word already exists, remove it.
121  auto res = i->second.emplace(word);
122  if (!res.second) {
123  // word is already in the set, so remove it.
124  i->second.erase(res.first);
125  }
126 }
127 
128 void
130 {
131  if (word.size() <= 1) return;
132 
133  auto i = wordfreq_changes.find(word);
134  if (i != wordfreq_changes.end()) {
135  // Word "word" already exists and has been modified.
136  if (i->second) {
137  i->second += freqinc;
138  return;
139  }
140  // If "word" is currently modified such that it no longer exists, so
141  // we need to execute the code below to re-add trigrams for it.
142  i->second = freqinc;
143  } else {
144  string key = "W"s.append(word);
145  string data;
146  if (get_exact_entry(key, data)) {
147  // Word "word" already exists, so increment its count.
148  Xapian::termcount freq;
149  const char * p = data.data();
150  if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
151  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
152  }
153  wordfreq_changes.emplace(word, freq + freqinc);
154  return;
155  }
156  wordfreq_changes.emplace(word, freqinc);
157  }
158 
159  // Add trigrams for word.
160  toggle_word(word);
161 }
162 
165 {
166  if (word.size() <= 1) return freqdec;
167 
168  auto i = wordfreq_changes.find(word);
169  if (i != wordfreq_changes.end()) {
170  if (i->second == 0) {
171  // Word has already been deleted.
172  return freqdec;
173  }
174  // Word "word" exists and has been modified.
175  if (freqdec < i->second) {
176  i->second -= freqdec;
177  return 0;
178  }
179  freqdec -= i->second;
180 
181  // Mark word as deleted.
182  i->second = 0;
183  } else {
184  string key = "W"s.append(word);
185  string data;
186  if (!get_exact_entry(key, data)) {
187  // This word doesn't exist.
188  return freqdec;
189  }
190 
191  Xapian::termcount freq;
192  const char *p = data.data();
193  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
194  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
195  }
196  if (freqdec < freq) {
197  wordfreq_changes.emplace(word, freq - freqdec);
198  return 0;
199  }
200  freqdec -= freq;
201 
202  // Mark word as deleted.
203  wordfreq_changes.emplace(word, 0);
204  }
205 
206  // Remove trigrams for word.
207  toggle_word(word);
208 
209  return freqdec;
210 }
211 
212 void
214 {
215  fragment buf;
216  // Head:
217  buf[0] = 'H';
218  buf[1] = word[0];
219  buf[2] = word[1];
220  buf[3] = '\0';
221  toggle_fragment(buf, word);
222 
223  // Tail:
224  buf[0] = 'T';
225  buf[1] = word[word.size() - 2];
226  buf[2] = word[word.size() - 1];
227  buf[3] = '\0';
228  toggle_fragment(buf, word);
229 
230  if (word.size() <= 4) {
231  // We also generate 'bookends' for two, three, and four character
232  // terms so we can handle transposition of the middle two characters
233  // of a four character word, substitution or deletion of the middle
234  // character of a three character word, or insertion in the middle of a
235  // two character word.
236  // 'Bookends':
237  buf[0] = 'B';
238  buf[1] = word[0];
239  buf[3] = '\0';
240  toggle_fragment(buf, word);
241  }
242  if (word.size() > 2) {
243  set<fragment> done;
244  // Middles:
245  buf[0] = 'M';
246  for (size_t start = 0; start <= word.size() - 3; ++start) {
247  memcpy(buf.data + 1, word.data() + start, 3);
248  // Don't toggle the same fragment twice or it will cancel out.
249  // Bug fixed in 1.2.6.
250  if (done.insert(buf).second)
251  toggle_fragment(buf, word);
252  }
253  }
254 }
255 
257  bool operator()(const TermList *a, const TermList *b) const {
258  return a->get_approx_size() > b->get_approx_size();
259  }
260 };
261 
262 TermList*
264 {
265  // This should have been handled by Database::get_spelling_suggestion().
266  AssertRel(word.size(),>,1);
267 
268  // Merge any pending changes to disk, but don't call commit() so they
269  // won't be switched live.
270  if (!wordfreq_changes.empty()) merge_changes();
271 
272  vector<TermList*> termlists;
273  try {
274  string data;
275  fragment buf;
276 
277  // Head:
278  buf[0] = 'H';
279  buf[1] = word[0];
280  buf[2] = word[1];
281  if (get_exact_entry(string(buf), data))
282  termlists.push_back(new GlassSpellingTermList(data));
283 
284  // Tail:
285  buf[0] = 'T';
286  buf[1] = word[word.size() - 2];
287  buf[2] = word[word.size() - 1];
288  if (get_exact_entry(string(buf), data))
289  termlists.push_back(new GlassSpellingTermList(data));
290 
291  if (word.size() <= 4) {
292  // We also generate 'bookends' for two, three, and four character
293  // terms so we can handle transposition of the middle two
294  // characters of a four character word, substitution or deletion of
295  // the middle character of a three character word, or insertion in
296  // the middle of a two character word.
297  buf[0] = 'B';
298  buf[1] = word[0];
299  buf[3] = '\0';
300  if (get_exact_entry(string(buf), data))
301  termlists.push_back(new GlassSpellingTermList(data));
302  }
303  if (word.size() > 2) {
304  // Middles:
305  buf[0] = 'M';
306  for (size_t start = 0; start <= word.size() - 3; ++start) {
307  memcpy(buf.data + 1, word.data() + start, 3);
308  if (get_exact_entry(string(buf), data))
309  termlists.push_back(new GlassSpellingTermList(data));
310  }
311 
312  if (word.size() == 3) {
313  // For three letter words, we generate the two "single
314  // transposition" forms too, so that we can produce good
315  // spelling suggestions.
316  // ABC -> BAC
317  buf[1] = word[1];
318  buf[2] = word[0];
319  if (get_exact_entry(string(buf), data))
320  termlists.push_back(new GlassSpellingTermList(data));
321  // ABC -> ACB
322  buf[1] = word[0];
323  buf[2] = word[2];
324  buf[3] = word[1];
325  if (get_exact_entry(string(buf), data))
326  termlists.push_back(new GlassSpellingTermList(data));
327  }
328  } else {
329  Assert(word.size() == 2);
330  // For two letter words, we generate H and T terms for the
331  // transposed form so that we can produce good spelling
332  // suggestions.
333  // AB -> BA
334  buf[0] = 'H';
335  buf[1] = word[1];
336  buf[2] = word[0];
337  if (get_exact_entry(string(buf), data))
338  termlists.push_back(new GlassSpellingTermList(data));
339  buf[0] = 'T';
340  if (get_exact_entry(string(buf), data))
341  termlists.push_back(new GlassSpellingTermList(data));
342  }
343 
344  return make_termlist_merger(termlists);
345  } catch (...) {
346  // Make sure we delete all the TermList objects to avoid leaking
347  // memory.
348  for (auto& t : termlists) {
349  delete t;
350  }
351  throw;
352  }
353 }
354 
357 {
358  auto i = wordfreq_changes.find(word);
359  if (i != wordfreq_changes.end()) {
360  // Modified frequency for word:
361  return i->second;
362  }
363 
364  string key = "W"s.append(word);
365  string data;
366  if (get_exact_entry(key, data)) {
367  // Word "word" already exists.
368  Xapian::termcount freq;
369  const char *p = data.data();
370  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
371  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
372  }
373  return freq;
374  }
375 
376  return 0;
377 }
378 
380 
383 {
384  // This is only used to decide how to build a OR-tree of TermList objects
385  // so we just need to return "sizes" which are ordered roughly correctly.
386  return data.size();
387 }
388 
391 {
392  return 1;
393 }
394 
397 {
398  return 1;
399 }
400 
401 TermList *
403 {
404  if (p == data.size()) {
405  return this;
406  }
407  if (!current_term.empty()) {
408  current_term.resize(uint8_t(data[p++]) ^ MAGIC_XOR_VALUE);
409  }
410  size_t add;
411  if (p == data.size() ||
412  (add = uint8_t(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
413  throw Xapian::DatabaseCorruptError("Bad spelling termlist");
414  current_term.append(data.data() + p + 1, add);
415  p += add + 1;
416  return NULL;
417 }
418 
419 TermList*
421 {
422  while (current_term < term) {
424  return this;
425  }
426  return NULL;
427 }
428 
431 {
432  throw Xapian::UnimplementedError("GlassSpellingTermList::positionlist_count() not implemented");
433 }
434 
437 {
438  throw Xapian::UnimplementedError("GlassSpellingTermList::positionlist_begin() not implemented");
439 }
#define MAGIC_XOR_VALUE
Xapian::termcount remove_word(std::string_view word, Xapian::termcount freqdec)
void merge_changes()
Merge in batched-up changes.
TermList * open_termlist(std::string_view word)
void toggle_fragment(Glass::fragment frag, std::string_view word)
void toggle_word(std::string_view word)
void add_word(std::string_view word, Xapian::termcount freqinc)
Xapian::doccount get_word_frequency(std::string_view word) const
The list of words containing a particular trigram.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
TermList * skip_to(std::string_view term)
Skip forward to the specified term.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
PositionList * positionlist_begin() const
Return PositionList for the current position.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
void append(const std::string &word)
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Abstract base class for termlists.
Definition: termlist.h:42
virtual Xapian::termcount get_approx_size() const =0
Return approximate size of this termlist.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
string term
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
Collate statistics and calculate the term weights for the ESet.
Spelling correction data for a glass database.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:118
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:100
bool operator()(const TermList *a, const TermList *b) const
Build tree to merge TermList objects.
TermList * make_termlist_merger(std::vector< TermList * > &termlists)
typedefs for Xapian