xapian-core  1.4.26
glass_spelling.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2015 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/error.h>
24 #include <xapian/types.h>
25 
26 #include "expand/expandweight.h"
27 #include "glass_spelling.h"
28 #include "omassert.h"
29 #include "expand/ortermlist.h"
30 #include "pack.h"
31 
32 #include "../prefix_compressed_strings.h"
33 
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40 
41 using namespace Glass;
42 using namespace std;
43 
44 void
46 {
47  for (auto i : termlist_deltas) {
48  const string& key = i.first;
49  const set<string>& changes = i.second;
50 
51  auto d = changes.begin();
52  if (d == changes.end()) continue;
53 
54  string updated;
55  string current;
56  PrefixCompressedStringWriter out(updated);
57  if (get_exact_entry(key, current)) {
58  PrefixCompressedStringItor in(current);
59  updated.reserve(current.size()); // FIXME plus some?
60  while (!in.at_end() && d != changes.end()) {
61  const string & word = *in;
62  Assert(d != changes.end());
63  int cmp = word.compare(*d);
64  if (cmp < 0) {
65  out.append(word);
66  ++in;
67  } else if (cmp > 0) {
68  out.append(*d);
69  ++d;
70  } else {
71  // If an existing entry is in the changes list, that means
72  // we should remove it.
73  ++in;
74  ++d;
75  }
76  }
77  if (!in.at_end()) {
78  // FIXME : easy to optimise this to a fix-up and substring copy.
79  while (!in.at_end()) {
80  out.append(*in++);
81  }
82  }
83  }
84  while (d != changes.end()) {
85  out.append(*d++);
86  }
87  if (!updated.empty()) {
88  add(key, updated);
89  } else {
90  del(key);
91  }
92  }
93  termlist_deltas.clear();
94 
95  map<string, Xapian::termcount>::const_iterator j;
96  for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
97  string key = "W" + j->first;
98  Xapian::termcount wordfreq = j->second;
99  if (wordfreq) {
100  string tag;
101  pack_uint_last(tag, wordfreq);
102  add(key, tag);
103  if (wordfreq > wordfreq_upper_bound)
104  wordfreq_upper_bound = wordfreq;
105  } else {
106  del(key);
107  }
108  }
109  wordfreq_changes.clear();
110 }
111 
112 void
114 {
115  auto i = termlist_deltas.find(frag);
116  if (i == termlist_deltas.end()) {
117  i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
118  }
119  // The commonest case is that we're adding lots of words, so try insert
120  // first and if that reports that the word already exists, remove it.
121  auto res = i->second.insert(word);
122  if (!res.second) {
123  // word is already in the set, so remove it.
124  i->second.erase(res.first);
125  }
126 }
127 
128 void
129 GlassSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
130 {
131  if (word.size() <= 1) return;
132 
133  map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
134  if (i != wordfreq_changes.end()) {
135  // Word "word" already exists and has been modified.
136  if (i->second) {
137  i->second += freqinc;
138  return;
139  }
140  // If "word" is currently modified such that it no longer exists, so
141  // we need to execute the code below to re-add trigrams for it.
142  i->second = freqinc;
143  } else {
144  string key = "W" + word;
145  string data;
146  if (get_exact_entry(key, data)) {
147  // Word "word" already exists, so increment its count.
148  Xapian::termcount freq;
149  const char * p = data.data();
150  if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
151  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
152  }
153  wordfreq_changes[word] = freq + freqinc;
154  return;
155  }
156  wordfreq_changes[word] = freqinc;
157  }
158 
159  // Add trigrams for word.
160  toggle_word(word);
161 }
162 
163 void
165 {
166  if (word.size() <= 1) return;
167 
168  map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
169  if (i != wordfreq_changes.end()) {
170  if (i->second == 0) {
171  // Word has already been deleted.
172  return;
173  }
174  // Word "word" exists and has been modified.
175  if (freqdec < i->second) {
176  i->second -= freqdec;
177  return;
178  }
179 
180  // Mark word as deleted.
181  i->second = 0;
182  } else {
183  string key = "W" + word;
184  string data;
185  if (!get_exact_entry(key, data)) {
186  // This word doesn't exist.
187  return;
188  }
189 
190  Xapian::termcount freq;
191  const char *p = data.data();
192  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
193  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
194  }
195  if (freqdec < freq) {
196  wordfreq_changes[word] = freq - freqdec;
197  return;
198  }
199  // Mark word as deleted.
200  wordfreq_changes[word] = 0;
201  }
202 
203  // Remove trigrams for word.
204  toggle_word(word);
205 }
206 
207 void
209 {
210  fragment buf;
211  // Head:
212  buf[0] = 'H';
213  buf[1] = word[0];
214  buf[2] = word[1];
215  buf[3] = '\0';
216  toggle_fragment(buf, word);
217 
218  // Tail:
219  buf[0] = 'T';
220  buf[1] = word[word.size() - 2];
221  buf[2] = word[word.size() - 1];
222  buf[3] = '\0';
223  toggle_fragment(buf, word);
224 
225  if (word.size() <= 4) {
226  // We also generate 'bookends' for two, three, and four character
227  // terms so we can handle transposition of the middle two characters
228  // of a four character word, substitution or deletion of the middle
229  // character of a three character word, or insertion in the middle of a
230  // two character word.
231  // 'Bookends':
232  buf[0] = 'B';
233  buf[1] = word[0];
234  buf[3] = '\0';
235  toggle_fragment(buf, word);
236  }
237  if (word.size() > 2) {
238  set<fragment> done;
239  // Middles:
240  buf[0] = 'M';
241  for (size_t start = 0; start <= word.size() - 3; ++start) {
242  memcpy(buf.data + 1, word.data() + start, 3);
243  // Don't toggle the same fragment twice or it will cancel out.
244  // Bug fixed in 1.2.6.
245  if (done.insert(buf).second)
246  toggle_fragment(buf, word);
247  }
248  }
249 }
250 
252  bool operator()(const TermList *a, const TermList *b) const {
253  return a->get_approx_size() > b->get_approx_size();
254  }
255 };
256 
257 TermList *
259 {
260  // This should have been handled by Database::get_spelling_suggestion().
261  AssertRel(word.size(),>,1);
262 
263  // Merge any pending changes to disk, but don't call commit() so they
264  // won't be switched live.
265  if (!wordfreq_changes.empty()) merge_changes();
266 
267  // Build a priority queue of TermList objects which returns those of
268  // greatest approximate size first.
269  priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
270  try {
271  string data;
272  fragment buf;
273 
274  // Head:
275  buf[0] = 'H';
276  buf[1] = word[0];
277  buf[2] = word[1];
278  if (get_exact_entry(string(buf), data))
279  pq.push(new GlassSpellingTermList(data));
280 
281  // Tail:
282  buf[0] = 'T';
283  buf[1] = word[word.size() - 2];
284  buf[2] = word[word.size() - 1];
285  if (get_exact_entry(string(buf), data))
286  pq.push(new GlassSpellingTermList(data));
287 
288  if (word.size() <= 4) {
289  // We also generate 'bookends' for two, three, and four character
290  // terms so we can handle transposition of the middle two
291  // characters of a four character word, substitution or deletion of
292  // the middle character of a three character word, or insertion in
293  // the middle of a two character word.
294  buf[0] = 'B';
295  buf[1] = word[0];
296  buf[3] = '\0';
297  if (get_exact_entry(string(buf), data))
298  pq.push(new GlassSpellingTermList(data));
299  }
300  if (word.size() > 2) {
301  // Middles:
302  buf[0] = 'M';
303  for (size_t start = 0; start <= word.size() - 3; ++start) {
304  memcpy(buf.data + 1, word.data() + start, 3);
305  if (get_exact_entry(string(buf), data))
306  pq.push(new GlassSpellingTermList(data));
307  }
308 
309  if (word.size() == 3) {
310  // For three letter words, we generate the two "single
311  // transposition" forms too, so that we can produce good
312  // spelling suggestions.
313  // ABC -> BAC
314  buf[1] = word[1];
315  buf[2] = word[0];
316  if (get_exact_entry(string(buf), data))
317  pq.push(new GlassSpellingTermList(data));
318  // ABC -> ACB
319  buf[1] = word[0];
320  buf[2] = word[2];
321  buf[3] = word[1];
322  if (get_exact_entry(string(buf), data))
323  pq.push(new GlassSpellingTermList(data));
324  }
325  } else {
326  Assert(word.size() == 2);
327  // For two letter words, we generate H and T terms for the
328  // transposed form so that we can produce good spelling
329  // suggestions.
330  // AB -> BA
331  buf[0] = 'H';
332  buf[1] = word[1];
333  buf[2] = word[0];
334  if (get_exact_entry(string(buf), data))
335  pq.push(new GlassSpellingTermList(data));
336  buf[0] = 'T';
337  if (get_exact_entry(string(buf), data))
338  pq.push(new GlassSpellingTermList(data));
339  }
340 
341  if (pq.empty()) return NULL;
342 
343  // Build up an OrTermList tree by combine leaves and/or branches in
344  // pairs. The tree is balanced by the approximated sizes of the leaf
345  // GlassSpellingTermList objects - the way the tree is built are very
346  // similar to how an optimal Huffman code is often constructed.
347  //
348  // Balancing the tree like this should tend to minimise the amount of
349  // work done.
350  while (pq.size() > 1) {
351  // Build the tree such that left is always >= right so that
352  // OrTermList can rely on this when trying to minimise work.
353  TermList * termlist = pq.top();
354  pq.pop();
355 
356  termlist = new OrTermList(pq.top(), termlist);
357  pq.pop();
358  pq.push(termlist);
359  }
360 
361  return pq.top();
362  } catch (...) {
363  // Make sure we delete all the TermList objects to avoid leaking
364  // memory.
365  while (!pq.empty()) {
366  delete pq.top();
367  pq.pop();
368  }
369  throw;
370  }
371 }
372 
374 GlassSpellingTable::get_word_frequency(const string & word) const
375 {
376  map<string, Xapian::termcount>::const_iterator i;
377  i = wordfreq_changes.find(word);
378  if (i != wordfreq_changes.end()) {
379  // Modified frequency for word:
380  return i->second;
381  }
382 
383  string key = "W" + word;
384  string data;
385  if (get_exact_entry(key, data)) {
386  // Word "word" already exists.
387  Xapian::termcount freq;
388  const char *p = data.data();
389  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
390  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
391  }
392  return freq;
393  }
394 
395  return 0;
396 }
397 
399 
402 {
403  // This is only used to decide how to build a OR-tree of TermList objects
404  // so we just need to return "sizes" which are ordered roughly correctly.
405  return data.size();
406 }
407 
408 std::string
410 {
411  return current_term;
412 }
413 
416 {
417  return 1;
418 }
419 
422 {
423  return 1;
424 }
425 
426 TermList *
428 {
429  if (p == data.size()) {
430  p = 0;
431  data.resize(0);
432  return NULL;
433  }
434  if (!current_term.empty()) {
435  current_term.resize(uint8_t(data[p++]) ^ MAGIC_XOR_VALUE);
436  }
437  size_t add;
438  if (p == data.size() ||
439  (add = uint8_t(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
440  throw Xapian::DatabaseCorruptError("Bad spelling termlist");
441  current_term.append(data.data() + p + 1, add);
442  p += add + 1;
443  return NULL;
444 }
445 
446 TermList *
447 GlassSpellingTermList::skip_to(const string & term)
448 {
449  while (!data.empty() && current_term < term) {
451  }
452  return NULL;
453 }
454 
455 bool
457 {
458  return data.empty();
459 }
460 
463 {
464  throw Xapian::UnimplementedError("GlassSpellingTermList::positionlist_count() not implemented");
465 }
466 
469 {
470  throw Xapian::UnimplementedError("GlassSpellingTermList::positionlist_begin() not implemented");
471 }
#define Assert(COND)
Definition: omassert.h:122
typedefs for Xapian
#define AssertRel(A, REL, B)
Definition: omassert.h:123
void toggle_fragment(Glass::fragment frag, const std::string &word)
void merge_changes()
Merge in batched-up changes.
Merge two TermList objects using an OR operation.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
virtual Xapian::termcount get_approx_size() const =0
Return approximate size of this termlist.
Hierarchy of classes which Xapian can throw as exceptions.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
TermList * skip_to(const std::string &term)
Skip forward to the specified term.
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:93
void add_word(const std::string &word, Xapian::termcount freqinc)
Collate statistics and calculate the term weights for the ESet.
bool at_end() const
Return true if the current position is past the last term in this list.
Class for iterating over term positions.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
TermList * open_termlist(const std::string &word)
std::string get_termname() const
Return the termname at the current position.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:409
void remove_word(const std::string &word, Xapian::termcount freqdec)
void toggle_word(const std::string &word)
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
#define MAGIC_XOR_VALUE
Xapian::doccount get_word_frequency(const std::string &word) const
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
void append(const std::string &word)
Various assertion macros.
Spelling correction data for a glass database.
bool operator()(const TermList *a, const TermList *b) const
The list of words containing a particular trigram.
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325