xapian-core  1.4.21
chert_spelling.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/error.h>
24 #include <xapian/types.h>
25 
26 #include "expand/expandweight.h"
27 #include "chert_spelling.h"
28 #include "omassert.h"
29 #include "expand/ortermlist.h"
30 #include "pack.h"
31 
32 #include "../prefix_compressed_strings.h"
33 
34 #include <algorithm>
35 #include <map>
36 #include <queue>
37 #include <vector>
38 #include <set>
39 #include <string>
40 
41 using namespace std;
42 
43 void
45 {
46  for (auto i : termlist_deltas) {
47  const string& key = i.first;
48  const set<string>& changes = i.second;
49 
50  auto d = changes.begin();
51  if (d == changes.end()) continue;
52 
53  string updated;
54  string current;
55  PrefixCompressedStringWriter out(updated);
56  if (get_exact_entry(key, current)) {
57  PrefixCompressedStringItor in(current);
58  updated.reserve(current.size()); // FIXME plus some?
59  while (!in.at_end() && d != changes.end()) {
60  const string & word = *in;
61  Assert(d != changes.end());
62  int cmp = word.compare(*d);
63  if (cmp < 0) {
64  out.append(word);
65  ++in;
66  } else if (cmp > 0) {
67  out.append(*d);
68  ++d;
69  } else {
70  // If an existing entry is in the changes list, that means
71  // we should remove it.
72  ++in;
73  ++d;
74  }
75  }
76  if (!in.at_end()) {
77  // FIXME : easy to optimise this to a fix-up and substring copy.
78  while (!in.at_end()) {
79  out.append(*in++);
80  }
81  }
82  }
83  while (d != changes.end()) {
84  out.append(*d++);
85  }
86  if (!updated.empty()) {
87  add(key, updated);
88  } else {
89  del(key);
90  }
91  }
92  termlist_deltas.clear();
93 
94  map<string, Xapian::termcount>::const_iterator j;
95  for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
96  string key = "W" + j->first;
97  if (j->second) {
98  string tag;
99  pack_uint_last(tag, j->second);
100  add(key, tag);
101  } else {
102  del(key);
103  }
104  }
105  wordfreq_changes.clear();
106 }
107 
108 void
110 {
111  auto i = termlist_deltas.find(frag);
112  if (i == termlist_deltas.end()) {
113  i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
114  }
115  // The commonest case is that we're adding lots of words, so try insert
116  // first and if that reports that the word already exists, remove it.
117  auto res = i->second.insert(word);
118  if (!res.second) {
119  // word is already in the set, so remove it.
120  i->second.erase(res.first);
121  }
122 }
123 
124 void
125 ChertSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
126 {
127  if (word.size() <= 1) return;
128 
129  map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
130  if (i != wordfreq_changes.end()) {
131  // Word "word" already exists and has been modified.
132  if (i->second) {
133  i->second += freqinc;
134  return;
135  }
136  // If "word" is currently modified such that it no longer exists, so
137  // we need to execute the code below to re-add trigrams for it.
138  i->second = freqinc;
139  } else {
140  string key = "W" + word;
141  string data;
142  if (get_exact_entry(key, data)) {
143  // Word "word" already exists, so increment its count.
144  Xapian::termcount freq;
145  const char * p = data.data();
146  if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
147  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
148  }
149  wordfreq_changes[word] = freq + freqinc;
150  return;
151  }
152  wordfreq_changes[word] = freqinc;
153  }
154 
155  // Add trigrams for word.
156  toggle_word(word);
157 }
158 
159 void
161 {
162  if (word.size() <= 1) return;
163 
164  map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
165  if (i != wordfreq_changes.end()) {
166  if (i->second == 0) {
167  // Word has already been deleted.
168  return;
169  }
170  // Word "word" exists and has been modified.
171  if (freqdec < i->second) {
172  i->second -= freqdec;
173  return;
174  }
175 
176  // Mark word as deleted.
177  i->second = 0;
178  } else {
179  string key = "W" + word;
180  string data;
181  if (!get_exact_entry(key, data)) {
182  // This word doesn't exist.
183  return;
184  }
185 
186  Xapian::termcount freq;
187  const char *p = data.data();
188  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
189  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
190  }
191  if (freqdec < freq) {
192  wordfreq_changes[word] = freq - freqdec;
193  return;
194  }
195  // Mark word as deleted.
196  wordfreq_changes[word] = 0;
197  }
198 
199  // Remove trigrams for word.
200  toggle_word(word);
201 }
202 
203 void
205 {
206  fragment buf;
207  // Head:
208  buf[0] = 'H';
209  buf[1] = word[0];
210  buf[2] = word[1];
211  buf[3] = '\0';
212  toggle_fragment(buf, word);
213 
214  // Tail:
215  buf[0] = 'T';
216  buf[1] = word[word.size() - 2];
217  buf[2] = word[word.size() - 1];
218  buf[3] = '\0';
219  toggle_fragment(buf, word);
220 
221  if (word.size() <= 4) {
222  // We also generate 'bookends' for two, three, and four character
223  // terms so we can handle transposition of the middle two characters
224  // of a four character word, substitution or deletion of the middle
225  // character of a three character word, or insertion in the middle of a
226  // two character word.
227  // 'Bookends':
228  buf[0] = 'B';
229  buf[1] = word[0];
230  buf[3] = '\0';
231  toggle_fragment(buf, word);
232  }
233  if (word.size() > 2) {
234  set<fragment> done;
235  // Middles:
236  buf[0] = 'M';
237  for (size_t start = 0; start <= word.size() - 3; ++start) {
238  memcpy(buf.data + 1, word.data() + start, 3);
239  // Don't toggle the same fragment twice or it will cancel out.
240  // Bug fixed in 1.2.6.
241  if (done.insert(buf).second)
242  toggle_fragment(buf, word);
243  }
244  }
245 }
246 
248  bool operator()(const TermList *a, const TermList *b) const {
249  return a->get_approx_size() > b->get_approx_size();
250  }
251 };
252 
253 TermList *
255 {
256  // This should have been handled by Database::get_spelling_suggestion().
257  AssertRel(word.size(),>,1);
258 
259  // Merge any pending changes to disk, but don't call commit() so they
260  // won't be switched live.
261  if (!wordfreq_changes.empty()) merge_changes();
262 
263  // Build a priority queue of TermList objects which returns those of
264  // greatest approximate size first.
265  priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
266  try {
267  string data;
268  fragment buf;
269 
270  // Head:
271  buf[0] = 'H';
272  buf[1] = word[0];
273  buf[2] = word[1];
274  if (get_exact_entry(string(buf), data))
275  pq.push(new ChertSpellingTermList(data));
276 
277  // Tail:
278  buf[0] = 'T';
279  buf[1] = word[word.size() - 2];
280  buf[2] = word[word.size() - 1];
281  if (get_exact_entry(string(buf), data))
282  pq.push(new ChertSpellingTermList(data));
283 
284  if (word.size() <= 4) {
285  // We also generate 'bookends' for two, three, and four character
286  // terms so we can handle transposition of the middle two
287  // characters of a four character word, substitution or deletion of
288  // the middle character of a three character word, or insertion in
289  // the middle of a two character word.
290  buf[0] = 'B';
291  buf[1] = word[0];
292  buf[3] = '\0';
293  if (get_exact_entry(string(buf), data))
294  pq.push(new ChertSpellingTermList(data));
295  }
296  if (word.size() > 2) {
297  // Middles:
298  buf[0] = 'M';
299  for (size_t start = 0; start <= word.size() - 3; ++start) {
300  memcpy(buf.data + 1, word.data() + start, 3);
301  if (get_exact_entry(string(buf), data))
302  pq.push(new ChertSpellingTermList(data));
303  }
304 
305  if (word.size() == 3) {
306  // For three letter words, we generate the two "single
307  // transposition" forms too, so that we can produce good
308  // spelling suggestions.
309  // ABC -> BAC
310  buf[1] = word[1];
311  buf[2] = word[0];
312  if (get_exact_entry(string(buf), data))
313  pq.push(new ChertSpellingTermList(data));
314  // ABC -> ACB
315  buf[1] = word[0];
316  buf[2] = word[2];
317  buf[3] = word[1];
318  if (get_exact_entry(string(buf), data))
319  pq.push(new ChertSpellingTermList(data));
320  }
321  } else {
322  Assert(word.size() == 2);
323  // For two letter words, we generate H and T terms for the
324  // transposed form so that we can produce good spelling
325  // suggestions.
326  // AB -> BA
327  buf[0] = 'H';
328  buf[1] = word[1];
329  buf[2] = word[0];
330  if (get_exact_entry(string(buf), data))
331  pq.push(new ChertSpellingTermList(data));
332  buf[0] = 'T';
333  if (get_exact_entry(string(buf), data))
334  pq.push(new ChertSpellingTermList(data));
335  }
336 
337  if (pq.empty()) return NULL;
338 
339  // Build up an OrTermList tree by combine leaves and/or branches in
340  // pairs. The tree is balanced by the approximated sizes of the leaf
341  // ChertSpellingTermList objects - the way the tree is built are very
342  // similar to how an optimal Huffman code is often constructed.
343  //
344  // Balancing the tree like this should tend to minimise the amount of
345  // work done.
346  while (pq.size() > 1) {
347  // Build the tree such that left is always >= right so that
348  // OrTermList can rely on this when trying to minimise work.
349  TermList * termlist = pq.top();
350  pq.pop();
351 
352  termlist = new OrTermList(pq.top(), termlist);
353  pq.pop();
354  pq.push(termlist);
355  }
356 
357  return pq.top();
358  } catch (...) {
359  // Make sure we delete all the TermList objects to avoid leaking
360  // memory.
361  while (!pq.empty()) {
362  delete pq.top();
363  pq.pop();
364  }
365  throw;
366  }
367 }
368 
370 ChertSpellingTable::get_word_frequency(const string & word) const
371 {
372  map<string, Xapian::termcount>::const_iterator i;
373  i = wordfreq_changes.find(word);
374  if (i != wordfreq_changes.end()) {
375  // Modified frequency for word:
376  return i->second;
377  }
378 
379  string key = "W" + word;
380  string data;
381  if (get_exact_entry(key, data)) {
382  // Word "word" already exists.
383  Xapian::termcount freq;
384  const char *p = data.data();
385  if (!unpack_uint_last(&p, p + data.size(), &freq)) {
386  throw Xapian::DatabaseCorruptError("Bad spelling word freq");
387  }
388  return freq;
389  }
390 
391  return 0;
392 }
393 
395 
398 {
399  // This is only used to decide how to build a OR-tree of TermList objects
400  // so we just need to return "sizes" which are ordered roughly correctly.
401  return data.size();
402 }
403 
404 std::string
406 {
407  return current_term;
408 }
409 
412 {
413  return 1;
414 }
415 
418 {
419  return 1;
420 }
421 
422 TermList *
424 {
425  if (p == data.size()) {
426  p = 0;
427  data.resize(0);
428  return NULL;
429  }
430  if (!current_term.empty()) {
431  if (p == data.size())
432  throw Xapian::DatabaseCorruptError("Bad spelling termlist");
433  current_term.resize(uint8_t(data[p++]) ^ MAGIC_XOR_VALUE);
434  }
435  size_t add;
436  if (p == data.size() ||
437  (add = uint8_t(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
438  throw Xapian::DatabaseCorruptError("Bad spelling termlist");
439  current_term.append(data.data() + p + 1, add);
440  p += add + 1;
441  return NULL;
442 }
443 
444 TermList *
445 ChertSpellingTermList::skip_to(const string & term)
446 {
447  while (!data.empty() && current_term < term) {
449  }
450  return NULL;
451 }
452 
453 bool
455 {
456  return data.empty();
457 }
458 
461 {
462  throw Xapian::UnimplementedError("ChertSpellingTermList::positionlist_count() not implemented");
463 }
464 
467 {
468  throw Xapian::UnimplementedError("ChertSpellingTermList::positionlist_begin() not implemented");
469 }
The list of words containing a particular trigram.
#define Assert(COND)
Definition: omassert.h:122
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
typedefs for Xapian
char data[4]
#define AssertRel(A, REL, B)
Definition: omassert.h:123
Merge two TermList objects using an OR operation.
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Xapian::doccount get_word_frequency(const std::string &word) const
virtual Xapian::termcount get_approx_size() const =0
Return approximate size of this termlist.
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Hierarchy of classes which Xapian can throw as exceptions.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:72
void toggle_fragment(fragment frag, const std::string &word)
TermList * open_termlist(const std::string &word)
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:93
Collate statistics and calculate the term weights for the ESet.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
std::string get_termname() const
Return the termname at the current position.
Class for iterating over term positions.
void toggle_word(const std::string &word)
bool at_end() const
Return true if the current position is past the last term in this list.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:409
Spelling correction data for a chert database.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
#define MAGIC_XOR_VALUE
TermList * skip_to(const std::string &term)
Skip forward to the specified term.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Pack types into strings and unpack them again.
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111
void append(const std::string &word)
Various assertion macros.
void add_word(const std::string &word, Xapian::termcount freqinc)
bool operator()(const TermList *a, const TermList *b) const
void remove_word(const std::string &word, Xapian::termcount freqdec)
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325