sourcedoc/html/chert__spelling_8cc_source.html

 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */

 #include <config.h>

 #include <xapian/error.h>
 #include <xapian/types.h>

 #include "expand/expandweight.h"
 #include "chert_spelling.h"
 #include "omassert.h"
 #include "expand/ortermlist.h"
 #include "pack.h"

 #include "../prefix_compressed_strings.h"

 #include <algorithm>
 #include <map>
 #include <queue>
 #include <vector>
 #include <set>
 #include <string>

 using namespace std;

 /** Apply all buffered spelling changes to the underlying table.
  *
  *  For every key in termlist_deltas the stored word list (if any) is merged
  *  with the pending set of words: a word present in both is dropped, a word
  *  present in only one of them is kept.  The result is written back with
  *  add(), or the entry is removed with del() if nothing is left.
  *
  *  Each entry in wordfreq_changes is then written under the key "W" + word,
  *  or deleted if its frequency is zero.
  *
  *  Both buffers are cleared once they have been processed.
  *
  *  NOTE(review): the merge assumes the stored list iterates in the same
  *  order as std::set<string> - confirm against the code which writes it.
  */
 void
 ChertSpellingTable::merge_changes()
 {
     // Bind by reference: iterating by value would copy every fragment's
     // whole set of pending words.
     for (const auto& delta : termlist_deltas) {
         const string& key = delta.first;
         const set<string>& changes = delta.second;

         auto d = changes.begin();
         const auto d_end = changes.end();
         // Nothing pending for this fragment (toggles cancelled out).
         if (d == d_end) continue;

         string updated;
         string current;
         PrefixCompressedStringWriter out(updated);
         if (get_exact_entry(key, current)) {
             PrefixCompressedStringItor in(current);
             updated.reserve(current.size()); // FIXME plus some?
             while (!in.at_end() && d != d_end) {
                 const string & word = *in;
                 int cmp = word.compare(*d);
                 if (cmp < 0) {
                     // Only in the stored list: keep it.
                     out.append(word);
                     ++in;
                 } else if (cmp > 0) {
                     // Only in the changes: it's a new word.
                     out.append(*d);
                     ++d;
                 } else {
                     // If an existing entry is in the changes list, that means
                     // we should remove it.
                     ++in;
                     ++d;
                 }
             }
             // Copy whatever is left of the stored list.
             // FIXME : easy to optimise this to a fix-up and substring copy.
             while (!in.at_end()) {
                 out.append(*in);
                 ++in;
             }
         }
         // Any remaining changes sort after every stored word.
         for (; d != d_end; ++d) {
             out.append(*d);
         }
         if (!updated.empty()) {
             add(key, updated);
         } else {
             del(key);
         }
     }
     termlist_deltas.clear();

     for (const auto& change : wordfreq_changes) {
         string key = "W" + change.first;
         if (change.second) {
             string tag;
             pack_uint_last(tag, change.second);
             add(key, tag);
         } else {
             // A frequency of zero marks the word as deleted.
             del(key);
         }
     }
     wordfreq_changes.clear();
 }

 /** Toggle membership of @a word in the pending change set for @a frag.
  *
  *  @param frag  Fragment whose pending set should be updated (an empty set
  *               is created if the fragment has none yet).
  *  @param word  Word to add to the set, or to remove if already present.
  */
 void
 ChertSpellingTable::toggle_fragment(fragment frag, const string & word)
 {
     // insert() leaves an existing entry untouched and returns an iterator to
     // it, so this both finds and (if necessary) creates the fragment's set.
     auto& pending =
         termlist_deltas.insert(make_pair(frag, set<string>())).first->second;

     // Adding lots of words is the commonest case, so attempt the insertion
     // first and only undo it if the word turns out to be there already.
     auto outcome = pending.insert(word);
     if (outcome.second) return;

     // The word was already pending, so the two toggles cancel out.
     pending.erase(outcome.first);
 }

 /** Increase the buffered frequency of @a word, adding it if necessary.
  *
  *  Words of one character or fewer are ignored.  If the word does not
  *  currently exist (either it has no "W" entry, or a pending change has set
  *  its frequency to zero) its fragments are toggled on via toggle_word().
  *
  *  @param word     The word to add.
  *  @param freqinc  How much to add to the word's frequency.  Zero is a
  *                  no-op: otherwise a missing word would be recorded with
  *                  frequency zero (i.e. deleted) while its fragments were
  *                  still toggled on.
  *
  *  @exception Xapian::DatabaseCorruptError  The stored frequency could not
  *             be decoded, or was zero.
  */
 void
 ChertSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
 {
     if (word.size() <= 1) return;

     // Adding nothing must not change anything - see the @param note above.
     if (freqinc == 0) return;

     auto i = wordfreq_changes.find(word);
     if (i != wordfreq_changes.end()) {
         // Word "word" already exists and has been modified.
         if (i->second) {
             i->second += freqinc;
             return;
         }
         // "word" is currently modified such that it no longer exists, so we
         // need to execute the code below to re-add trigrams for it.
         i->second = freqinc;
     } else {
         string key = "W" + word;
         string data;
         if (get_exact_entry(key, data)) {
             // Word "word" already exists, so increment its count.
             Xapian::termcount freq;
             const char * p = data.data();
             if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
                 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
             }
             wordfreq_changes[word] = freq + freqinc;
             return;
         }
         // Brand new word.
         wordfreq_changes[word] = freqinc;
     }

     // Add trigrams for word.
     toggle_word(word);
 }

 void
 ChertSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
 {
     if (word.size() <= 1) return;

     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
     if (i != wordfreq_changes.end()) {
         if (i->second == 0) {
             // Word has already been deleted.
             return;
         }
         // Word "word" exists and has been modified.
         if (freqdec < i->second) {
             i->second -= freqdec;
             return;
         }

         // Mark word as deleted.
         i->second = 0;
     } else {
         string key = "W" + word;
         string data;
         if (!get_exact_entry(key, data)) {
             // This word doesn't exist.
             return;
         }

         Xapian::termcount freq;
         const char *p = data.data();
         if (!unpack_uint_last(&p, p + data.size(), &freq)) {
             throw Xapian::DatabaseCorruptError("Bad spelling word freq");
         }
         if (freqdec < freq) {
             wordfreq_changes[word] = freq - freqdec;
             return;
         }
         // Mark word as deleted.
         wordfreq_changes[word] = 0;
     }

     // Remove trigrams for word.
     toggle_word(word);
 }

 void
 ChertSpellingTable::toggle_word(const string & word)
 {
     fragment buf;
     // Head:
     buf[0] = 'H';
     buf[1] = word[0];
     buf[2] = word[1];
     buf[3] = '\0';
     toggle_fragment(buf, word);

     // Tail:
     buf[0] = 'T';
     buf[1] = word[word.size() - 2];
     buf[2] = word[word.size() - 1];
     buf[3] = '\0';
     toggle_fragment(buf, word);

     if (word.size() <= 4) {
         // We also generate 'bookends' for two, three, and four character
         // terms so we can handle transposition of the middle two characters
         // of a four character word, substitution or deletion of the middle
         // character of a three character word, or insertion in the middle of a
         // two character word.
         // 'Bookends':
         buf[0] = 'B';
         buf[1] = word[0];
         buf[3] = '\0';
         toggle_fragment(buf, word);
     }
     if (word.size() > 2) {
         set<fragment> done;
         // Middles:
         buf[0] = 'M';
         for (size_t start = 0; start <= word.size() - 3; ++start) {
             memcpy(buf.data + 1, word.data() + start, 3);
             // Don't toggle the same fragment twice or it will cancel out.
             // Bug fixed in 1.2.6.
             if (done.insert(buf).second)
                 toggle_fragment(buf, word);
         }
     }
 }

 /** Comparison functor: true if @a a has a larger approximate size than @a b.
  *
  *  Used as the ordering of a std::priority_queue, this makes top() the
  *  TermList with the smallest approximate size.
  */
 struct TermListGreaterApproxSize {
     bool operator()(const TermList *a, const TermList *b) const {
         return b->get_approx_size() < a->get_approx_size();
     }
 };

 /** Open a termlist of the words sharing a fragment with @a word.
  *
  *  Looks up the head, tail, bookend and middle fragment entries for
  *  @a word (plus the transposed forms for two and three character words)
  *  and combines every entry found into a tree of OrTermList objects.
  *  Pending changes are merged first (without committing) so that the
  *  lookups see them.
  *
  *  @param word  The word to look up; must be longer than one character.
  *
  *  @return  A newly allocated TermList (ownership presumably passes to the
  *           caller - TODO confirm), or NULL if no fragment entry exists.
  *
  *  If an exception is thrown, every TermList allocated so far is deleted
  *  before it propagates.
  */
 TermList *
 ChertSpellingTable::open_termlist(const string & word)
 {
     // This should have been handled by Database::get_spelling_suggestion().
     AssertRel(word.size(),>,1);

     // Merge any pending changes to disk, but don't call commit() so they
     // won't be switched live.
     if (!wordfreq_changes.empty()) merge_changes();

     // Build a priority queue of TermList objects.  The comparator is
     // "greater than", so top() is the one with the SMALLEST approximate
     // size.
     priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
     try {
         string data;
         fragment buf;

         // Look up the entry for a fragment and, if there is one, queue a
         // termlist over it.
         auto queue_entry = [&](const fragment & frag) {
             if (!get_exact_entry(string(frag), data)) return;
             TermList * leaf = new ChertSpellingTermList(data);
             try {
                 pq.push(leaf);
             } catch (...) {
                 // The queue didn't take the pointer (only leaves are queued
                 // at this point and their size comparison can't throw, so a
                 // failure here is the container failing to grow), so the
                 // handler below can't free it for us.
                 delete leaf;
                 throw;
             }
         };

         // Head:
         buf[0] = 'H';
         buf[1] = word[0];
         buf[2] = word[1];
         queue_entry(buf);

         // Tail:
         buf[0] = 'T';
         buf[1] = word[word.size() - 2];
         buf[2] = word[word.size() - 1];
         queue_entry(buf);

         if (word.size() <= 4) {
             // We also generate 'bookends' for two, three, and four character
             // terms so we can handle transposition of the middle two
             // characters of a four character word, substitution or deletion of
             // the middle character of a three character word, or insertion in
             // the middle of a two character word.
             //
             // buf[2] still holds the last character from the tail above.
             buf[0] = 'B';
             buf[1] = word[0];
             buf[3] = '\0';
             queue_entry(buf);
         }
         if (word.size() > 2) {
             // Middles:
             buf[0] = 'M';
             for (size_t start = 0; start <= word.size() - 3; ++start) {
                 memcpy(buf.data + 1, word.data() + start, 3);
                 queue_entry(buf);
             }

             if (word.size() == 3) {
                 // For three letter words, we generate the two "single
                 // transposition" forms too, so that we can produce good
                 // spelling suggestions.  buf currently holds "M" + word.
                 // ABC -> BAC
                 buf[1] = word[1];
                 buf[2] = word[0];
                 queue_entry(buf);
                 // ABC -> ACB
                 buf[1] = word[0];
                 buf[2] = word[2];
                 buf[3] = word[1];
                 queue_entry(buf);
             }
         } else {
             Assert(word.size() == 2);
             // For two letter words, we generate H and T terms for the
             // transposed form so that we can produce good spelling
             // suggestions.
             // AB -> BA
             buf[0] = 'H';
             buf[1] = word[1];
             buf[2] = word[0];
             queue_entry(buf);
             buf[0] = 'T';
             queue_entry(buf);
         }

         if (pq.empty()) return NULL;

         // Build up an OrTermList tree by combining leaves and/or branches in
         // pairs.  The tree is balanced by the approximate sizes of the leaf
         // ChertSpellingTermList objects - the way the tree is built is very
         // similar to how an optimal Huffman code is often constructed.
         //
         // Balancing the tree like this should tend to minimise the amount of
         // work done.
         while (pq.size() > 1) {
             // Build the tree such that left is always >= right so that
             // OrTermList can rely on this when trying to minimise work.
             TermList * right = pq.top();
             pq.pop();

             TermList * merged;
             try {
                 merged = new OrTermList(pq.top(), right);
             } catch (...) {
                 // right has been popped, so the handler below won't see it;
                 // the left operand is still queued and is freed there.
                 delete right;
                 throw;
             }
             pq.pop();
             pq.push(merged);
         }

         return pq.top();
     } catch (...) {
         // Make sure we delete all the TermList objects to avoid leaking
         // memory.
         while (!pq.empty()) {
             delete pq.top();
             pq.pop();
         }
         throw;
     }
 }

 /** Return the frequency of @a word, taking buffered changes into account.
  *
  *  @param word  The word to look up.
  *
  *  @return  The pending frequency if the word has a buffered change,
  *           otherwise the frequency stored under "W" + word, or 0 if there
  *           is no such entry.
  *
  *  @exception Xapian::DatabaseCorruptError  The stored frequency could not
  *             be decoded.
  */
 Xapian::doccount
 ChertSpellingTable::get_word_frequency(const string & word) const
 {
     // A buffered change overrides whatever is stored.
     const auto pending = wordfreq_changes.find(word);
     if (pending != wordfreq_changes.end()) return pending->second;

     string key = "W" + word;
     string stored;
     // No entry means the word isn't known.
     if (!get_exact_entry(key, stored)) return 0;

     Xapian::termcount freq;
     const char *pos = stored.data();
     if (!unpack_uint_last(&pos, pos + stored.size(), &freq)) {
         throw Xapian::DatabaseCorruptError("Bad spelling word freq");
     }
     return freq;
 }


 /** Return a rough size for this termlist: the length of the encoded data.
  *
  *  The value only feeds the ordering used when building an OR-tree of
  *  TermList objects, so it just needs to rank lists roughly correctly.
  */
 Xapian::termcount
 ChertSpellingTermList::get_approx_size() const
 {
     const Xapian::termcount encoded_length = data.size();
     return encoded_length;
 }

 /** Return a copy of the word at the current position. */
 std::string
 ChertSpellingTermList::get_termname() const
 {
     std::string name(current_term);
     return name;
 }

 /** Return the wdf for the current word, which is always reported as 1. */
 Xapian::termcount
 ChertSpellingTermList::get_wdf() const
 {
     const Xapian::termcount fixed_wdf = 1;
     return fixed_wdf;
 }

 /** Return the term frequency for the current word, always reported as 1. */
 Xapian::doccount
 ChertSpellingTermList::get_termfreq() const
 {
     const Xapian::doccount fixed_termfreq = 1;
     return fixed_termfreq;
 }

 /** Advance to the next word in the encoded list.
  *
  *  Each entry after the first starts with a byte giving how many leading
  *  characters of the previous word to keep; every entry then has a byte
  *  giving the number of characters to append, followed by those
  *  characters.  Both bytes are stored XORed with MAGIC_XOR_VALUE.
  *
  *  Once the data is exhausted it is emptied, which is what at_end() tests.
  *
  *  @return  Always NULL.
  *
  *  @exception Xapian::DatabaseCorruptError  The data is truncated, or asks
  *             to keep more characters than the previous word has.
  */
 TermList *
 ChertSpellingTermList::next()
 {
     if (p == data.size()) {
         // Nothing left to decode: flag the end by emptying the data.
         p = 0;
         data.resize(0);
         return NULL;
     }
     if (!current_term.empty()) {
         // Not the first entry, so it begins with the length of the prefix
         // shared with the previous word.  (p < data.size() here thanks to
         // the check above.)
         size_t keep = uint8_t(data[p++]) ^ MAGIC_XOR_VALUE;
         // A shared prefix can't be longer than the word it is shared with;
         // resize() would otherwise silently pad the term with NUL bytes.
         if (keep > current_term.size())
             throw Xapian::DatabaseCorruptError("Bad spelling termlist");
         current_term.resize(keep);
     }
     // There must be a length byte...
     if (p == data.size())
         throw Xapian::DatabaseCorruptError("Bad spelling termlist");
     // ...and that many characters after it.
     size_t add = uint8_t(data[p]) ^ MAGIC_XOR_VALUE;
     if (add >= data.size() - p)
         throw Xapian::DatabaseCorruptError("Bad spelling termlist");
     current_term.append(data.data() + p + 1, add);
     p += add + 1;
     return NULL;
 }

 /** Advance until the current word is no longer less than @a term, or the
  *  list is exhausted.
  *
  *  @param term  The word to skip forward to.
  *
  *  @return  Always NULL.
  */
 TermList *
 ChertSpellingTermList::skip_to(const string & term)
 {
     for (;;) {
         // Empty data means the list has been exhausted.
         if (data.empty()) break;
         // Already at or past the target.
         if (current_term >= term) break;
         (void)ChertSpellingTermList::next();
     }
     return NULL;
 }

 /** Return true once the list is exhausted (next() empties the data then). */
 bool
 ChertSpellingTermList::at_end() const
 {
     const bool exhausted = data.empty();
     return exhausted;
 }

 /** Not supported by this termlist.
  *
  *  @exception Xapian::UnimplementedError  Always thrown.
  */
 Xapian::termcount
 ChertSpellingTermList::positionlist_count() const
 {
     throw Xapian::UnimplementedError(
         "ChertSpellingTermList::positionlist_count() not implemented");
 }

 /** Not supported by this termlist.
  *
  *  @exception Xapian::UnimplementedError  Always thrown.
  */
 Xapian::PositionIterator
 ChertSpellingTermList::positionlist_begin() const
 {
     throw Xapian::UnimplementedError(
         "ChertSpellingTermList::positionlist_begin() not implemented");
 }
ChertSpellingTermList
The list of words containing a particular trigram.
Definition: chert_spelling.h:122

Assert
#define Assert(COND)
Definition: omassert.h:122

ChertSpellingTermList::positionlist_begin
Xapian::PositionIterator positionlist_begin() const
Return a PositionIterator for the current position.
Definition: chert_spelling.cc:466

ChertSpellingTermList::get_wdf
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
Definition: chert_spelling.cc:411

types.h
typedefs for Xapian

fragment::data
char data[4]
Definition: chert_spelling.h:35

AssertRel
#define AssertRel(A, REL, B)
Definition: omassert.h:123

fragment
Definition: chert_spelling.h:34

ortermlist.h
Merge two TermList objects using an OR operation.

TermListGreaterApproxSize
Definition: chert_spelling.cc:247

config.h

Xapian::TermIterator::Internal
Abstract base class for termlists.
Definition: termlist.h:39

std
STL namespace.

ChertSpellingTable::get_word_frequency
Xapian::doccount get_word_frequency(const std::string &word) const
Definition: chert_spelling.cc:370

Xapian::TermIterator::Internal::get_approx_size
virtual Xapian::termcount get_approx_size() const =0
Return approximate size of this termlist.

ChertSpellingTermList::positionlist_count
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
Definition: chert_spelling.cc:460

error.h
Hierarchy of classes which Xapian can throw as exceptions.

Xapian::termcount
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72

PrefixCompressedStringWriter
Definition: prefix_compressed_strings.h:91

ChertSpellingTable::toggle_fragment
void toggle_fragment(fragment frag, const std::string &word)
Definition: chert_spelling.cc:109

ChertSpellingTable::open_termlist
TermList * open_termlist(const std::string &word)
Definition: chert_spelling.cc:254

pack_uint_last
void pack_uint_last(std::string &s, U value)
Append an encoded unsigned integer to a string as the last item.
Definition: pack.h:93

expandweight.h
Collate statistics and calculate the term weights for the ESet.

ChertSpellingTermList::next
TermList * next()
Advance the current position to the next term in the termlist.
Definition: chert_spelling.cc:423

ChertSpellingTermList::get_approx_size
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Definition: chert_spelling.cc:397

ChertSpellingTable::merge_changes
void merge_changes()
Definition: chert_spelling.cc:44

ChertSpellingTermList::get_termname
std::string get_termname() const
Return the termname at the current position.
Definition: chert_spelling.cc:405

Xapian::PositionIterator
Class for iterating over term positions.
Definition: positioniterator.h:40

ChertSpellingTable::toggle_word
void toggle_word(const std::string &word)
Definition: chert_spelling.cc:204

ChertSpellingTermList::at_end
bool at_end() const
Return true if the current position is past the last term in this list.
Definition: chert_spelling.cc:454

PrefixCompressedStringItor::at_end
bool at_end() const
Definition: prefix_compressed_strings.h:86

Xapian::DatabaseCorruptError
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:409

OrTermList
Definition: ortermlist.h:32

chert_spelling.h
Spelling correction data for a chert database.

Xapian::doccount
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38

MAGIC_XOR_VALUE
#define MAGIC_XOR_VALUE
Definition: byte_length_strings.h:33

ChertSpellingTermList::skip_to
TermList * skip_to(const std::string &term)
Skip forward to the specified term.
Definition: chert_spelling.cc:445

ChertSpellingTermList::get_termfreq
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Definition: chert_spelling.cc:417

pack.h
Pack types into strings and unpack them again.

unpack_uint_last
bool unpack_uint_last(const char **p, const char *end, U *result)
Decode an unsigned integer as the last item in a string.
Definition: pack.h:111

PrefixCompressedStringItor
Definition: prefix_compressed_strings.h:36

PrefixCompressedStringWriter::append
void append(const std::string &word)
Definition: prefix_compressed_strings.h:98

omassert.h
Various assertion macros.

ChertSpellingTable::add_word
void add_word(const std::string &word, Xapian::termcount freqinc)
Definition: chert_spelling.cc:125

TermListGreaterApproxSize::operator()
bool operator()(const TermList *a, const TermList *b) const
Definition: chert_spelling.cc:248

ChertSpellingTable::remove_word
void remove_word(const std::string &word, Xapian::termcount freqdec)
Definition: chert_spelling.cc:160

Xapian::UnimplementedError
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:325