00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include <xapian/error.h>
00024 #include <xapian/types.h>
00025
00026 #include "expandweight.h"
00027 #include "chert_spelling.h"
00028 #include "omassert.h"
00029 #include "ortermlist.h"
00030 #include "pack.h"
00031
00032 #include "../prefix_compressed_strings.h"
00033
00034 #include <algorithm>
00035 #include <map>
00036 #include <queue>
00037 #include <vector>
00038 #include <set>
00039 #include <string>
00040
00041 using namespace std;
00042
00043 void
00044 ChertSpellingTable::merge_changes()
00045 {
00046 map<fragment, set<string> >::const_iterator i;
00047 for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
00048 string key = i->first;
00049 const set<string> & changes = i->second;
00050
00051 set<string>::const_iterator d = changes.begin();
00052 if (d == changes.end()) continue;
00053
00054 string updated;
00055 string current;
00056 PrefixCompressedStringWriter out(updated);
00057 if (get_exact_entry(key, current)) {
00058 PrefixCompressedStringItor in(current);
00059 updated.reserve(current.size());
00060 while (!in.at_end() && d != changes.end()) {
00061 const string & word = *in;
00062 Assert(d != changes.end());
00063 int cmp = word.compare(*d);
00064 if (cmp < 0) {
00065 out.append(word);
00066 ++in;
00067 } else if (cmp > 0) {
00068 out.append(*d);
00069 ++d;
00070 } else {
00071
00072
00073 ++in;
00074 ++d;
00075 }
00076 }
00077 if (!in.at_end()) {
00078
00079 while (!in.at_end()) {
00080 out.append(*in++);
00081 }
00082 }
00083 }
00084 while (d != changes.end()) {
00085 out.append(*d++);
00086 }
00087 if (!updated.empty()) {
00088 add(key, updated);
00089 } else {
00090 del(key);
00091 }
00092 }
00093 termlist_deltas.clear();
00094
00095 map<string, Xapian::termcount>::const_iterator j;
00096 for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
00097 string key = "W" + j->first;
00098 if (j->second) {
00099 string tag;
00100 pack_uint_last(tag, j->second);
00101 add(key, tag);
00102 } else {
00103 del(key);
00104 }
00105 }
00106 wordfreq_changes.clear();
00107 }
00108
00109 void
00110 ChertSpellingTable::toggle_fragment(fragment frag, const string & word)
00111 {
00112 map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00113 if (i == termlist_deltas.end()) {
00114 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
00115 }
00116
00117
00118 pair<set<string>::iterator, bool> res = i->second.insert(word);
00119 if (!res.second) {
00120
00121 i->second.erase(res.first);
00122 }
00123 }
00124
00125 void
00126 ChertSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
00127 {
00128 if (word.size() <= 1) return;
00129
00130 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00131 if (i != wordfreq_changes.end()) {
00132
00133 if (i->second) {
00134 i->second += freqinc;
00135 return;
00136 }
00137
00138
00139 i->second = freqinc;
00140 } else {
00141 string key = "W" + word;
00142 string data;
00143 if (get_exact_entry(key, data)) {
00144
00145 Xapian::termcount freq;
00146 const char * p = data.data();
00147 if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
00148 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00149 }
00150 wordfreq_changes[word] = freq + freqinc;
00151 return;
00152 }
00153 wordfreq_changes[word] = freqinc;
00154 }
00155
00156
00157 toggle_word(word);
00158 }
00159
00160 void
00161 ChertSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
00162 {
00163 if (word.size() <= 1) return;
00164
00165 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00166 if (i != wordfreq_changes.end()) {
00167 if (i->second == 0) {
00168
00169 return;
00170 }
00171
00172 if (freqdec < i->second) {
00173 i->second -= freqdec;
00174 return;
00175 }
00176
00177
00178 i->second = 0;
00179 } else {
00180 string key = "W" + word;
00181 string data;
00182 if (!get_exact_entry(key, data)) {
00183
00184 return;
00185 }
00186
00187 Xapian::termcount freq;
00188 const char *p = data.data();
00189 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00190 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00191 }
00192 if (freqdec < freq) {
00193 wordfreq_changes[word] = freq - freqdec;
00194 return;
00195 }
00196
00197 wordfreq_changes[word] = 0;
00198 }
00199
00200
00201 toggle_word(word);
00202 }
00203
00204 void
00205 ChertSpellingTable::toggle_word(const string & word)
00206 {
00207 fragment buf;
00208
00209 buf[0] = 'H';
00210 buf[1] = word[0];
00211 buf[2] = word[1];
00212 buf[3] = '\0';
00213 toggle_fragment(buf, word);
00214
00215
00216 buf[0] = 'T';
00217 buf[1] = word[word.size() - 2];
00218 buf[2] = word[word.size() - 1];
00219 buf[3] = '\0';
00220 toggle_fragment(buf, word);
00221
00222 if (word.size() <= 4) {
00223
00224
00225
00226
00227
00228
00229 buf[0] = 'B';
00230 buf[1] = word[0];
00231 buf[3] = '\0';
00232 toggle_fragment(buf, word);
00233 }
00234 if (word.size() > 2) {
00235 set<fragment> done;
00236
00237 buf[0] = 'M';
00238 for (size_t start = 0; start <= word.size() - 3; ++start) {
00239 memcpy(buf.data + 1, word.data() + start, 3);
00240
00241
00242 if (done.insert(buf).second)
00243 toggle_fragment(buf, word);
00244 }
00245 }
00246 }
00247
00248 struct TermListGreaterApproxSize {
00249 bool operator()(const TermList *a, const TermList *b) {
00250 return a->get_approx_size() > b->get_approx_size();
00251 }
00252 };
00253
00254 TermList *
00255 ChertSpellingTable::open_termlist(const string & word)
00256 {
00257
00258 AssertRel(word.size(),>,1);
00259
00260
00261
00262 if (!wordfreq_changes.empty()) merge_changes();
00263
00264
00265
00266 priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
00267 try {
00268 string data;
00269 fragment buf;
00270
00271
00272 buf[0] = 'H';
00273 buf[1] = word[0];
00274 buf[2] = word[1];
00275 if (get_exact_entry(string(buf), data))
00276 pq.push(new ChertSpellingTermList(data));
00277
00278
00279 buf[0] = 'T';
00280 buf[1] = word[word.size() - 2];
00281 buf[2] = word[word.size() - 1];
00282 if (get_exact_entry(string(buf), data))
00283 pq.push(new ChertSpellingTermList(data));
00284
00285 if (word.size() <= 4) {
00286
00287
00288
00289
00290
00291 buf[0] = 'B';
00292 buf[1] = word[0];
00293 buf[3] = '\0';
00294 if (get_exact_entry(string(buf), data))
00295 pq.push(new ChertSpellingTermList(data));
00296 }
00297 if (word.size() > 2) {
00298
00299 buf[0] = 'M';
00300 for (size_t start = 0; start <= word.size() - 3; ++start) {
00301 memcpy(buf.data + 1, word.data() + start, 3);
00302 if (get_exact_entry(string(buf), data))
00303 pq.push(new ChertSpellingTermList(data));
00304 }
00305
00306 if (word.size() == 3) {
00307
00308
00309
00310
00311 buf[1] = word[1];
00312 buf[2] = word[0];
00313 if (get_exact_entry(string(buf), data))
00314 pq.push(new ChertSpellingTermList(data));
00315
00316 buf[1] = word[0];
00317 buf[2] = word[2];
00318 buf[3] = word[1];
00319 if (get_exact_entry(string(buf), data))
00320 pq.push(new ChertSpellingTermList(data));
00321 }
00322 } else {
00323 Assert(word.size() == 2);
00324
00325
00326
00327
00328 buf[0] = 'H';
00329 buf[1] = word[1];
00330 buf[2] = word[0];
00331 if (get_exact_entry(string(buf), data))
00332 pq.push(new ChertSpellingTermList(data));
00333 buf[0] = 'T';
00334 if (get_exact_entry(string(buf), data))
00335 pq.push(new ChertSpellingTermList(data));
00336 }
00337
00338 if (pq.empty()) return NULL;
00339
00340
00341
00342
00343
00344
00345
00346
00347 while (pq.size() > 1) {
00348
00349
00350 TermList * termlist = pq.top();
00351 pq.pop();
00352
00353 termlist = new OrTermList(pq.top(), termlist);
00354 pq.pop();
00355 pq.push(termlist);
00356 }
00357
00358 return pq.top();
00359 } catch (...) {
00360
00361
00362 while (!pq.empty()) {
00363 delete pq.top();
00364 pq.pop();
00365 }
00366 throw;
00367 }
00368 }
00369
00370 Xapian::doccount
00371 ChertSpellingTable::get_word_frequency(const string & word) const
00372 {
00373 map<string, Xapian::termcount>::const_iterator i;
00374 i = wordfreq_changes.find(word);
00375 if (i != wordfreq_changes.end()) {
00376
00377 return i->second;
00378 }
00379
00380 string key = "W" + word;
00381 string data;
00382 if (get_exact_entry(key, data)) {
00383
00384 Xapian::termcount freq;
00385 const char *p = data.data();
00386 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00387 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00388 }
00389 return freq;
00390 }
00391
00392 return 0;
00393 }
00394
00396
00397 Xapian::termcount
00398 ChertSpellingTermList::get_approx_size() const
00399 {
00400
00401
00402 return data.size();
00403 }
00404
00405 std::string
00406 ChertSpellingTermList::get_termname() const
00407 {
00408 return current_term;
00409 }
00410
00411 Xapian::termcount
00412 ChertSpellingTermList::get_wdf() const
00413 {
00414 return 1;
00415 }
00416
00417 Xapian::doccount
00418 ChertSpellingTermList::get_termfreq() const
00419 {
00420 return 1;
00421 }
00422
00423 Xapian::termcount
00424 ChertSpellingTermList::get_collection_freq() const
00425 {
00426 return 1;
00427 }
00428
00429 TermList *
00430 ChertSpellingTermList::next()
00431 {
00432 if (p == data.size()) {
00433 p = 0;
00434 data.resize(0);
00435 return NULL;
00436 }
00437 if (!current_term.empty()) {
00438 if (p == data.size())
00439 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00440 current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
00441 }
00442 size_t add;
00443 if (p == data.size() ||
00444 (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
00445 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00446 current_term.append(data.data() + p + 1, add);
00447 p += add + 1;
00448 return NULL;
00449 }
00450
00451 TermList *
00452 ChertSpellingTermList::skip_to(const string & term)
00453 {
00454 while (!data.empty() && current_term < term) {
00455 (void)ChertSpellingTermList::next();
00456 }
00457 return NULL;
00458 }
00459
00460 bool
00461 ChertSpellingTermList::at_end() const
00462 {
00463 return data.empty();
00464 }
00465
00466 Xapian::termcount
00467 ChertSpellingTermList::positionlist_count() const
00468 {
00469 throw Xapian::UnimplementedError("ChertSpellingTermList::positionlist_count() not implemented");
00470 }
00471
00472 Xapian::PositionIterator
00473 ChertSpellingTermList::positionlist_begin() const
00474 {
00475 throw Xapian::UnimplementedError("ChertSpellingTermList::positionlist_begin() not implemented");
00476 }