00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include <xapian/error.h>
00024 #include <xapian/types.h>
00025
00026 #include "expandweight.h"
00027 #include "flint_spelling.h"
00028 #include "flint_utils.h"
00029 #include "omassert.h"
00030 #include "ortermlist.h"
00031
00032 #include "../prefix_compressed_strings.h"
00033
00034 #include <algorithm>
00035 #include <map>
00036 #include <queue>
00037 #include <vector>
00038 #include <set>
00039 #include <string>
00040
00041 using namespace std;
00042
00043 void
00044 FlintSpellingTable::merge_changes()
00045 {
00046 map<F_fragment, set<string> >::const_iterator i;
00047 for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
00048 string key = i->first;
00049 const set<string> & changes = i->second;
00050
00051 set<string>::const_iterator d = changes.begin();
00052 if (d == changes.end()) continue;
00053
00054 string updated;
00055 string current;
00056 PrefixCompressedStringWriter out(updated);
00057 if (get_exact_entry(key, current)) {
00058 PrefixCompressedStringItor in(current);
00059 updated.reserve(current.size());
00060 while (!in.at_end() && d != changes.end()) {
00061 const string & word = *in;
00062 Assert(d != changes.end());
00063 int cmp = word.compare(*d);
00064 if (cmp < 0) {
00065 out.append(word);
00066 ++in;
00067 } else if (cmp > 0) {
00068 out.append(*d);
00069 ++d;
00070 } else {
00071
00072
00073 ++in;
00074 ++d;
00075 }
00076 }
00077 if (!in.at_end()) {
00078
00079 while (!in.at_end()) {
00080 out.append(*in++);
00081 }
00082 }
00083 }
00084 while (d != changes.end()) {
00085 out.append(*d++);
00086 }
00087 if (!updated.empty()) {
00088 add(key, updated);
00089 } else {
00090 del(key);
00091 }
00092 }
00093 termlist_deltas.clear();
00094
00095 map<string, Xapian::termcount>::const_iterator j;
00096 for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
00097 string key = "W" + j->first;
00098 if (j->second) {
00099 add(key, F_pack_uint_last(j->second));
00100 } else {
00101 del(key);
00102 }
00103 }
00104 wordfreq_changes.clear();
00105 }
00106
00107 void
00108 FlintSpellingTable::toggle_fragment(F_fragment frag, const string & word)
00109 {
00110 map<F_fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00111 if (i == termlist_deltas.end()) {
00112 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
00113 }
00114
00115
00116 pair<set<string>::iterator, bool> res = i->second.insert(word);
00117 if (!res.second) {
00118
00119 i->second.erase(res.first);
00120 }
00121 }
00122
00123 void
00124 FlintSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
00125 {
00126 if (word.size() <= 1) return;
00127
00128 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00129 if (i != wordfreq_changes.end()) {
00130
00131 if (i->second) {
00132 i->second += freqinc;
00133 return;
00134 }
00135
00136
00137 i->second = freqinc;
00138 } else {
00139 string key = "W" + word;
00140 string data;
00141 if (get_exact_entry(key, data)) {
00142
00143 Xapian::termcount freq;
00144 const char * p = data.data();
00145 if (!F_unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
00146 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00147 }
00148 wordfreq_changes[word] = freq + freqinc;
00149 return;
00150 }
00151 wordfreq_changes[word] = freqinc;
00152 }
00153
00154
00155 toggle_word(word);
00156 }
00157
00158 void
00159 FlintSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
00160 {
00161 if (word.size() <= 1) return;
00162
00163 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00164 if (i != wordfreq_changes.end()) {
00165 if (i->second == 0) {
00166
00167 return;
00168 }
00169
00170 if (freqdec < i->second) {
00171 i->second -= freqdec;
00172 return;
00173 }
00174
00175
00176 i->second = 0;
00177 } else {
00178 string key = "W" + word;
00179 string data;
00180 if (!get_exact_entry(key, data)) {
00181
00182 return;
00183 }
00184
00185 Xapian::termcount freq;
00186 const char *p = data.data();
00187 if (!F_unpack_uint_last(&p, p + data.size(), &freq)) {
00188 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00189 }
00190 if (freqdec < freq) {
00191 wordfreq_changes[word] = freq - freqdec;
00192 return;
00193 }
00194
00195 wordfreq_changes[word] = 0;
00196 }
00197
00198
00199 toggle_word(word);
00200 }
00201
00202 void
00203 FlintSpellingTable::toggle_word(const string & word)
00204 {
00205 F_fragment buf;
00206
00207 buf[0] = 'H';
00208 buf[1] = word[0];
00209 buf[2] = word[1];
00210 buf[3] = '\0';
00211 toggle_fragment(buf, word);
00212
00213
00214 buf[0] = 'T';
00215 buf[1] = word[word.size() - 2];
00216 buf[2] = word[word.size() - 1];
00217 buf[3] = '\0';
00218 toggle_fragment(buf, word);
00219
00220 if (word.size() <= 4) {
00221
00222
00223
00224
00225
00226
00227 buf[0] = 'B';
00228 buf[1] = word[0];
00229 buf[3] = '\0';
00230 toggle_fragment(buf, word);
00231 }
00232 if (word.size() > 2) {
00233 set<F_fragment> done;
00234
00235 buf[0] = 'M';
00236 for (size_t start = 0; start <= word.size() - 3; ++start) {
00237 memcpy(buf.data + 1, word.data() + start, 3);
00238
00239
00240 if (done.insert(buf).second)
00241 toggle_fragment(buf, word);
00242 }
00243 }
00244 }
00245
00246 struct TermListGreaterApproxSize {
00247 bool operator()(const TermList *a, const TermList *b) {
00248 return a->get_approx_size() > b->get_approx_size();
00249 }
00250 };
00251
00252 TermList *
00253 FlintSpellingTable::open_termlist(const string & word)
00254 {
00255
00256 AssertRel(word.size(),>,1);
00257
00258
00259
00260 if (!wordfreq_changes.empty()) merge_changes();
00261
00262
00263
00264 priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
00265 try {
00266 string data;
00267 F_fragment buf;
00268
00269
00270 buf[0] = 'H';
00271 buf[1] = word[0];
00272 buf[2] = word[1];
00273 if (get_exact_entry(string(buf), data))
00274 pq.push(new FlintSpellingTermList(data));
00275
00276
00277 buf[0] = 'T';
00278 buf[1] = word[word.size() - 2];
00279 buf[2] = word[word.size() - 1];
00280 if (get_exact_entry(string(buf), data))
00281 pq.push(new FlintSpellingTermList(data));
00282
00283 if (word.size() <= 4) {
00284
00285
00286
00287
00288
00289 buf[0] = 'B';
00290 buf[1] = word[0];
00291 buf[3] = '\0';
00292 if (get_exact_entry(string(buf), data))
00293 pq.push(new FlintSpellingTermList(data));
00294 }
00295 if (word.size() > 2) {
00296
00297 buf[0] = 'M';
00298 for (size_t start = 0; start <= word.size() - 3; ++start) {
00299 memcpy(buf.data + 1, word.data() + start, 3);
00300 if (get_exact_entry(string(buf), data))
00301 pq.push(new FlintSpellingTermList(data));
00302 }
00303
00304 if (word.size() == 3) {
00305
00306
00307
00308
00309 buf[1] = word[1];
00310 buf[2] = word[0];
00311 if (get_exact_entry(string(buf), data))
00312 pq.push(new FlintSpellingTermList(data));
00313
00314 buf[1] = word[0];
00315 buf[2] = word[2];
00316 buf[3] = word[1];
00317 if (get_exact_entry(string(buf), data))
00318 pq.push(new FlintSpellingTermList(data));
00319 }
00320 } else {
00321 Assert(word.size() == 2);
00322
00323
00324
00325
00326 buf[0] = 'H';
00327 buf[1] = word[1];
00328 buf[2] = word[0];
00329 if (get_exact_entry(string(buf), data))
00330 pq.push(new FlintSpellingTermList(data));
00331 buf[0] = 'T';
00332 if (get_exact_entry(string(buf), data))
00333 pq.push(new FlintSpellingTermList(data));
00334 }
00335
00336 if (pq.empty()) return NULL;
00337
00338
00339
00340
00341
00342
00343
00344
00345 while (pq.size() > 1) {
00346
00347
00348 TermList * termlist = pq.top();
00349 pq.pop();
00350
00351 termlist = new OrTermList(pq.top(), termlist);
00352 pq.pop();
00353 pq.push(termlist);
00354 }
00355
00356 return pq.top();
00357 } catch (...) {
00358
00359
00360 while (!pq.empty()) {
00361 delete pq.top();
00362 pq.pop();
00363 }
00364 throw;
00365 }
00366 }
00367
00368 Xapian::doccount
00369 FlintSpellingTable::get_word_frequency(const string & word) const
00370 {
00371 map<string, Xapian::termcount>::const_iterator i;
00372 i = wordfreq_changes.find(word);
00373 if (i != wordfreq_changes.end()) {
00374
00375 return i->second;
00376 }
00377
00378 string key = "W" + word;
00379 string data;
00380 if (get_exact_entry(key, data)) {
00381
00382 Xapian::termcount freq;
00383 const char *p = data.data();
00384 if (!F_unpack_uint_last(&p, p + data.size(), &freq)) {
00385 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00386 }
00387 return freq;
00388 }
00389
00390 return 0;
00391 }
00392
00394
00395 Xapian::termcount
00396 FlintSpellingTermList::get_approx_size() const
00397 {
00398
00399
00400 return data.size();
00401 }
00402
00403 std::string
00404 FlintSpellingTermList::get_termname() const
00405 {
00406 return current_term;
00407 }
00408
00409 Xapian::termcount
00410 FlintSpellingTermList::get_wdf() const
00411 {
00412 return 1;
00413 }
00414
00415 Xapian::doccount
00416 FlintSpellingTermList::get_termfreq() const
00417 {
00418 return 1;
00419 }
00420
00421 Xapian::termcount
00422 FlintSpellingTermList::get_collection_freq() const
00423 {
00424 return 1;
00425 }
00426
00427 TermList *
00428 FlintSpellingTermList::next()
00429 {
00430 if (p == data.size()) {
00431 p = 0;
00432 data.resize(0);
00433 return NULL;
00434 }
00435 if (!current_term.empty()) {
00436 if (p == data.size())
00437 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00438 current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
00439 }
00440 unsigned add;
00441 if (p == data.size() ||
00442 (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
00443 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00444 current_term.append(data.data() + p + 1, add);
00445 p += add + 1;
00446 return NULL;
00447 }
00448
00449 TermList *
00450 FlintSpellingTermList::skip_to(const string & term)
00451 {
00452 while (!data.empty() && current_term < term) {
00453 (void)FlintSpellingTermList::next();
00454 }
00455 return NULL;
00456 }
00457
00458 bool
00459 FlintSpellingTermList::at_end() const
00460 {
00461 return data.empty();
00462 }
00463
00464 Xapian::termcount
00465 FlintSpellingTermList::positionlist_count() const
00466 {
00467 throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_count() not implemented");
00468 }
00469
00470 Xapian::PositionIterator
00471 FlintSpellingTermList::positionlist_begin() const
00472 {
00473 throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_begin() not implemented");
00474 }