00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include <xapian/error.h>
00024 #include <xapian/types.h>
00025
00026 #include "expandweight.h"
00027 #include "brass_spelling.h"
00028 #include "omassert.h"
00029 #include "ortermlist.h"
00030 #include "pack.h"
00031
00032 #include "../prefix_compressed_strings.h"
00033
00034 #include <algorithm>
00035 #include <map>
00036 #include <queue>
00037 #include <vector>
00038 #include <set>
00039 #include <string>
00040
00041 using namespace Brass;
00042 using namespace std;
00043
00044 void
00045 BrassSpellingTable::merge_changes()
00046 {
00047 map<fragment, set<string> >::const_iterator i;
00048 for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
00049 string key = i->first;
00050 const set<string> & changes = i->second;
00051
00052 set<string>::const_iterator d = changes.begin();
00053 if (d == changes.end()) continue;
00054
00055 string updated;
00056 string current;
00057 PrefixCompressedStringWriter out(updated);
00058 if (get_exact_entry(key, current)) {
00059 PrefixCompressedStringItor in(current);
00060 updated.reserve(current.size());
00061 while (!in.at_end() && d != changes.end()) {
00062 const string & word = *in;
00063 Assert(d != changes.end());
00064 int cmp = word.compare(*d);
00065 if (cmp < 0) {
00066 out.append(word);
00067 ++in;
00068 } else if (cmp > 0) {
00069 out.append(*d);
00070 ++d;
00071 } else {
00072
00073
00074 ++in;
00075 ++d;
00076 }
00077 }
00078 if (!in.at_end()) {
00079
00080 while (!in.at_end()) {
00081 out.append(*in++);
00082 }
00083 }
00084 }
00085 while (d != changes.end()) {
00086 out.append(*d++);
00087 }
00088 if (!updated.empty()) {
00089 add(key, updated);
00090 } else {
00091 del(key);
00092 }
00093 }
00094 termlist_deltas.clear();
00095
00096 map<string, Xapian::termcount>::const_iterator j;
00097 for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
00098 string key = "W" + j->first;
00099 if (j->second) {
00100 string tag;
00101 pack_uint_last(tag, j->second);
00102 add(key, tag);
00103 } else {
00104 del(key);
00105 }
00106 }
00107 wordfreq_changes.clear();
00108 }
00109
00110 void
00111 BrassSpellingTable::toggle_fragment(fragment frag, const string & word)
00112 {
00113 map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00114 if (i == termlist_deltas.end()) {
00115 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
00116 }
00117
00118
00119 pair<set<string>::iterator, bool> res = i->second.insert(word);
00120 if (!res.second) {
00121
00122 i->second.erase(res.first);
00123 }
00124 }
00125
00126 void
00127 BrassSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
00128 {
00129 if (word.size() <= 1) return;
00130
00131 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00132 if (i != wordfreq_changes.end()) {
00133
00134 if (i->second) {
00135 i->second += freqinc;
00136 return;
00137 }
00138
00139
00140 i->second = freqinc;
00141 } else {
00142 string key = "W" + word;
00143 string data;
00144 if (get_exact_entry(key, data)) {
00145
00146 Xapian::termcount freq;
00147 const char * p = data.data();
00148 if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
00149 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00150 }
00151 wordfreq_changes[word] = freq + freqinc;
00152 return;
00153 }
00154 wordfreq_changes[word] = freqinc;
00155 }
00156
00157
00158 toggle_word(word);
00159 }
00160
00161 void
00162 BrassSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
00163 {
00164 if (word.size() <= 1) return;
00165
00166 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00167 if (i != wordfreq_changes.end()) {
00168 if (i->second == 0) {
00169
00170 return;
00171 }
00172
00173 if (freqdec < i->second) {
00174 i->second -= freqdec;
00175 return;
00176 }
00177
00178
00179 i->second = 0;
00180 } else {
00181 string key = "W" + word;
00182 string data;
00183 if (!get_exact_entry(key, data)) {
00184
00185 return;
00186 }
00187
00188 Xapian::termcount freq;
00189 const char *p = data.data();
00190 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00191 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00192 }
00193 if (freqdec < freq) {
00194 wordfreq_changes[word] = freq - freqdec;
00195 return;
00196 }
00197
00198 wordfreq_changes[word] = 0;
00199 }
00200
00201
00202 toggle_word(word);
00203 }
00204
00205 void
00206 BrassSpellingTable::toggle_word(const string & word)
00207 {
00208 fragment buf;
00209
00210 buf[0] = 'H';
00211 buf[1] = word[0];
00212 buf[2] = word[1];
00213 buf[3] = '\0';
00214 toggle_fragment(buf, word);
00215
00216
00217 buf[0] = 'T';
00218 buf[1] = word[word.size() - 2];
00219 buf[2] = word[word.size() - 1];
00220 buf[3] = '\0';
00221 toggle_fragment(buf, word);
00222
00223 if (word.size() <= 4) {
00224
00225
00226
00227
00228
00229
00230 buf[0] = 'B';
00231 buf[1] = word[0];
00232 buf[3] = '\0';
00233 toggle_fragment(buf, word);
00234 }
00235 if (word.size() > 2) {
00236 set<fragment> done;
00237
00238 buf[0] = 'M';
00239 for (size_t start = 0; start <= word.size() - 3; ++start) {
00240 memcpy(buf.data + 1, word.data() + start, 3);
00241
00242
00243 if (done.insert(buf).second)
00244 toggle_fragment(buf, word);
00245 }
00246 }
00247 }
00248
00249 struct TermListGreaterApproxSize {
00250 bool operator()(const TermList *a, const TermList *b) {
00251 return a->get_approx_size() > b->get_approx_size();
00252 }
00253 };
00254
00255 TermList *
00256 BrassSpellingTable::open_termlist(const string & word)
00257 {
00258
00259 AssertRel(word.size(),>,1);
00260
00261
00262
00263 if (!wordfreq_changes.empty()) merge_changes();
00264
00265
00266
00267 priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
00268 try {
00269 string data;
00270 fragment buf;
00271
00272
00273 buf[0] = 'H';
00274 buf[1] = word[0];
00275 buf[2] = word[1];
00276 if (get_exact_entry(string(buf), data))
00277 pq.push(new BrassSpellingTermList(data));
00278
00279
00280 buf[0] = 'T';
00281 buf[1] = word[word.size() - 2];
00282 buf[2] = word[word.size() - 1];
00283 if (get_exact_entry(string(buf), data))
00284 pq.push(new BrassSpellingTermList(data));
00285
00286 if (word.size() <= 4) {
00287
00288
00289
00290
00291
00292 buf[0] = 'B';
00293 buf[1] = word[0];
00294 buf[3] = '\0';
00295 if (get_exact_entry(string(buf), data))
00296 pq.push(new BrassSpellingTermList(data));
00297 }
00298 if (word.size() > 2) {
00299
00300 buf[0] = 'M';
00301 for (size_t start = 0; start <= word.size() - 3; ++start) {
00302 memcpy(buf.data + 1, word.data() + start, 3);
00303 if (get_exact_entry(string(buf), data))
00304 pq.push(new BrassSpellingTermList(data));
00305 }
00306
00307 if (word.size() == 3) {
00308
00309
00310
00311
00312 buf[1] = word[1];
00313 buf[2] = word[0];
00314 if (get_exact_entry(string(buf), data))
00315 pq.push(new BrassSpellingTermList(data));
00316
00317 buf[1] = word[0];
00318 buf[2] = word[2];
00319 buf[3] = word[1];
00320 if (get_exact_entry(string(buf), data))
00321 pq.push(new BrassSpellingTermList(data));
00322 }
00323 } else {
00324 Assert(word.size() == 2);
00325
00326
00327
00328
00329 buf[0] = 'H';
00330 buf[1] = word[1];
00331 buf[2] = word[0];
00332 if (get_exact_entry(string(buf), data))
00333 pq.push(new BrassSpellingTermList(data));
00334 buf[0] = 'T';
00335 if (get_exact_entry(string(buf), data))
00336 pq.push(new BrassSpellingTermList(data));
00337 }
00338
00339 if (pq.empty()) return NULL;
00340
00341
00342
00343
00344
00345
00346
00347
00348 while (pq.size() > 1) {
00349
00350
00351 TermList * termlist = pq.top();
00352 pq.pop();
00353
00354 termlist = new OrTermList(pq.top(), termlist);
00355 pq.pop();
00356 pq.push(termlist);
00357 }
00358
00359 return pq.top();
00360 } catch (...) {
00361
00362
00363 while (!pq.empty()) {
00364 delete pq.top();
00365 pq.pop();
00366 }
00367 throw;
00368 }
00369 }
00370
00371 Xapian::doccount
00372 BrassSpellingTable::get_word_frequency(const string & word) const
00373 {
00374 map<string, Xapian::termcount>::const_iterator i;
00375 i = wordfreq_changes.find(word);
00376 if (i != wordfreq_changes.end()) {
00377
00378 return i->second;
00379 }
00380
00381 string key = "W" + word;
00382 string data;
00383 if (get_exact_entry(key, data)) {
00384
00385 Xapian::termcount freq;
00386 const char *p = data.data();
00387 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00388 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00389 }
00390 return freq;
00391 }
00392
00393 return 0;
00394 }
00395
00397
00398 Xapian::termcount
00399 BrassSpellingTermList::get_approx_size() const
00400 {
00401
00402
00403 return data.size();
00404 }
00405
00406 std::string
00407 BrassSpellingTermList::get_termname() const
00408 {
00409 return current_term;
00410 }
00411
00412 Xapian::termcount
00413 BrassSpellingTermList::get_wdf() const
00414 {
00415 return 1;
00416 }
00417
00418 Xapian::doccount
00419 BrassSpellingTermList::get_termfreq() const
00420 {
00421 return 1;
00422 }
00423
00424 Xapian::termcount
00425 BrassSpellingTermList::get_collection_freq() const
00426 {
00427 return 1;
00428 }
00429
00430 TermList *
00431 BrassSpellingTermList::next()
00432 {
00433 if (p == data.size()) {
00434 p = 0;
00435 data.resize(0);
00436 return NULL;
00437 }
00438 if (!current_term.empty()) {
00439 if (p == data.size())
00440 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00441 current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
00442 }
00443 size_t add;
00444 if (p == data.size() ||
00445 (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
00446 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00447 current_term.append(data.data() + p + 1, add);
00448 p += add + 1;
00449 return NULL;
00450 }
00451
00452 TermList *
00453 BrassSpellingTermList::skip_to(const string & term)
00454 {
00455 while (!data.empty() && current_term < term) {
00456 (void)BrassSpellingTermList::next();
00457 }
00458 return NULL;
00459 }
00460
00461 bool
00462 BrassSpellingTermList::at_end() const
00463 {
00464 return data.empty();
00465 }
00466
00467 Xapian::termcount
00468 BrassSpellingTermList::positionlist_count() const
00469 {
00470 throw Xapian::UnimplementedError("BrassSpellingTermList::positionlist_count() not implemented");
00471 }
00472
00473 Xapian::PositionIterator
00474 BrassSpellingTermList::positionlist_begin() const
00475 {
00476 throw Xapian::UnimplementedError("BrassSpellingTermList::positionlist_begin() not implemented");
00477 }