00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include <config.h>
00025
00026 #include "omdebug.h"
00027
00028 #include "emptypostlist.h"
00029 #include "expandweight.h"
00030 #include "inmemory_database.h"
00031 #include "inmemory_document.h"
00032 #include "inmemory_alltermslist.h"
00033 #include "utils.h"
00034
00035 #include <string>
00036 #include <vector>
00037 #include <map>
00038
00039 #include <xapian/error.h>
00040 #include <xapian/valueiterator.h>
00041
00042 using std::make_pair;
00043
00044 inline void
00045 InMemoryTerm::add_posting(const InMemoryPosting & post)
00046 {
00047
00048 vector<InMemoryPosting>::iterator p;
00049 p = lower_bound(docs.begin(), docs.end(),
00050 post, InMemoryPostingLessThan());
00051 if (p == docs.end() || InMemoryPostingLessThan()(post, *p)) {
00052 docs.insert(p, post);
00053 } else if (!p->valid) {
00054 *p = post;
00055 } else {
00056 (*p).merge(post);
00057 }
00058 }
00059
00060 inline void
00061 InMemoryDoc::add_posting(const InMemoryTermEntry & post)
00062 {
00063
00064 vector<InMemoryTermEntry>::iterator p;
00065 p = lower_bound(terms.begin(), terms.end(),
00066 post, InMemoryTermEntryLessThan());
00067 if (p == terms.end() || InMemoryTermEntryLessThan()(post, *p)) {
00068 terms.insert(p, post);
00069 } else {
00070 (*p).merge(post);
00071 }
00072 }
00073
00075
00077
00078 InMemoryPostList::InMemoryPostList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db_,
00079 const InMemoryTerm & imterm)
00080 : pos(imterm.docs.begin()),
00081 end(imterm.docs.end()),
00082 termfreq(imterm.term_freq),
00083 started(false),
00084 db(db_)
00085 {
00086
00087 Assert(pos != end);
00088 while (pos != end && !pos->valid) ++pos;
00089 }
00090
00091 Xapian::doccount
00092 InMemoryPostList::get_termfreq() const
00093 {
00094 return termfreq;
00095 }
00096
00097 Xapian::docid
00098 InMemoryPostList::get_docid() const
00099 {
00100
00101 Assert(started);
00102 Assert(!at_end());
00103
00104 return (*pos).did;
00105 }
00106
00107 PostList *
00108 InMemoryPostList::next(Xapian::weight )
00109 {
00110 if (started) {
00111 Assert(!at_end());
00112 ++pos;
00113 while (pos != end && !pos->valid) ++pos;
00114 } else {
00115 started = true;
00116 }
00117 return NULL;
00118 }
00119
00120 PostList *
00121 InMemoryPostList::skip_to(Xapian::docid did, Xapian::weight w_min)
00122 {
00123
00124
00125
00126
00127
00128
00129
00130 started = true;
00131 Assert(!at_end());
00132 while (!at_end() && (*pos).did < did) {
00133 (void) next(w_min);
00134 }
00135 return NULL;
00136 }
00137
00138 bool
00139 InMemoryPostList::at_end() const
00140 {
00141 return (pos == end);
00142 }
00143
00144 string
00145 InMemoryPostList::get_description() const
00146 {
00147 return "InMemoryPostList " + om_tostring(termfreq);
00148 }
00149
00150 Xapian::doclength
00151 InMemoryPostList::get_doclength() const
00152 {
00153 return db->get_doclength(get_docid());
00154 }
00155
00156 PositionList *
00157 InMemoryPostList::read_position_list()
00158 {
00159 mypositions.set_data(pos->positions);
00160 return &mypositions;
00161 }
00162
00163 PositionList *
00164 InMemoryPostList::open_position_list() const
00165 {
00166 return new InMemoryPositionList(pos->positions);
00167 }
00168
00169 Xapian::termcount
00170 InMemoryPostList::get_wdf() const
00171 {
00172 return (*pos).wdf;
00173 }
00174
00176
00178
00179 InMemoryTermList::InMemoryTermList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db_,
00180 Xapian::docid did_,
00181 const InMemoryDoc & doc,
00182 Xapian::termcount len)
00183 : pos(doc.terms.begin()), end(doc.terms.end()), terms(doc.terms.size()),
00184 started(false), db(db_), did(did_), document_length(len)
00185 {
00186 DEBUGLINE(DB, "InMemoryTermList::InMemoryTermList(): " <<
00187 terms << " terms starting from " << pos->tname);
00188 }
00189
00190 Xapian::termcount
00191 InMemoryTermList::get_wdf() const
00192 {
00193 Assert(started);
00194 Assert(!at_end());
00195 return (*pos).wdf;
00196 }
00197
00198 Xapian::doccount
00199 InMemoryTermList::get_termfreq() const
00200 {
00201 Assert(started);
00202 Assert(!at_end());
00203
00204 return db->get_termfreq((*pos).tname);
00205 }
00206
00207 Xapian::termcount
00208 InMemoryTermList::get_approx_size() const
00209 {
00210 return terms;
00211 }
00212
00213 void
00214 InMemoryTermList::accumulate_stats(Xapian::Internal::ExpandStats & stats) const
00215 {
00216 Assert(started);
00217 Assert(!at_end());
00218 stats.accumulate(InMemoryTermList::get_wdf(), document_length,
00219 InMemoryTermList::get_termfreq(),
00220 db->get_doccount());
00221 }
00222
00223 string
00224 InMemoryTermList::get_termname() const
00225 {
00226 Assert(started);
00227 Assert(!at_end());
00228 return (*pos).tname;
00229 }
00230
00231 TermList *
00232 InMemoryTermList::next()
00233 {
00234 if (started) {
00235 Assert(!at_end());
00236 pos++;
00237 } else {
00238 started = true;
00239 }
00240 return NULL;
00241 }
00242
00243 bool
00244 InMemoryTermList::at_end() const
00245 {
00246 Assert(started);
00247 return (pos == end);
00248 }
00249
00250 Xapian::termcount
00251 InMemoryTermList::positionlist_count() const
00252 {
00253 return db->positionlist_count(did, (*pos).tname);
00254 }
00255
00256 Xapian::PositionIterator
00257 InMemoryTermList::positionlist_begin() const
00258 {
00259 return Xapian::PositionIterator(db->open_position_list(did, (*pos).tname));
00260 }
00261
00263
00265
00266 InMemoryAllDocsPostList::InMemoryAllDocsPostList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db_)
00267 : did(0), db(db_)
00268 {
00269 }
00270
00271 Xapian::doccount
00272 InMemoryAllDocsPostList::get_termfreq() const
00273 {
00274 return db->totdocs;
00275 }
00276
00277 Xapian::docid
00278 InMemoryAllDocsPostList::get_docid() const
00279 {
00280 Assert(did > 0);
00281 Assert(did <= db->termlists.size());
00282 Assert(db->termlists[did - 1].is_valid);
00283 return did;
00284 }
00285
00286 Xapian::doclength
00287 InMemoryAllDocsPostList::get_doclength() const
00288 {
00289 return db->get_doclength(did);
00290 }
00291
00292 Xapian::termcount
00293 InMemoryAllDocsPostList::get_wdf() const
00294 {
00295 return 1;
00296 }
00297
00298 PositionList *
00299 InMemoryAllDocsPostList::read_position_list()
00300 {
00301 throw Xapian::UnimplementedError("Can't open position list for all docs iterator");
00302 }
00303
00304 PositionList *
00305 InMemoryAllDocsPostList::open_position_list() const
00306 {
00307 throw Xapian::UnimplementedError("Can't open position list for all docs iterator");
00308 }
00309
00310 PostList *
00311 InMemoryAllDocsPostList::next(Xapian::weight )
00312 {
00313 Assert(!at_end());
00314 do {
00315 ++did;
00316 } while (did <= db->termlists.size() && !db->termlists[did - 1].is_valid);
00317 return NULL;
00318 }
00319
00320 PostList *
00321 InMemoryAllDocsPostList::skip_to(Xapian::docid did_, Xapian::weight )
00322 {
00323 Assert(!at_end());
00324 if (did <= did_) {
00325 did = did_;
00326 while (did <= db->termlists.size() && !db->termlists[did - 1].is_valid) {
00327 ++did;
00328 }
00329 }
00330 return NULL;
00331 }
00332
00333 bool
00334 InMemoryAllDocsPostList::at_end() const
00335 {
00336 return (did > db->termlists.size());
00337 }
00338
00339 string
00340 InMemoryAllDocsPostList::get_description() const
00341 {
00342 return "InMemoryAllDocsPostList " + om_tostring(did);
00343 }
00344
00346
00348
00349 InMemoryDatabase::InMemoryDatabase()
00350 : totdocs(0), totlen(0), positions_present(false)
00351 {
00352
00353 transaction_state = TRANSACTION_UNIMPLEMENTED;
00354 }
00355
00356 InMemoryDatabase::~InMemoryDatabase()
00357 {
00358 dtor_called();
00359 }
00360
00361 LeafPostList *
00362 InMemoryDatabase::open_post_list(const string & tname) const
00363 {
00364 if (tname.empty()) {
00365 Xapian::Internal::RefCntPtr<const InMemoryDatabase> ptrtothis(this);
00366 return new InMemoryAllDocsPostList(ptrtothis);
00367 }
00368 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00369 if (i == postlists.end() || i->second.term_freq == 0)
00370 return new EmptyPostList();
00371
00372 Xapian::Internal::RefCntPtr<const InMemoryDatabase> ptrtothis(this);
00373 LeafPostList * pl = new InMemoryPostList(ptrtothis, i->second);
00374 Assert(!pl->at_end());
00375 return pl;
00376 }
00377
00378 bool
00379 InMemoryDatabase::doc_exists(Xapian::docid did) const
00380 {
00381 return (did > 0 && did <= termlists.size() && termlists[did - 1].is_valid);
00382 }
00383
00384 Xapian::doccount
00385 InMemoryDatabase::get_termfreq(const string & tname) const
00386 {
00387 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00388 if (i == postlists.end()) return 0;
00389 return i->second.term_freq;
00390 }
00391
00392 Xapian::termcount
00393 InMemoryDatabase::get_collection_freq(const string &tname) const
00394 {
00395 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00396 if (i == postlists.end()) return 0;
00397 return i->second.collection_freq;
00398 }
00399
00400 Xapian::doccount
00401 InMemoryDatabase::get_doccount() const
00402 {
00403 return totdocs;
00404 }
00405
00406 Xapian::docid
00407 InMemoryDatabase::get_lastdocid() const
00408 {
00409 return termlists.size();
00410 }
00411
00412 Xapian::doclength
00413 InMemoryDatabase::get_avlength() const
00414 {
00415 if (totdocs == 0) return 0;
00416 return Xapian::doclength(totlen) / totdocs;
00417 }
00418
00419 Xapian::doclength
00420 InMemoryDatabase::get_doclength(Xapian::docid did) const
00421 {
00422 if (!doc_exists(did)) {
00423 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00424 string(" not found"));
00425 }
00426 return doclengths[did - 1];
00427 }
00428
00429 TermList *
00430 InMemoryDatabase::open_term_list(Xapian::docid did) const
00431 {
00432 if (did == 0) throw Xapian::InvalidArgumentError("Docid 0 invalid");
00433 if (!doc_exists(did)) {
00434
00435 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00436 string(" not found"));
00437 }
00438 return new InMemoryTermList(Xapian::Internal::RefCntPtr<const InMemoryDatabase>(this), did,
00439 termlists[did - 1], doclengths[did - 1]);
00440 }
00441
00442 Xapian::Document::Internal *
00443 InMemoryDatabase::open_document(Xapian::docid did, bool ) const
00444 {
00445
00446 if (did == 0) throw Xapian::InvalidArgumentError("Docid 0 invalid");
00447 if (!doc_exists(did)) {
00448
00449 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00450 string(" not found"));
00451 }
00452 return new InMemoryDocument(this, did, doclists[did - 1],
00453 valuelists[did - 1]);
00454 }
00455
00456 std::string
00457 InMemoryDatabase::get_metadata(const std::string & key) const
00458 {
00459 map<string, string>::const_iterator i = metadata.find(key);
00460 if (i == metadata.end())
00461 return string();
00462 return i->second;
00463 }
00464
00465 TermList *
00466 InMemoryDatabase::open_metadata_keylist(const string &) const
00467 {
00468 if (metadata.empty()) return NULL;
00469
00470 throw Xapian::UnimplementedError("InMemory backend doesn't currently implement Database::metadata_keys_begin()");
00471 }
00472
00473 void
00474 InMemoryDatabase::set_metadata(const std::string & key,
00475 const std::string & value)
00476 {
00477 if (!value.empty()) {
00478 metadata[key] = value;
00479 } else {
00480 metadata.erase(key);
00481 }
00482 }
00483
00484 Xapian::termcount
00485 InMemoryDatabase::positionlist_count(Xapian::docid did,
00486 const string & tname) const
00487 {
00488 if (!doc_exists(did)) {
00489 return 0;
00490 }
00491 const InMemoryDoc &doc = termlists[did-1];
00492
00493 vector<InMemoryTermEntry>::const_iterator i;
00494 for (i = doc.terms.begin(); i != doc.terms.end(); ++i) {
00495 if (i->tname == tname) {
00496 return i->positions.size();
00497 }
00498 }
00499 return 0;
00500 }
00501
00502 PositionList *
00503 InMemoryDatabase::open_position_list(Xapian::docid did,
00504 const string & tname) const
00505 {
00506 if (!doc_exists(did)) {
00507 throw Xapian::DocNotFoundError("Document id " + om_tostring(did) +
00508 " doesn't exist in inmemory database");
00509 }
00510 const InMemoryDoc &doc = termlists[did-1];
00511
00512 vector<InMemoryTermEntry>::const_iterator i;
00513 for (i = doc.terms.begin(); i != doc.terms.end(); ++i) {
00514 if (i->tname == tname) {
00515 return new InMemoryPositionList(i->positions);
00516 }
00517 }
00518 throw Xapian::RangeError("No positionlist for term in document.");
00519 }
00520
00521 void
00522 InMemoryDatabase::add_values(Xapian::docid did,
00523 const map<Xapian::valueno, string> &values_)
00524 {
00525 if (did > valuelists.size()) {
00526 valuelists.resize(did);
00527 }
00528 valuelists[did-1] = values_;
00529 }
00530
00531
00532 void
00533 InMemoryDatabase::flush()
00534 {
00535 }
00536
00537
00538 void
00539 InMemoryDatabase::cancel()
00540 {
00541 }
00542
00543 void
00544 InMemoryDatabase::delete_document(Xapian::docid did)
00545 {
00546 if (!doc_exists(did)) {
00547 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00548 string(" not found"));
00549 }
00550 termlists[did-1].is_valid = false;
00551 doclists[did-1] = "";
00552 valuelists[did-1].clear();
00553 totlen -= doclengths[did-1];
00554 doclengths[did-1] = 0;
00555 totdocs--;
00556
00557
00558 if (totdocs == 0) positions_present = false;
00559
00560 vector<InMemoryTermEntry>::const_iterator i;
00561 for (i = termlists[did - 1].terms.begin();
00562 i != termlists[did - 1].terms.end();
00563 ++i) {
00564 map<string, InMemoryTerm>::iterator t = postlists.find(i->tname);
00565 Assert(t != postlists.end());
00566 t->second.collection_freq -= i->wdf;
00567 --t->second.term_freq;
00568 vector<InMemoryPosting>::iterator posting = t->second.docs.begin();
00569 while (posting != t->second.docs.end()) {
00570
00571
00572
00573 if (posting->did == did) posting->valid = false;
00574 ++posting;
00575 }
00576 }
00577 termlists[did-1].terms.clear();
00578 }
00579
00580 void
00581 InMemoryDatabase::replace_document(Xapian::docid did,
00582 const Xapian::Document & document)
00583 {
00584 DEBUGLINE(DB, "InMemoryDatabase::replace_document(): replacing doc "
00585 << did);
00586
00587 if (doc_exists(did)) {
00588 doclists[did - 1] = "";
00589 valuelists[did - 1].clear();
00590 totlen -= doclengths[did - 1];
00591 totdocs--;
00592 } else if (did > termlists.size()) {
00593 termlists.resize(did);
00594 termlists[did - 1].is_valid = true;
00595 doclengths.resize(did);
00596 doclists.resize(did);
00597 valuelists.resize(did);
00598 } else {
00599 termlists[did - 1].is_valid = true;
00600 }
00601
00602 vector<InMemoryTermEntry>::const_iterator i;
00603 for (i = termlists[did - 1].terms.begin();
00604 i != termlists[did - 1].terms.end();
00605 ++i) {
00606 map<string, InMemoryTerm>::iterator t = postlists.find(i->tname);
00607 Assert(t != postlists.end());
00608 t->second.collection_freq -= i->wdf;
00609 --t->second.term_freq;
00610 vector<InMemoryPosting>::iterator posting = t->second.docs.begin();
00611 while (posting != t->second.docs.end()) {
00612
00613
00614
00615 if (posting->did == did) posting->valid = false;
00616 ++posting;
00617 }
00618 }
00619
00620 doclengths[did - 1] = 0;
00621 doclists[did - 1] = document.get_data();
00622
00623 finish_add_doc(did, document);
00624 }
00625
00626 Xapian::docid
00627 InMemoryDatabase::add_document(const Xapian::Document & document)
00628 {
00629 Xapian::docid did = make_doc(document.get_data());
00630
00631 DEBUGLINE(DB, "InMemoryDatabase::add_document(): adding doc " << did);
00632
00633 finish_add_doc(did, document);
00634
00635 return did;
00636 }
00637
00638 void
00639 InMemoryDatabase::finish_add_doc(Xapian::docid did, const Xapian::Document &document)
00640 {
00641 {
00642 map<Xapian::valueno, string> values;
00643 Xapian::ValueIterator k = document.values_begin();
00644 Xapian::ValueIterator k_end = document.values_end();
00645 for ( ; k != k_end; ++k) {
00646 values.insert(make_pair(k.get_valueno(), *k));
00647 DEBUGLINE(DB, "InMemoryDatabase::finish_add_doc(): adding value "
00648 << k.get_valueno() << " -> " << *k);
00649 }
00650 add_values(did, values);
00651 }
00652
00653 InMemoryDoc doc(true);
00654 Xapian::TermIterator i = document.termlist_begin();
00655 Xapian::TermIterator i_end = document.termlist_end();
00656 for ( ; i != i_end; ++i) {
00657 make_term(*i);
00658
00659 DEBUGLINE(DB, "InMemoryDatabase::finish_add_doc(): adding term "
00660 << *i);
00661 Xapian::PositionIterator j = i.positionlist_begin();
00662 Xapian::PositionIterator j_end = i.positionlist_end();
00663
00664 if (j == j_end) {
00665
00666 make_posting(&doc, *i, did, 0, i.get_wdf(), false);
00667 } else {
00668 positions_present = true;
00669 for ( ; j != j_end; ++j) {
00670 make_posting(&doc, *i, did, *j, i.get_wdf());
00671 }
00672 }
00673
00674 Assert(did > 0 && did <= doclengths.size());
00675 doclengths[did - 1] += i.get_wdf();
00676 totlen += i.get_wdf();
00677 postlists[*i].collection_freq += i.get_wdf();
00678 ++postlists[*i].term_freq;
00679 }
00680 swap(termlists[did - 1], doc);
00681
00682 totdocs++;
00683 }
00684
00685 void
00686 InMemoryDatabase::make_term(const string & tname)
00687 {
00688 postlists[tname];
00689 }
00690
00691 Xapian::docid
00692 InMemoryDatabase::make_doc(const string & docdata)
00693 {
00694 termlists.push_back(InMemoryDoc(true));
00695 doclengths.push_back(0);
00696 doclists.push_back(docdata);
00697
00698 AssertEqParanoid(termlists.size(), doclengths.size());
00699
00700 return termlists.size();
00701 }
00702
00703 void InMemoryDatabase::make_posting(InMemoryDoc * doc,
00704 const string & tname,
00705 Xapian::docid did,
00706 Xapian::termpos position,
00707 Xapian::termcount wdf,
00708 bool use_position)
00709 {
00710 Assert(doc);
00711 Assert(postlists.find(tname) != postlists.end());
00712 Assert(did > 0 && did <= termlists.size());
00713 Assert(did > 0 && did <= doclengths.size());
00714 Assert(doc_exists(did));
00715
00716
00717 InMemoryPosting posting;
00718 posting.did = did;
00719 if (use_position) {
00720 posting.positions.push_back(position);
00721 }
00722 posting.wdf = wdf;
00723 posting.valid = true;
00724
00725
00726 postlists[tname].add_posting(posting);
00727
00728
00729 InMemoryTermEntry termentry;
00730 termentry.tname = tname;
00731 if (use_position) {
00732 termentry.positions.push_back(position);
00733 }
00734 termentry.wdf = wdf;
00735
00736
00737 doc->add_posting(termentry);
00738 }
00739
00740 bool
00741 InMemoryDatabase::term_exists(const string & tname) const
00742 {
00743 Assert(!tname.empty());
00744 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00745 if (i == postlists.end()) return false;
00746 return (i->second.term_freq != 0);
00747 }
00748
00749 bool
00750 InMemoryDatabase::has_positions() const
00751 {
00752 return positions_present;
00753 }
00754
00755 TermList *
00756 InMemoryDatabase::open_allterms(const string & prefix) const
00757 {
00758 return new InMemoryAllTermsList(&postlists,
00759 Xapian::Internal::RefCntPtr<const InMemoryDatabase>(this),
00760 prefix);
00761 }