backends/flint/flint_postlist.cc

Go to the documentation of this file.
00001 /* flint_postlist.cc: Postlists in a flint database
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002,2003,2004,2005,2007 Olly Betts
00005  * Copyright 2007 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 #include "flint_postlist.h"
00025 
00026 #include "flint_cursor.h"
00027 #include "flint_database.h"
00028 #include "flint_utils.h"
00029 #include "noreturn.h"
00030 #include "omdebug.h"
00031 #include "utils.h"
00032 
00033 Xapian::doccount
00034 FlintPostListTable::get_termfreq(const string & term) const
00035 {
00036     string key = make_key(term);
00037     string tag;
00038     if (!get_exact_entry(key, tag)) return 0;
00039 
00040     Xapian::doccount termfreq;
00041     const char * p = tag.data();
00042     FlintPostList::read_number_of_entries(&p, p + tag.size(), &termfreq, NULL);
00043     return termfreq;
00044 }
00045 
00046 Xapian::termcount
00047 FlintPostListTable::get_collection_freq(const string & term) const
00048 {
00049     string key = make_key(term);
00050     string tag;
00051     if (!get_exact_entry(key, tag)) return 0;
00052 
00053     Xapian::termcount collfreq;
00054     const char * p = tag.data();
00055     FlintPostList::read_number_of_entries(&p, p + tag.size(), NULL, &collfreq);
00056     return collfreq;
00057 }
00058 
00059 // How big should chunks in the posting list be?  (They
00060 // will grow slightly bigger than this, but not more than a
00061 // few bytes extra) - FIXME: tune this value to try to
00062 // maximise how well blocks are used.  Or performance.
00063 // Or indexing speed.  Or something...
00064 const unsigned int CHUNKSIZE = 2000;
00065 
00072 class PostlistChunkWriter {
00073     public:
00074         PostlistChunkWriter(const string &orig_key_,
00075                             bool is_first_chunk_,
00076                             const string &tname_,
00077                             bool is_last_chunk_);
00078 
00080         void append(FlintTable * table, Xapian::docid did,
00081                     Xapian::termcount wdf, flint_doclen_t doclen);
00082 
00084         void raw_append(Xapian::docid first_did_, Xapian::docid current_did_,
00085                         const string & s) {
00086             Assert(!started);
00087             first_did = first_did_;
00088             current_did = current_did_;
00089             if (!s.empty()) {
00090                 chunk.append(s);
00091                 started = true;
00092             }
00093         }
00094 
00099         void flush(FlintTable *table);
00100 
00101     private:
00102         string orig_key;
00103         string tname;
00104         bool is_first_chunk;
00105         bool is_last_chunk;
00106         bool started;
00107 
00108         Xapian::docid first_did;
00109         Xapian::docid current_did;
00110 
00111         string chunk;
00112 };
00113 
00114 // Static functions
00115 
00117 XAPIAN_NORETURN(static void report_read_error(const char * position));
00118 static void report_read_error(const char * position)
00119 {
00120     if (position == 0) {
00121         // data ran out
00122         throw Xapian::DatabaseCorruptError("Data ran out unexpectedly when reading posting list.");
00123     }
00124     // overflow
00125     throw Xapian::RangeError("Value in posting list too large.");
00126 }
00127 
00128 static inline bool get_tname_from_key(const char **src, const char *end,
00129                                string &tname)
00130 {
00131     return unpack_string_preserving_sort(src, end, tname);
00132 }
00133 
00134 static inline bool
00135 check_tname_in_key_lite(const char **keypos, const char *keyend, const string &tname)
00136 {
00137     string tname_in_key;
00138 
00139     // Read the termname.
00140     if (!get_tname_from_key(keypos, keyend, tname_in_key)) {
00141         report_read_error(*keypos);
00142     }
00143 
00144     // This should only fail if the postlist doesn't exist at all.
00145     return tname_in_key == tname;
00146 }
00147 
00148 static inline bool
00149 check_tname_in_key(const char **keypos, const char *keyend, const string &tname)
00150 {
00151     if (*keypos == keyend) return false;
00152 
00153     return check_tname_in_key_lite(keypos, keyend, tname);
00154 }
00155 
00157 static Xapian::docid
00158 read_start_of_first_chunk(const char ** posptr,
00159                           const char * end,
00160                           Xapian::doccount * number_of_entries_ptr,
00161                           Xapian::termcount * collection_freq_ptr)
00162 {
00163     DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_first_chunk",
00164                      (const void *)posptr << ", " <<
00165                      (const void *)end << ", " <<
00166                      (void *)number_of_entries_ptr << ", " <<
00167                      (void *)collection_freq_ptr);
00168 
00169     FlintPostList::read_number_of_entries(posptr, end,
00170                            number_of_entries_ptr, collection_freq_ptr);
00171     if (number_of_entries_ptr)
00172         DEBUGLINE(DB, "number_of_entries = " << *number_of_entries_ptr);
00173     if (collection_freq_ptr)
00174         DEBUGLINE(DB, "collection_freq = " << *collection_freq_ptr);
00175 
00176     Xapian::docid did;
00177     // Read the docid of the first entry in the posting list.
00178     if (!unpack_uint(posptr, end, &did))
00179         report_read_error(*posptr);
00180     ++did;
00181     DEBUGLINE(DB, "doc_id = " << did);
00182     RETURN(did);
00183 }
00184 
00185 static inline void read_did_increase(const char ** posptr,
00186                               const char * end,
00187                               Xapian::docid * did_ptr)
00188 {
00189     Xapian::docid did_increase;
00190     if (!unpack_uint(posptr, end, &did_increase)) report_read_error(*posptr);
00191     *did_ptr += did_increase + 1;
00192 }
00193 
00195 static inline void read_wdf_and_length(const char ** posptr,
00196                                 const char * end,
00197                                 Xapian::termcount * wdf_ptr,
00198                                 flint_doclen_t * doclength_ptr)
00199 {
00200     if (!unpack_uint(posptr, end, wdf_ptr)) report_read_error(*posptr);
00201     if (!unpack_uint(posptr, end, doclength_ptr)) report_read_error(*posptr);
00202 }
00203 
00205 static Xapian::docid
00206 read_start_of_chunk(const char ** posptr,
00207                     const char * end,
00208                     Xapian::docid first_did_in_chunk,
00209                     bool * is_last_chunk_ptr)
00210 {
00211     DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_chunk",
00212                      reinterpret_cast<const void*>(posptr) << ", " <<
00213                      reinterpret_cast<const void*>(end) << ", " <<
00214                      first_did_in_chunk << ", " <<
00215                      reinterpret_cast<const void*>(is_last_chunk_ptr));
00216 
00217     // Read whether this is the last chunk
00218     if (!unpack_bool(posptr, end, is_last_chunk_ptr))
00219         report_read_error(*posptr);
00220     if (is_last_chunk_ptr)
00221         DEBUGLINE(DB, "is_last_chunk = " << *is_last_chunk_ptr);
00222 
00223     // Read what the final document ID in this chunk is.
00224     Xapian::docid increase_to_last;
00225     if (!unpack_uint(posptr, end, &increase_to_last))
00226         report_read_error(*posptr);
00227     ++increase_to_last;
00228     Xapian::docid last_did_in_chunk = first_did_in_chunk + increase_to_last;
00229     DEBUGLINE(DB, "last_did_in_chunk = " << last_did_in_chunk);
00230     RETURN(last_did_in_chunk);
00231 }
00232 
00233 static string make_wdf_and_length(Xapian::termcount wdf, flint_doclen_t doclength)
00234 {
00235     return pack_uint(wdf) + pack_uint(doclength);
00236 }
00237 
00238 static void write_start_of_chunk(string & chunk,
00239                                  unsigned int start_of_chunk_header,
00240                                  unsigned int end_of_chunk_header,
00241                                  bool is_last_chunk,
00242                                  Xapian::docid first_did_in_chunk,
00243                                  Xapian::docid last_did_in_chunk)
00244 {
00245     Assert((size_t)(end_of_chunk_header - start_of_chunk_header) <= chunk.size());
00246     Assert(last_did_in_chunk >= first_did_in_chunk);
00247     Xapian::docid increase_to_last = last_did_in_chunk - first_did_in_chunk;
00248 
00249     chunk.replace(start_of_chunk_header,
00250                   end_of_chunk_header - start_of_chunk_header,
00251                   pack_bool(is_last_chunk) + pack_uint(increase_to_last - 1));
00252     // FIXME - storing increase_to_last - 1 is bogus as this value is
00253     // -1 when a postlist chunk has a single entry!  Luckily the code
00254     // works despite this, but it's ugly.
00255 }
00256 
00261 class PostlistChunkReader {
00262     public:
00268         PostlistChunkReader(Xapian::docid first_did, const string & data_)
00269             : data(data_), pos(data.data()), end(pos + data.length()), at_end(data.empty()), did(first_did)
00270         {
00271             if (!at_end) read_wdf_and_length(&pos, end, &wdf, &doclength);
00272         }
00273 
00274         Xapian::docid get_docid() const {
00275             return did;
00276         }
00277         Xapian::termcount get_wdf() const {
00278             return wdf;
00279         }
00280         flint_doclen_t get_doclength() const {
00281             DEBUGCALL(DB, flint_doclen_t, "PostlistChunkReader::get_doclength", "");
00282             RETURN(doclength);
00283         }
00284 
00285         bool is_at_end() const {
00286             return at_end;
00287         }
00288 
00291         void next();
00292 
00293     private:
00294         string data;
00295 
00296         const char *pos;
00297         const char *end;
00298 
00299         bool at_end;
00300 
00301         Xapian::docid did;
00302         Xapian::termcount wdf;
00303         flint_doclen_t doclength;
00304 };
00305 
00306 void
00307 PostlistChunkReader::next()
00308 {
00309     if (pos == end) {
00310         at_end = true;
00311     } else {
00312         read_did_increase(&pos, end, &did);
00313         read_wdf_and_length(&pos, end, &wdf, &doclength);
00314     }
00315 }
00316 
00317 PostlistChunkWriter::PostlistChunkWriter(const string &orig_key_,
00318                                          bool is_first_chunk_,
00319                                          const string &tname_,
00320                                          bool is_last_chunk_)
00321         : orig_key(orig_key_),
00322           tname(tname_), is_first_chunk(is_first_chunk_),
00323           is_last_chunk(is_last_chunk_),
00324           started(false)
00325 {
00326     DEBUGCALL(DB, void, "PostlistChunkWriter::PostlistChunkWriter",
00327               orig_key_ << ", " << is_first_chunk_ << ", " << tname_ << ", " <<
00328               is_last_chunk_);
00329 }
00330 
00331 void
00332 PostlistChunkWriter::append(FlintTable * table, Xapian::docid did,
00333                             Xapian::termcount wdf, flint_doclen_t doclen)
00334 {
00335     if (!started) {
00336         started = true;
00337         first_did = did;
00338     } else {
00339         Assert(did > current_did);
00340         // Start a new chunk if this one has grown to the threshold.
00341         if (chunk.size() >= CHUNKSIZE) {
00342             bool save_is_last_chunk = is_last_chunk;
00343             is_last_chunk = false;
00344             flush(table);
00345             is_last_chunk = save_is_last_chunk;
00346             is_first_chunk = false;
00347             first_did = did;
00348             chunk = "";
00349             orig_key = FlintPostListTable::make_key(tname, first_did);
00350         } else {
00351             chunk.append(pack_uint(did - current_did - 1));
00352         }
00353     }
00354     current_did = did;
00355     chunk.append(make_wdf_and_length(wdf, doclen));
00356 }
00357 
00360 static inline string
00361 make_start_of_first_chunk(Xapian::doccount entries,
00362                           Xapian::termcount collectionfreq,
00363                           Xapian::docid new_did)
00364 {
00365     return pack_uint(entries) + pack_uint(collectionfreq) + pack_uint(new_did - 1);
00366 }
00367 
00370 static inline string
00371 make_start_of_chunk(bool new_is_last_chunk,
00372                     Xapian::docid new_first_did,
00373                     Xapian::docid new_final_did)
00374 {
00375     Assert(new_final_did >= new_first_did);
00376     return pack_bool(new_is_last_chunk) +
00377             pack_uint(new_final_did - new_first_did - 1);
00378 }
00379 
00380 void
00381 PostlistChunkWriter::flush(FlintTable *table)
00382 {
00383     DEBUGCALL(DB, void, "PostlistChunkWriter::flush", table);
00384 
00385     /* This is one of the more messy parts involved with updating posting
00386      * list chunks.
00387      * 
00388      * Depending on circumstances, we may have to delete an entire chunk
00389      * or file it under a different key, as well as possibly modifying both
00390      * the previous and next chunk of the postlist.
00391      */
00392 
00393     if (!started) {
00394         /* This chunk is now empty so disappears entirely.
00395          *
00396          * If this was the last chunk, then the previous chunk
00397          * must have its "is_last_chunk" flag updated.
00398          *
00399          * If this was the first chunk, then the next chunk must
00400          * be transformed into the first chunk.  Messy!
00401          */
00402         DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting chunk");
00403         Assert(!orig_key.empty());
00404         if (is_first_chunk) {
00405             DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting first chunk");
00406             if (is_last_chunk) {
00407                 /* This is the first and the last chunk, ie the only
00408                  * chunk, so just delete the tag.
00409                  */
00410                 table->del(orig_key);
00411                 return;
00412             }
00413 
00414             /* This is the messiest case.  The first chunk is to
00415              * be removed, and there is at least one chunk after
00416              * it.  Need to rewrite the next chunk as the first
00417              * chunk.
00418              */
00419             AutoPtr<FlintCursor> cursor(table->cursor_get());
00420 
00421             if (!cursor->find_entry(orig_key)) {
00422                 throw Xapian::DatabaseCorruptError("The key we're working on has disappeared");
00423             }
00424 
00425             // Extract existing counts from the first chunk so we can reinsert
00426             // them into the block we're renaming.
00427             Xapian::doccount num_ent;
00428             Xapian::termcount coll_freq;
00429             {
00430                 cursor->read_tag();
00431                 const char *tagpos = cursor->current_tag.data();
00432                 const char *tagend = tagpos + cursor->current_tag.size();
00433 
00434                 (void)read_start_of_first_chunk(&tagpos, tagend,
00435                                                 &num_ent, &coll_freq);
00436             }
00437 
00438             // Seek to the next chunk.
00439             cursor->next();
00440             if (cursor->after_end()) {
00441                 throw Xapian::DatabaseCorruptError("Expected another key but found none");
00442             }
00443             const char *kpos = cursor->current_key.data();
00444             const char *kend = kpos + cursor->current_key.size();
00445             if (!check_tname_in_key(&kpos, kend, tname)) {
00446                 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
00447             }
00448 
00449             // Read the new first docid
00450             Xapian::docid new_first_did;
00451             if (!unpack_uint_preserving_sort(&kpos, kend,
00452                                              &new_first_did)) {
00453                 report_read_error(kpos);
00454             }
00455 
00456             cursor->read_tag();
00457             const char *tagpos = cursor->current_tag.data();
00458             const char *tagend = tagpos + cursor->current_tag.size();
00459 
00460             // Read the chunk header
00461             bool new_is_last_chunk;
00462             Xapian::docid new_last_did_in_chunk =
00463                 read_start_of_chunk(&tagpos, tagend, new_first_did,
00464                                     &new_is_last_chunk);
00465 
00466             string chunk_data(tagpos, tagend);
00467 
00468             // First remove the renamed tag
00469             table->del(cursor->current_key);
00470 
00471             // And now write it as the first chunk
00472             string tag;
00473             tag = make_start_of_first_chunk(num_ent, coll_freq, new_first_did);
00474             tag += make_start_of_chunk(new_is_last_chunk,
00475                                               new_first_did,
00476                                               new_last_did_in_chunk);
00477             tag += chunk_data;
00478             table->add(orig_key, tag);
00479             return;
00480         }
00481 
00482         DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary chunk");
00483         /* This isn't the first chunk.  Check whether we're the last
00484          * chunk.
00485          */
00486 
00487         // Delete this chunk
00488         table->del(orig_key);
00489 
00490         if (is_last_chunk) {
00491             DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary last chunk");
00492             // Update the previous chunk's is_last_chunk flag.
00493             AutoPtr<FlintCursor> cursor(table->cursor_get());
00494 
00495             /* Should not find the key we just deleted, but should
00496              * find the previous chunk. */
00497             if (cursor->find_entry(orig_key)) {
00498                 throw Xapian::DatabaseCorruptError("Flint key not deleted as we expected");
00499             }
00500             // Make sure this is a chunk with the right term attached.
00501             const char * keypos = cursor->current_key.data();
00502             const char * keyend = keypos + cursor->current_key.size();
00503             if (!check_tname_in_key(&keypos, keyend, tname)) {
00504                 throw Xapian::DatabaseCorruptError("Couldn't find chunk before delete chunk");
00505             }
00506 
00507             bool is_prev_first_chunk = (keypos == keyend);
00508 
00509             // Now update the last_chunk
00510             cursor->read_tag();
00511             string tag = cursor->current_tag;
00512 
00513             const char *tagpos = tag.data();
00514             const char *tagend = tagpos + tag.size();
00515 
00516             // Skip first chunk header
00517             Xapian::docid first_did_in_chunk;
00518             if (is_prev_first_chunk) {
00519                 first_did_in_chunk = read_start_of_first_chunk(&tagpos, tagend,
00520                                           0, 0);
00521             } else {
00522                 if (!unpack_uint_preserving_sort(&keypos, keyend,
00523                                                  &first_did_in_chunk))
00524                     report_read_error(keypos);
00525             }
00526             bool wrong_is_last_chunk;
00527             string::size_type start_of_chunk_header = tagpos - tag.data();
00528             Xapian::docid last_did_in_chunk =
00529                 read_start_of_chunk(&tagpos, tagend, first_did_in_chunk,
00530                                     &wrong_is_last_chunk);
00531             string::size_type end_of_chunk_header = tagpos - tag.data();
00532 
00533             // write new is_last flag
00534             write_start_of_chunk(tag,
00535                                  start_of_chunk_header,
00536                                  end_of_chunk_header,
00537                                  true, // is_last_chunk
00538                                  first_did_in_chunk,
00539                                  last_did_in_chunk);
00540             table->add(cursor->current_key, tag);
00541         }
00542     } else {
00543         DEBUGLINE(DB, "PostlistChunkWriter::flush(): updating chunk which still has items in it");
00544         /* The chunk still has some items in it.  Two major subcases:
00545          * a) This is the first chunk.
00546          * b) This isn't the first chunk.
00547          *
00548          * The subcases just affect the chunk header.
00549          */
00550         string tag;
00551 
00552         /* First write the header, which depends on whether this is the
00553          * first chunk.
00554          */
00555         if (is_first_chunk) {
00556             /* The first chunk.  This is the relatively easy case,
00557              * and we just have to write this one back to disk.
00558              */
00559             DEBUGLINE(DB, "PostlistChunkWriter::flush(): rewriting the first chunk, which still has items in it");
00560             string key = FlintPostListTable::make_key(tname);
00561             bool ok = table->get_exact_entry(key, tag);
00562             (void)ok;
00563             Assert(ok);
00564             Assert(!tag.empty());
00565 
00566             Xapian::doccount num_ent;
00567             Xapian::termcount coll_freq;
00568             {
00569                 const char * tagpos = tag.data();
00570                 const char * tagend = tagpos + tag.size();
00571                 (void)read_start_of_first_chunk(&tagpos, tagend,
00572                                                 &num_ent, &coll_freq);
00573             }
00574 
00575             tag = make_start_of_first_chunk(num_ent, coll_freq, first_did);
00576 
00577             tag += make_start_of_chunk(is_last_chunk, first_did, current_did);
00578             tag += chunk;
00579             table->add(key, tag);
00580             return;
00581         }
00582 
00583         DEBUGLINE(DB, "PostlistChunkWriter::flush(): updating secondary chunk which still has items in it");
00584         /* Not the first chunk.
00585          *
00586          * This has the easy sub-sub-case:
00587          *   The first entry in the chunk hasn't changed
00588          * ...and the hard sub-sub-case:
00589          *   The first entry in the chunk has changed.  This is
00590          *   harder because the key for the chunk changes, so
00591          *   we've got to do a switch.
00592          */
00593 
00594         // First find out the initial docid
00595         const char *keypos = orig_key.data();
00596         const char *keyend = keypos + orig_key.size();
00597         if (!check_tname_in_key(&keypos, keyend, tname)) {
00598             throw Xapian::DatabaseCorruptError("Have invalid key writing to postlist");
00599         }
00600         Xapian::docid initial_did;
00601         if (!unpack_uint_preserving_sort(&keypos, keyend, &initial_did)) {
00602             report_read_error(keypos);
00603         }
00604         string new_key;
00605         if (initial_did != first_did) {
00606             /* The fiddlier case:
00607              * Create a new tag with the correct key, and replace
00608              * the old one.
00609              */
00610             new_key = FlintPostListTable::make_key(tname, first_did);
00611             table->del(orig_key);
00612         } else {
00613             new_key = orig_key;
00614         }
00615 
00616         // ...and write the start of this chunk.
00617         tag = make_start_of_chunk(is_last_chunk, first_did, current_did);
00618 
00619         tag += chunk;
00620         table->add(new_key, tag);
00621     }
00622 }
00623 
00628 void FlintPostList::read_number_of_entries(const char ** posptr,
00629                                    const char * end,
00630                                    Xapian::doccount * number_of_entries_ptr,
00631                                    Xapian::termcount * collection_freq_ptr)
00632 {
00633     if (!unpack_uint(posptr, end, number_of_entries_ptr))
00634         report_read_error(*posptr);
00635     if (!unpack_uint(posptr, end, collection_freq_ptr))
00636         report_read_error(*posptr);
00637 }
00638 
00658 FlintPostList::FlintPostList(Xapian::Internal::RefCntPtr<const FlintDatabase> this_db_,
00659                              const string & term_)
00660         : this_db(this_db_),
00661           term(term_),
00662           have_started(false),
00663           cursor(this_db->postlist_table.cursor_get()),
00664           is_at_end(false)
00665 {
00666     DEBUGCALL(DB, void, "FlintPostList::FlintPostList",
00667               this_db_.get() << ", " << term_);
00668     string key = FlintPostListTable::make_key(term);
00669     int found = cursor->find_entry(key);
00670     if (!found) {
00671         number_of_entries = 0;
00672         is_at_end = true;
00673         pos = 0;
00674         end = 0;
00675         first_did_in_chunk = 0;
00676         last_did_in_chunk = 0;
00677         return;
00678     }
00679     cursor->read_tag();
00680     pos = cursor->current_tag.data();
00681     end = pos + cursor->current_tag.size();
00682 
00683     did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
00684     first_did_in_chunk = did;
00685     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00686                                             &is_last_chunk);
00687     read_wdf_and_length(&pos, end, &wdf, &doclength);
00688 }
00689 
00690 FlintPostList::~FlintPostList()
00691 {
00692     DEBUGCALL(DB, void, "FlintPostList::~FlintPostList", "");
00693 }
00694 
00695 bool
00696 FlintPostList::next_in_chunk()
00697 {
00698     DEBUGCALL(DB, bool, "FlintPostList::next_in_chunk", "");
00699     if (pos == end) RETURN(false);
00700 
00701     read_did_increase(&pos, end, &did);
00702     read_wdf_and_length(&pos, end, &wdf, &doclength);
00703 
00704     // Either not at last doc in chunk, or pos == end, but not both.
00705     Assert(did <= last_did_in_chunk);
00706     Assert(did < last_did_in_chunk || pos == end);
00707     Assert(pos != end || did == last_did_in_chunk);
00708 
00709     RETURN(true);
00710 }
00711 
00712 void
00713 FlintPostList::next_chunk()
00714 {
00715     DEBUGCALL(DB, void, "FlintPostList::next_chunk", "");
00716     if (is_last_chunk) {
00717         is_at_end = true;
00718         return;
00719     }
00720 
00721     cursor->next();
00722     if (cursor->after_end()) {
00723         is_at_end = true;
00724         throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
00725                                      term + "'");
00726     }
00727     const char * keypos = cursor->current_key.data();
00728     const char * keyend = keypos + cursor->current_key.size();
00729     // Check we're still in same postlist
00730     if (!check_tname_in_key_lite(&keypos, keyend, term)) {
00731         is_at_end = true;
00732         throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
00733                                      term + "'");
00734     }
00735 
00736     Xapian::docid newdid;
00737     if (!unpack_uint_preserving_sort(&keypos, keyend, &newdid)) {
00738         report_read_error(keypos);
00739     }
00740     if (newdid <= did) {
00741         throw Xapian::DatabaseCorruptError("Document ID in new chunk of postlist (" +
00742                 om_tostring(newdid) +
00743                 ") is not greater than final document ID in previous chunk (" +
00744                 om_tostring(did) + ")");
00745     }
00746     did = newdid;
00747 
00748     cursor->read_tag();
00749     pos = cursor->current_tag.data();
00750     end = pos + cursor->current_tag.size();
00751 
00752     first_did_in_chunk = did;
00753     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00754                                             &is_last_chunk);
00755     read_wdf_and_length(&pos, end, &wdf, &doclength);
00756 }
00757 
00758 PositionList *
00759 FlintPostList::read_position_list()
00760 {
00761     DEBUGCALL(DB, PositionList *, "FlintPostList::read_position_list", "");
00762     positionlist.read_data(&this_db->position_table, did, term);
00763     RETURN(&positionlist);
00764 }
00765 
00766 PositionList *
00767 FlintPostList::open_position_list() const
00768 {
00769     DEBUGCALL(DB, PositionList *, "FlintPostList::open_position_list", "");
00770     RETURN(new FlintPositionList(&this_db->position_table, did, term));
00771 }
00772 
00773 PostList *
00774 FlintPostList::next(Xapian::weight w_min)
00775 {
00776     DEBUGCALL(DB, PostList *, "FlintPostList::next", w_min);
00777     (void)w_min; // no warning
00778 
00779     if (!have_started) {
00780         have_started = true;
00781     } else {
00782         if (!next_in_chunk()) next_chunk();
00783     }
00784 
00785     DEBUGLINE(DB, string("Moved to ") <<
00786               (is_at_end ? string("end.") : string("docid, wdf, doclength = ") +
00787                om_tostring(did) + ", " + om_tostring(wdf) + ", " +
00788                om_tostring(doclength)));
00789 
00790     RETURN(NULL);
00791 }
00792 
00793 bool
00794 FlintPostList::current_chunk_contains(Xapian::docid desired_did)
00795 {
00796     DEBUGCALL(DB, bool, "FlintPostList::current_chunk_contains", desired_did);
00797     if (desired_did >= first_did_in_chunk &&
00798         desired_did <= last_did_in_chunk) {
00799         RETURN(true);
00800     }
00801     RETURN(false);
00802 }
00803 
00804 void
00805 FlintPostList::move_to_chunk_containing(Xapian::docid desired_did)
00806 {
00807     DEBUGCALL(DB, void,
00808               "FlintPostList::move_to_chunk_containing", desired_did);
00809     (void)cursor->find_entry(FlintPostListTable::make_key(term, desired_did));
00810     Assert(!cursor->after_end());
00811 
00812     const char * keypos = cursor->current_key.data();
00813     const char * keyend = keypos + cursor->current_key.size();
00814     // Check we're still in same postlist
00815     if (!check_tname_in_key_lite(&keypos, keyend, term)) {
00816         // This should only happen if the postlist doesn't exist at all.
00817         is_at_end = true;
00818         is_last_chunk = true;
00819         return;
00820     }
00821     is_at_end = false;
00822 
00823     cursor->read_tag();
00824     pos = cursor->current_tag.data();
00825     end = pos + cursor->current_tag.size();
00826 
00827     if (keypos == keyend) {
00828         // In first chunk
00829 #ifdef XAPIAN_DEBUG
00830         Xapian::doccount old_number_of_entries = number_of_entries;
00831         did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
00832         Assert(old_number_of_entries == number_of_entries);
00833 #else
00834         did = read_start_of_first_chunk(&pos, end, NULL, NULL);
00835 #endif
00836     } else {
00837         // In normal chunk
00838         if (!unpack_uint_preserving_sort(&keypos, keyend, &did)) {
00839             report_read_error(keypos);
00840         }
00841     }
00842 
00843     first_did_in_chunk = did;
00844     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00845                                             &is_last_chunk);
00846     read_wdf_and_length(&pos, end, &wdf, &doclength);
00847 
00848     // Possible, since desired_did might be after end of this chunk and before
00849     // the next.
00850     if (desired_did > last_did_in_chunk) next_chunk();
00851 }
00852 
00853 bool
00854 FlintPostList::move_forward_in_chunk_to_at_least(Xapian::docid desired_did)
00855 {
00856     DEBUGCALL(DB, bool,
00857               "FlintPostList::move_forward_in_chunk_to_at_least", desired_did);
00858     if (desired_did > last_did_in_chunk) {
00859         pos = end;
00860         RETURN(false);
00861     }
00862     while (did < desired_did) {
00863         // FIXME: perhaps we don't need to decode the wdf and document length
00864         // for documents we're skipping past.
00865         bool at_end_of_chunk = !next_in_chunk();
00866         if (at_end_of_chunk) RETURN(false);
00867     }
00868     RETURN(true);
00869 }
00870 
00871 PostList *
00872 FlintPostList::skip_to(Xapian::docid desired_did, Xapian::weight w_min)
00873 {
00874     DEBUGCALL(DB, PostList *,
00875               "FlintPostList::skip_to", desired_did << ", " << w_min);
00876     (void)w_min; // no warning
00877     // We've started now - if we hadn't already, we're already positioned
00878     // at start so there's no need to actually do anything.
00879     have_started = true;
00880 
00881     // Don't skip back, and don't need to do anything if already there.
00882     if (is_at_end || desired_did <= did) RETURN(NULL);
00883 
00884     // Move to correct chunk
00885     if (!current_chunk_contains(desired_did)) {
00886         move_to_chunk_containing(desired_did);
00887         // Might be at_end now, so we need to check before trying to move
00888         // forward in chunk.
00889         if (is_at_end) RETURN(NULL);
00890     }
00891 
00892     // Move to correct position in chunk
00893     bool have_document = move_forward_in_chunk_to_at_least(desired_did);
00894 #ifdef XAPIAN_DEBUG
00895     Assert(have_document);
00896 #else
00897     (void)have_document;
00898 #endif
00899 
00900     DEBUGLINE(DB, string("Skipped to ") <<
00901               (is_at_end ? string("end.") : string("docid, wdf, doclength = ") +
00902                om_tostring(did) + ", " + om_tostring(wdf) + ", " +
00903                om_tostring(doclength) + "."));
00904 
00905     RETURN(NULL);
00906 }
00907 
00908 string
00909 FlintPostList::get_description() const
00910 {
00911     return term + ":" + om_tostring(number_of_entries);
00912 }
00913 
00914 // Returns the last did to allow in this chunk.
00915 Xapian::docid
00916 FlintPostListTable::get_chunk(const string &tname,
00917           Xapian::docid did, bool adding,
00918           PostlistChunkReader ** from, PostlistChunkWriter **to)
00919 {
00920     // Get chunk containing entry
00921     string key = make_key(tname, did);
00922 
00923     // Find the right chunk
00924     AutoPtr<FlintCursor> cursor(cursor_get());
00925 
00926     cursor->find_entry(key);
00927     Assert(!cursor->after_end());
00928 
00929     const char * keypos = cursor->current_key.data();
00930     const char * keyend = keypos + cursor->current_key.size();
00931 
00932     if (!check_tname_in_key(&keypos, keyend, tname)) {
00933         // Postlist for this termname doesn't exist.
00934         if (!adding)
00935             throw Xapian::DatabaseCorruptError("Attempted to delete or modify an entry in a non-existent posting list for " + tname);
00936 
00937         *from = NULL;
00938         *to = new PostlistChunkWriter("", true, tname, true);
00939         return Xapian::docid(-1);
00940     }
00941 
00942     // See if we're appending - if so we can shortcut by just copying
00943     // the data part of the chunk wholesale.
00944     bool is_first_chunk = (keypos == keyend);
00945 
00946     cursor->read_tag();
00947     const char * pos = cursor->current_tag.data();
00948     const char * end = pos + cursor->current_tag.size();
00949     Xapian::docid first_did_in_chunk;
00950     if (is_first_chunk) {
00951         first_did_in_chunk = read_start_of_first_chunk(&pos, end, NULL, NULL);
00952     } else {
00953         if (!unpack_uint_preserving_sort(&keypos, keyend,
00954                                          &first_did_in_chunk)) {
00955             report_read_error(keypos);
00956         }
00957     }
00958 
00959     bool is_last_chunk;
00960     Xapian::docid last_did_in_chunk;
00961     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk);
00962     *to = new PostlistChunkWriter(cursor->current_key, is_first_chunk, tname,
00963                                   is_last_chunk);
00964     if (did > last_did_in_chunk) {
00965         // This is the shortcut.  Not very pretty, but I'll leave refactoring
00966         // until I've a clearer picture of everything which needs to be done.
00967         // (FIXME)
00968         *from = NULL;
00969         (*to)->raw_append(first_did_in_chunk, last_did_in_chunk,
00970                           string(pos, end)); 
00971     } else {
00972         *from = new PostlistChunkReader(first_did_in_chunk, string(pos, end));
00973     }
00974     if (is_last_chunk) return Xapian::docid(-1);
00975 
00976     // Find first did of next tag.
00977     cursor->next();
00978     if (cursor->after_end()) {
00979         throw Xapian::DatabaseCorruptError("Expected another key but found none");
00980     }
00981     const char *kpos = cursor->current_key.data();
00982     const char *kend = kpos + cursor->current_key.size();
00983     if (!check_tname_in_key(&kpos, kend, tname)) {
00984         throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
00985     }
00986 
00987     // Read the new first docid
00988     Xapian::docid first_did_of_next_chunk;
00989     if (!unpack_uint_preserving_sort(&kpos, kend, &first_did_of_next_chunk)) {
00990         report_read_error(kpos);
00991     }
00992     return first_did_of_next_chunk - 1;
00993 }
00994 
00995 void
00996 FlintPostListTable::merge_changes(
00997     const map<string, map<Xapian::docid, pair<char, Xapian::termcount> > > & mod_plists,
00998     const map<Xapian::docid, Xapian::termcount> & doclens,
00999     const map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> > & freq_deltas)
01000 {
01001     DEBUGCALL(DB, void, "FlintPostListTable::merge_changes", "mod_plists, doclens, freq_deltas");
01002     map<string, map<Xapian::docid, pair<char, Xapian::termcount> > >::const_iterator i;
01003     for (i = mod_plists.begin(); i != mod_plists.end(); ++i) {
01004         if (i->second.empty()) continue;
01005         string tname = i->first;
01006         {
01007             // Rewrite the first chunk of this posting list with the updated
01008             // termfreq and collfreq.
01009             map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> >::const_iterator deltas = freq_deltas.find(tname);
01010             Assert(deltas != freq_deltas.end());
01011 
01012             string current_key = make_key(tname);
01013             string tag;
01014             (void)get_exact_entry(current_key, tag);
01015 
01016             // Read start of first chunk to get termfreq and collfreq.
01017             const char *pos = tag.data();
01018             const char *end = pos + tag.size();
01019             Xapian::doccount termfreq;
01020             Xapian::termcount collfreq;
01021             Xapian::docid firstdid, lastdid;
01022             bool islast;
01023             if (pos == end) {
01024                 termfreq = 0;
01025                 collfreq = 0;
01026                 firstdid = 0;
01027                 lastdid = 0;
01028                 islast = true;
01029             } else {
01030                 firstdid = read_start_of_first_chunk(&pos, end,
01031                                                      &termfreq, &collfreq);
01032                 // Handle the generic start of chunk header.
01033                 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast);
01034             }
01035 
01036             termfreq += deltas->second.first;
01037             if (termfreq == 0) {
01038                 // All postings deleted!  So we can shortcut by zapping the
01039                 // posting list.
01040                 if (islast) {
01041                     // Only one entry for this posting list.
01042                     del(current_key);
01043                     continue;
01044                 }
01045                 AutoPtr<FlintCursor> cursor(cursor_get());
01046                 bool found = cursor->find_entry(current_key);
01047                 Assert(found);
01048                 if (!found) continue; // Reduce damage!
01049                 while (cursor->del()) {
01050                     const char *kpos = cursor->current_key.data();
01051                     const char *kend = kpos + cursor->current_key.size();
01052                     if (!check_tname_in_key_lite(&kpos, kend, tname)) break;
01053                 }
01054                 continue;
01055             }
01056             collfreq += deltas->second.second;
01057 
01058             // Rewrite start of first chunk to update termfreq and collfreq.
01059             string newhdr = make_start_of_first_chunk(termfreq, collfreq, firstdid);
01060             newhdr += make_start_of_chunk(islast, firstdid, lastdid);
01061             if (pos == end) {
01062                 add(current_key, newhdr);
01063             } else {
01064                 Assert((size_t)(pos - tag.data()) <= tag.size());
01065                 tag.replace(0, pos - tag.data(), newhdr);
01066                 add(current_key, tag);
01067             }
01068         }
01069         map<Xapian::docid, pair<char, Xapian::termcount> >::const_iterator j;
01070         j = i->second.begin();
01071         Assert(j != i->second.end()); // This case is caught above.
01072 
01073         Xapian::docid max_did;
01074         PostlistChunkReader *from;
01075         PostlistChunkWriter *to;
01076         max_did = get_chunk(tname, j->first, j->second.first == 'A',
01077                             &from, &to);
01078         for ( ; j != i->second.end(); ++j) {
01079             Xapian::docid did = j->first;
01080 
01081 next_chunk:
01082             DEBUGLINE(DB, "Updating tname=" << tname << ", did=" << did);
01083             if (from) while (!from->is_at_end()) {
01084                 Xapian::docid copy_did = from->get_docid();
01085                 if (copy_did >= did) {
01086                     if (copy_did == did) {
01087                         Assert(j->second.first != 'A');
01088                         from->next();
01089                     }
01090                     break;
01091                 }
01092                 to->append(this, copy_did,
01093                            from->get_wdf(), from->get_doclength());
01094                 from->next();
01095             }
01096             if ((!from || from->is_at_end()) && did > max_did) {
01097                 delete from;
01098                 to->flush(this);
01099                 delete to;
01100                 max_did = get_chunk(tname, did, false, &from, &to);
01101                 goto next_chunk;
01102             }
01103 
01104             if (j->second.first != 'D') {
01105                 map<Xapian::docid, Xapian::termcount>::const_iterator k = doclens.find(did);
01106                 Assert(k != doclens.end());
01107                 Xapian::termcount new_doclen = k->second;
01108                 Xapian::termcount new_wdf = j->second.second;
01109 
01110                 to->append(this, did, new_wdf, new_doclen);
01111             }
01112         }
01113 
01114         if (from) {
01115             while (!from->is_at_end()) {
01116                 to->append(this, from->get_docid(),
01117                            from->get_wdf(), from->get_doclength());
01118                 from->next();
01119             }
01120             delete from;
01121         }
01122         to->flush(this);
01123         delete to;
01124     }
01125 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.