backends/flint/flint_positionlist.cc

Go to the documentation of this file.
00001 /* flint_positionlist.cc: A position list in a flint database.
00002  *
00003  * Copyright (C) 2004,2005,2006,2010 Olly Betts
00004  *
00005  * This program is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU General Public License as
00007  * published by the Free Software Foundation; either version 2 of the
00008  * License, or (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00018  * USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/types.h>
00024 
00025 #include "flint_positionlist.h"
00026 #include "flint_utils.h"
00027 #include "omdebug.h"
00028 
00029 #include <vector>
00030 #include <string>
00031 #include <cmath>
00032 
00033 using namespace std;
00034 
00035 static const unsigned char flstab[256] = {
00036     0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
00037     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
00038     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00039     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00040     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00041     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00042     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00043     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00044     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00045     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00046     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00047     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00048     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00049     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00050     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00051     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
00052 };
00053 
00054 // Highly optimised fls() implementation.
00055 inline int my_fls(unsigned mask)
00056 {
00057     int result = 0;
00058     if (mask >= 0x10000u) {
00059         mask >>= 16;
00060         result = 16;
00061     }
00062     if (mask >= 0x100u) {
00063         mask >>= 8;
00064         result += 8;
00065     }
00066     return result + flstab[mask];
00067 }
00068 
00069 class BitWriter {
00070     private:
00071         string buf;
00072         int n_bits;
00073         unsigned int acc;
00074     public:
00075         BitWriter() : n_bits(0), acc(0) { }
00076         BitWriter(const string & seed) : buf(seed), n_bits(0), acc(0) { }
00077         void encode(size_t value, size_t outof) {
00078             Assert(value < outof);
00079             size_t bits = my_fls(outof - 1);
00080             const size_t spare = (1 << bits) - outof;
00081             if (spare) {
00082                 const size_t mid_start = (outof - spare) / 2;
00083                 if (value < mid_start) {
00084                     write_bits(value, bits);
00085                 } else if (value >= mid_start + spare) {
00086                     write_bits((value - (mid_start + spare)) | (1 << (bits - 1)), bits);
00087                 } else {
00088                     --bits;
00089                     write_bits(value, bits);
00090                 }
00091             } else {
00092                 write_bits(value, bits);
00093             }
00094         }
00095         void write_bits(int data, int count) {
00096             if (count + n_bits > 32) {
00097                 // We need to write more bits than there's empty room for in
00098                 // the accumulator.  So we arrange to shift out 8 bits, then
00099                 // adjust things so we're adding 8 fewer bits.
00100                 Assert(count <= 32);
00101                 acc |= (data << n_bits);
00102                 buf += char(acc);
00103                 acc >>= 8;
00104                 data >>= 8;
00105                 count -= 8;
00106             }
00107             acc |= (data << n_bits);
00108             n_bits += count;
00109             while (n_bits >= 8) {
00110                 buf += char(acc);
00111                 acc >>= 8;
00112                 n_bits -= 8;
00113             }
00114         }
00115         string & freeze() {
00116             if (n_bits) {
00117                 buf += char(acc);
00118                 n_bits = 0;
00119                 acc = 0;
00120             }
00121             return buf;
00122         }
00123 };
00124 
00125 static void
00126 encode_interpolative(BitWriter & wr, const vector<Xapian::termpos> &pos, int j, int k)
00127 {
00128     while (j + 1 < k) {
00129         const size_t mid = (j + k) / 2;
00130         // Encode one out of (pos[k] - pos[j] + 1) values
00131         // (less some at either end because we must be able to fit
00132         // all the intervening pos in)
00133         const size_t outof = pos[k] - pos[j] + j - k + 1;
00134         const size_t lowest = pos[j] + mid - j;
00135         wr.encode(pos[mid] - lowest, outof);
00136         encode_interpolative(wr, pos, j, mid);
00137         j = mid;
00138     }
00139 }
00140 
00141 namespace Xapian {
00142 
00143 class BitReader {
00144     private:
00145         string buf;
00146         size_t idx;
00147         int n_bits;
00148         unsigned int acc;
00149     public:
00150         BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { }
00151         Xapian::termpos decode(Xapian::termpos outof) {
00152             size_t bits = my_fls(outof - 1);
00153             const size_t spare = (1 << bits) - outof;
00154             const size_t mid_start = (outof - spare) / 2;
00155             Xapian::termpos p;
00156             if (spare) {
00157                 p = read_bits(bits - 1);
00158                 if (p < mid_start) {
00159                     if (read_bits(1)) p += mid_start + spare;
00160                 }
00161             } else {
00162                 p = read_bits(bits);
00163             }
00164             Assert(p < outof);
00165             return p;
00166         }
00167         unsigned int read_bits(int count) {
00168             unsigned int result;
00169             if (count > 25) {
00170                 // If we need more than 25 bits, read in two goes to ensure
00171                 // that we don't overflow acc.  This is a little more
00172                 // conservative than it needs to be, but such large values will
00173                 // inevitably be rare (because you can't fit very many of them
00174                 // into 2^32!)
00175                 Assert(count <= 32);
00176                 result = read_bits(16);
00177                 return result | (read_bits(count - 16) << 16);
00178             }
00179             while (n_bits < count) {
00180                 Assert(idx < buf.size());
00181                 acc |= static_cast<unsigned char>(buf[idx++]) << n_bits;
00182                 n_bits += 8;
00183             }
00184             result = acc & ((1u << count) - 1);
00185             acc >>= count;
00186             n_bits -= count;
00187             return result;
00188         }
00189         // Check all the data has been read.  Because it'll be zero padded
00190         // to fill a byte, the best we can actually do is check that
00191         // there's less than a byte left and that all remaining bits are
00192         // zero.
00193         bool check_all_gone() const {
00194             return (idx == buf.size() && n_bits < 7 && acc == 0);
00195         }
00196         void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k);
00197 };
00198 
00199 void
00200 BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k)
00201 {
00202     while (j + 1 < k) {
00203         const size_t mid = (j + k) / 2;
00204         // Decode one out of (pos[k] - pos[j] + 1) values
00205         // (less some at either end because we must be able to fit
00206         // all the intervening pos in)
00207         const size_t outof = pos[k] - pos[j] + j - k + 1;
00208         pos[mid] = decode(outof) + (pos[j] + mid - j);
00209         decode_interpolative(pos, j, mid);
00210         j = mid;
00211     }
00212 }
00213 
00214 }
00215 
00216 using Xapian::BitReader;
00217 
00218 void
00219 FlintPositionListTable::set_positionlist(Xapian::docid did,
00220                                          const string & tname,
00221                                          Xapian::PositionIterator pos,
00222                                          const Xapian::PositionIterator &pos_end,
00223                                          bool check_for_update)
00224 {
00225     DEBUGCALL(DB, void, "FlintPositionList::set_positionlist", did << ", " << tname << ", " << pos << ", " << pos_end << ", " << check_for_update);
00226     Assert(pos != pos_end);
00227 
00228     // FIXME: avoid the need for this copy!
00229     vector<Xapian::termpos> poscopy(pos, pos_end);
00230 
00231     string key = make_key(did, tname);
00232 
00233     string s = pack_uint(poscopy.back());
00234 
00235     if (poscopy.size() > 1) {
00236         BitWriter wr(s);
00237         wr.encode(poscopy[0], poscopy.back());
00238         wr.encode(poscopy.size() - 2, poscopy.back() - poscopy[0]);
00239         encode_interpolative(wr, poscopy, 0, poscopy.size() - 1);
00240         swap(s, wr.freeze());
00241     }
00242 
00243     if (check_for_update) {
00244         string old_tag;
00245         if (get_exact_entry(key, old_tag) && s == old_tag)
00246             return;
00247     }
00248     add(key, s);
00249 }
00250 
00251 Xapian::termcount
00252 FlintPositionListTable::positionlist_count(Xapian::docid did,
00253                                            const string & term) const
00254 {
00255     DEBUGCALL(DB, void, "FlintPositionListTable::positionlist_count",
00256               did << ", " << term);
00257 
00258     string data;
00259     if (!get_exact_entry(pack_uint_preserving_sort(did) + term, data)) {
00260         // There's no positional information for this term.
00261         return 0;
00262     }
00263 
00264     const char * pos = data.data();
00265     const char * end = pos + data.size();
00266     Xapian::termpos pos_last;
00267     if (!unpack_uint(&pos, end, &pos_last)) {
00268         throw Xapian::DatabaseCorruptError("Position list data corrupt");
00269     }
00270     if (pos == end) {
00271         // Special case for single entry position list.
00272         return 1;
00273     }
00274 
00275     BitReader rd(data);
00276     // Skip the header we just read.
00277     (void)rd.read_bits(8 * (pos - data.data()));
00278     Xapian::termpos pos_first = rd.decode(pos_last);
00279     Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00280     return pos_size;
00281 }
00282 
00284 
00285 bool
00286 FlintPositionList::read_data(const FlintTable * table, Xapian::docid did,
00287                              const string & tname)
00288 {
00289     DEBUGCALL(DB, void, "FlintPositionList::read_data",
00290               table << ", " << did << ", " << tname);
00291 
00292     have_started = false;
00293     positions.clear();
00294 
00295     string data;
00296     if (!table->get_exact_entry(pack_uint_preserving_sort(did) + tname, data)) {
00297         // There's no positional information for this term.
00298         current_pos = positions.begin();
00299         return false;
00300     }
00301 
00302     const char * pos = data.data();
00303     const char * end = pos + data.size();
00304     Xapian::termpos pos_last;
00305     if (!unpack_uint(&pos, end, &pos_last)) {
00306         throw Xapian::DatabaseCorruptError("Position list data corrupt");
00307     }
00308     if (pos == end) {
00309         // Special case for single entry position list.
00310         positions.push_back(pos_last);
00311         current_pos = positions.begin();
00312         return true;
00313     }
00314     BitReader rd(data);
00315     // Skip the header we just read.
00316     (void)rd.read_bits(8 * (pos - data.data()));
00317     Xapian::termpos pos_first = rd.decode(pos_last);
00318     Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00319     positions.resize(pos_size);
00320     positions[0] = pos_first;
00321     positions.back() = pos_last;
00322     rd.decode_interpolative(positions, 0, pos_size - 1);
00323 
00324     current_pos = positions.begin();
00325     return true;
00326 }
00327 
00328 Xapian::termcount
00329 FlintPositionList::get_size() const
00330 {
00331     DEBUGCALL(DB, Xapian::termcount, "FlintPositionList::get_size", "");
00332     RETURN(positions.size());
00333 }
00334 
00335 Xapian::termpos
00336 FlintPositionList::get_position() const
00337 {
00338     DEBUGCALL(DB, Xapian::termpos, "FlintPositionList::get_position", "");
00339     Assert(have_started);
00340     RETURN(*current_pos);
00341 }
00342 
00343 void
00344 FlintPositionList::next()
00345 {
00346     DEBUGCALL(DB, void, "FlintPositionList::next", "");
00347 
00348     if (!have_started) {
00349         have_started = true;
00350     } else {
00351         Assert(!at_end());
00352         ++current_pos;
00353     }
00354 }
00355 
00356 void
00357 FlintPositionList::skip_to(Xapian::termpos termpos)
00358 {
00359     DEBUGCALL(DB, void, "FlintPositionList::skip_to", termpos);
00360     if (!have_started) {
00361         have_started = true;
00362     }
00363     while (!at_end() && *current_pos < termpos) ++current_pos;
00364 }
00365 
00366 bool
00367 FlintPositionList::at_end() const
00368 {
00369     DEBUGCALL(DB, bool, "FlintPositionList::at_end", "");
00370     RETURN(current_pos == positions.end());
00371 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.