00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include <xapian/types.h>
00024
00025 #include "flint_positionlist.h"
00026 #include "flint_utils.h"
00027 #include "omdebug.h"
00028
00029 #include <vector>
00030 #include <string>
00031 #include <cmath>
00032
00033 using namespace std;
00034
00035 static const unsigned char flstab[256] = {
00036 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
00037 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
00038 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00039 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00040 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00041 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00042 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00043 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00044 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00045 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00046 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00047 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00048 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00049 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00050 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00051 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
00052 };
00053
00054
00055 inline int my_fls(unsigned mask)
00056 {
00057 int result = 0;
00058 if (mask >= 0x10000u) {
00059 mask >>= 16;
00060 result = 16;
00061 }
00062 if (mask >= 0x100u) {
00063 mask >>= 8;
00064 result += 8;
00065 }
00066 return result + flstab[mask];
00067 }
00068
00069 class BitWriter {
00070 private:
00071 string buf;
00072 int n_bits;
00073 unsigned int acc;
00074 public:
00075 BitWriter() : n_bits(0), acc(0) { }
00076 BitWriter(const string & seed) : buf(seed), n_bits(0), acc(0) { }
00077 void encode(size_t value, size_t outof) {
00078 Assert(value < outof);
00079 size_t bits = my_fls(outof - 1);
00080 const size_t spare = (1 << bits) - outof;
00081 if (spare) {
00082 const size_t mid_start = (outof - spare) / 2;
00083 if (value < mid_start) {
00084 write_bits(value, bits);
00085 } else if (value >= mid_start + spare) {
00086 write_bits((value - (mid_start + spare)) | (1 << (bits - 1)), bits);
00087 } else {
00088 --bits;
00089 write_bits(value, bits);
00090 }
00091 } else {
00092 write_bits(value, bits);
00093 }
00094 }
00095 void write_bits(int data, int count) {
00096 if (count + n_bits > 32) {
00097
00098
00099
00100 Assert(count <= 32);
00101 acc |= (data << n_bits);
00102 buf += char(acc);
00103 acc >>= 8;
00104 data >>= 8;
00105 count -= 8;
00106 }
00107 acc |= (data << n_bits);
00108 n_bits += count;
00109 while (n_bits >= 8) {
00110 buf += char(acc);
00111 acc >>= 8;
00112 n_bits -= 8;
00113 }
00114 }
00115 string & freeze() {
00116 if (n_bits) {
00117 buf += char(acc);
00118 n_bits = 0;
00119 acc = 0;
00120 }
00121 return buf;
00122 }
00123 };
00124
00125 static void
00126 encode_interpolative(BitWriter & wr, const vector<Xapian::termpos> &pos, int j, int k)
00127 {
00128 while (j + 1 < k) {
00129 const size_t mid = (j + k) / 2;
00130
00131
00132
00133 const size_t outof = pos[k] - pos[j] + j - k + 1;
00134 const size_t lowest = pos[j] + mid - j;
00135 wr.encode(pos[mid] - lowest, outof);
00136 encode_interpolative(wr, pos, j, mid);
00137 j = mid;
00138 }
00139 }
00140
00141 namespace Xapian {
00142
00143 class BitReader {
00144 private:
00145 string buf;
00146 size_t idx;
00147 int n_bits;
00148 unsigned int acc;
00149 public:
00150 BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { }
00151 Xapian::termpos decode(Xapian::termpos outof) {
00152 size_t bits = my_fls(outof - 1);
00153 const size_t spare = (1 << bits) - outof;
00154 const size_t mid_start = (outof - spare) / 2;
00155 Xapian::termpos p;
00156 if (spare) {
00157 p = read_bits(bits - 1);
00158 if (p < mid_start) {
00159 if (read_bits(1)) p += mid_start + spare;
00160 }
00161 } else {
00162 p = read_bits(bits);
00163 }
00164 Assert(p < outof);
00165 return p;
00166 }
00167 unsigned int read_bits(int count) {
00168 unsigned int result;
00169 if (count > 25) {
00170
00171
00172
00173
00174
00175 Assert(count <= 32);
00176 result = read_bits(16);
00177 return result | (read_bits(count - 16) << 16);
00178 }
00179 while (n_bits < count) {
00180 Assert(idx < buf.size());
00181 acc |= static_cast<unsigned char>(buf[idx++]) << n_bits;
00182 n_bits += 8;
00183 }
00184 result = acc & ((1u << count) - 1);
00185 acc >>= count;
00186 n_bits -= count;
00187 return result;
00188 }
00189
00190
00191
00192
00193 bool check_all_gone() const {
00194 return (idx == buf.size() && n_bits < 7 && acc == 0);
00195 }
00196 void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k);
00197 };
00198
00199 void
00200 BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k)
00201 {
00202 while (j + 1 < k) {
00203 const size_t mid = (j + k) / 2;
00204
00205
00206
00207 const size_t outof = pos[k] - pos[j] + j - k + 1;
00208 pos[mid] = decode(outof) + (pos[j] + mid - j);
00209 decode_interpolative(pos, j, mid);
00210 j = mid;
00211 }
00212 }
00213
00214 }
00215
00216 using Xapian::BitReader;
00217
00218 void
00219 FlintPositionListTable::set_positionlist(Xapian::docid did,
00220 const string & tname,
00221 Xapian::PositionIterator pos,
00222 const Xapian::PositionIterator &pos_end,
00223 bool check_for_update)
00224 {
00225 DEBUGCALL(DB, void, "FlintPositionList::set_positionlist", did << ", " << tname << ", " << pos << ", " << pos_end << ", " << check_for_update);
00226 Assert(pos != pos_end);
00227
00228
00229 vector<Xapian::termpos> poscopy(pos, pos_end);
00230
00231 string key = make_key(did, tname);
00232
00233 string s = pack_uint(poscopy.back());
00234
00235 if (poscopy.size() > 1) {
00236 BitWriter wr(s);
00237 wr.encode(poscopy[0], poscopy.back());
00238 wr.encode(poscopy.size() - 2, poscopy.back() - poscopy[0]);
00239 encode_interpolative(wr, poscopy, 0, poscopy.size() - 1);
00240 swap(s, wr.freeze());
00241 }
00242
00243 if (check_for_update) {
00244 string old_tag;
00245 if (get_exact_entry(key, old_tag) && s == old_tag)
00246 return;
00247 }
00248 add(key, s);
00249 }
00250
00251 Xapian::termcount
00252 FlintPositionListTable::positionlist_count(Xapian::docid did,
00253 const string & term) const
00254 {
00255 DEBUGCALL(DB, void, "FlintPositionListTable::positionlist_count",
00256 did << ", " << term);
00257
00258 string data;
00259 if (!get_exact_entry(pack_uint_preserving_sort(did) + term, data)) {
00260
00261 return 0;
00262 }
00263
00264 const char * pos = data.data();
00265 const char * end = pos + data.size();
00266 Xapian::termpos pos_last;
00267 if (!unpack_uint(&pos, end, &pos_last)) {
00268 throw Xapian::DatabaseCorruptError("Position list data corrupt");
00269 }
00270 if (pos == end) {
00271
00272 return 1;
00273 }
00274
00275 BitReader rd(data);
00276
00277 (void)rd.read_bits(8 * (pos - data.data()));
00278 Xapian::termpos pos_first = rd.decode(pos_last);
00279 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00280 return pos_size;
00281 }
00282
00284
00285 bool
00286 FlintPositionList::read_data(const FlintTable * table, Xapian::docid did,
00287 const string & tname)
00288 {
00289 DEBUGCALL(DB, void, "FlintPositionList::read_data",
00290 table << ", " << did << ", " << tname);
00291
00292 have_started = false;
00293 positions.clear();
00294
00295 string data;
00296 if (!table->get_exact_entry(pack_uint_preserving_sort(did) + tname, data)) {
00297
00298 current_pos = positions.begin();
00299 return false;
00300 }
00301
00302 const char * pos = data.data();
00303 const char * end = pos + data.size();
00304 Xapian::termpos pos_last;
00305 if (!unpack_uint(&pos, end, &pos_last)) {
00306 throw Xapian::DatabaseCorruptError("Position list data corrupt");
00307 }
00308 if (pos == end) {
00309
00310 positions.push_back(pos_last);
00311 current_pos = positions.begin();
00312 return true;
00313 }
00314 BitReader rd(data);
00315
00316 (void)rd.read_bits(8 * (pos - data.data()));
00317 Xapian::termpos pos_first = rd.decode(pos_last);
00318 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00319 positions.resize(pos_size);
00320 positions[0] = pos_first;
00321 positions.back() = pos_last;
00322 rd.decode_interpolative(positions, 0, pos_size - 1);
00323
00324 current_pos = positions.begin();
00325 return true;
00326 }
00327
00328 Xapian::termcount
00329 FlintPositionList::get_size() const
00330 {
00331 DEBUGCALL(DB, Xapian::termcount, "FlintPositionList::get_size", "");
00332 RETURN(positions.size());
00333 }
00334
00335 Xapian::termpos
00336 FlintPositionList::get_position() const
00337 {
00338 DEBUGCALL(DB, Xapian::termpos, "FlintPositionList::get_position", "");
00339 Assert(have_started);
00340 RETURN(*current_pos);
00341 }
00342
00343 void
00344 FlintPositionList::next()
00345 {
00346 DEBUGCALL(DB, void, "FlintPositionList::next", "");
00347
00348 if (!have_started) {
00349 have_started = true;
00350 } else {
00351 Assert(!at_end());
00352 ++current_pos;
00353 }
00354 }
00355
00356 void
00357 FlintPositionList::skip_to(Xapian::termpos termpos)
00358 {
00359 DEBUGCALL(DB, void, "FlintPositionList::skip_to", termpos);
00360 if (!have_started) {
00361 have_started = true;
00362 }
00363 while (!at_end() && *current_pos < termpos) ++current_pos;
00364 }
00365
00366 bool
00367 FlintPositionList::at_end() const
00368 {
00369 DEBUGCALL(DB, bool, "FlintPositionList::at_end", "");
00370 RETURN(current_pos == positions.end());
00371 }