backends/flint/flint_table.h

Go to the documentation of this file.
00001 /* flint_table.h: Btree implementation
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU General Public License as
00008  * published by the Free Software Foundation; either version 2 of the
00009  * License, or (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00019  * USA
00020  */
00021 
00022 #ifndef OM_HGUARD_FLINT_TABLE_H
00023 #define OM_HGUARD_FLINT_TABLE_H
00024 
00025 #include <xapian/error.h>
00026 #include <xapian/visibility.h>
00027 
00028 #include <algorithm>
00029 #include <string>
00030 using std::string;
00031 
00032 #include "flint_types.h"
00033 #include "flint_btreebase.h"
00034 #include "flint_btreeutil.h"
00035 #include "flint_cursor.h"
00036 #include "noreturn.h"
00037 
00038 #include "stringutils.h"
00039 #include "utils.h"
00040 
00041 #include <zlib.h>
00042 
00043 #define DONT_COMPRESS -1
00044 
00050 #define FLINT_BTREE_MAX_KEY_LEN 252
00051 
00052 // FIXME: This named constant probably isn't used everywhere it should be...
00053 #define BYTES_PER_BLOCK_NUMBER 4
00054 
00055 /*  The B-tree blocks have a number of internal lengths and offsets held in 1, 2
00056     or 4 bytes. To make the coding a little clearer,
00057        we use  for
00058        ------  ---
00059        K1      the 1 byte length of key
00060        I2      the 2 byte length of an item (key-tag pair)
00061        D2      the 2 byte offset to the item from the directory
00062        C2      the 2 byte counter that ends each key and begins each tag
00063 */
00064 
00065 #define K1 1
00066 #define I2 2
00067 #define D2 2
00068 #define C2 2
00069 
00070 /*  and when getting K1 or setting D2, we use getK, setD defined as: */
00071 
00072 #define getK(p, c)    getint1(p, c)
00073 #define setD(p, c, x) setint2(p, c, x)
00074 
00075 /* if you've been reading the comments from the top, the next four procedures
00076    will not cause any headaches.
00077 
00078    Recall that item has this form:
00079 
00080            i k
00081            | |
00082            I K key x C tag
00083              <--K-->
00084            <------I------>
00085 
00086 
00087    item_of(p, c) returns i, the address of the item at block address p,
00088    directory offset c,
00089 
00090    component_of(p, c) returns the number marked 'x' above,
00091 
00092    components_of(p, c) returns the number marked 'C' above,
00093 */
00094 
00095 class XAPIAN_VISIBILITY_DEFAULT Key_ {
00096     const byte *p;
00097 public:
00098     explicit Key_(const byte * p_) : p(p_) { }
00099     const byte * get_address() const { return p; }
00100     void read(string * key) const {
00101         key->assign(reinterpret_cast<const char *>(p + K1), length());
00102     }
00103     bool operator==(Key_ key2) const;
00104     bool operator!=(Key_ key2) const { return !(*this == key2); }
00105     bool operator<(Key_ key2) const;
00106     bool operator>=(Key_ key2) const { return !(*this < key2); }
00107     bool operator>(Key_ key2) const { return key2 < *this; }
00108     bool operator<=(Key_ key2) const { return !(key2 < *this); }
00109     int length() const {
00110         return getK(p, 0) - C2 - K1;
00111     }
00112     char operator[](size_t i) const {
00113         return p[i + K1];
00114     }
00115 };
00116 
00117 // Item_wr_ wants to be "Item_ with non-const p and more methods" - we can't
00118 // achieve that nicely with inheritance, so we use a template base class.
00119 template <class T> class Item_base_ {
00120 protected:
00121     T p;
00122 public:
00123     /* Item_ from block address and offset to item pointer */
00124     Item_base_(T p_, int c) : p(p_ + getint2(p_, c)) { }
00125     Item_base_(T p_) : p(p_) { }
00126     T get_address() const { return p; }
00127     int size() const { return getint2(p, 0) & 0x7fff; } /* I in diagram above */
00128     bool get_compressed() const { return *p & 0x80; }
00129     int component_of() const {
00130         return getint2(p, getK(p, I2) + I2 - C2);
00131     }
00132     int components_of() const {
00133         return getint2(p, getK(p, I2) + I2);
00134     }
00135     Key_ key() const { return Key_(p + I2); }
00136     void append_chunk(string * tag) const {
00137         /* number of bytes to extract from current component */
00138         int cd = getK(p, I2) + I2 + C2;
00139         int l = size() - cd;
00140         tag->append(reinterpret_cast<const char *>(p + cd), l);
00141     }
00145     uint4 block_given_by() const {
00146         return getint4(p, size() - BYTES_PER_BLOCK_NUMBER);
00147     }
00148 };
00149 
00150 class Item_ : public Item_base_<const byte *> {
00151 public:
00152     /* Item_ from block address and offset to item pointer */
00153     Item_(const byte * p_, int c) : Item_base_<const byte *>(p_, c) { }
00154     Item_(const byte * p_) : Item_base_<const byte *>(p_) { }
00155 };
00156 
00157 class Item_wr_ : public Item_base_<byte *> {
00158     void set_key_len(int x) { setint1(p, I2, x); }
00159 public:
00160     /* Item_wr_ from block address and offset to item pointer */
00161     Item_wr_(byte * p_, int c) : Item_base_<byte *>(p_, c) { }
00162     Item_wr_(byte * p_) : Item_base_<byte *>(p_) { }
00163     void set_component_of(int i) {
00164         setint2(p, getK(p, I2) + I2 - C2, i);
00165     }
00166     void set_components_of(int m) {
00167         setint2(p, getK(p, I2) + I2, m);
00168     }
00169     // Takes size as we may be truncating newkey.
00170     void set_key_and_block(Key_ newkey, int truncate_size, uint4 n) {
00171         int i = truncate_size;
00172         // Read the length now because we may be copying the key over itself.
00173         // FIXME that's stupid!  sort this out
00174         int newkey_len = newkey.length();
00175         int newsize = I2 + K1 + i + C2;
00176         // Item size (4 since tag contains block number)
00177         setint2(p, 0, newsize + 4);
00178         // Key size
00179         setint1(p, I2, newsize - I2);
00180         // Copy the main part of the key, possibly truncating.
00181         memmove(p + I2 + K1, newkey.get_address() + K1, i);
00182         // Copy the count part.
00183         memmove(p + I2 + K1 + i, newkey.get_address() + K1 + newkey_len, C2);
00184         // Set tag contents to block number
00185 //      set_block_given_by(n);
00186         setint4(p, newsize, n);
00187     }
00188 
00192     void set_block_given_by(uint4 n) {
00193         setint4(p, size() - BYTES_PER_BLOCK_NUMBER, n);
00194     }
00195     void set_size(int l) { setint2(p, 0, l); }
00198     void form_null_key(uint4 n) {
00199         setint4(p, I2 + K1, n);
00200         set_key_len(K1);        /* null key */
00201         set_size(I2 + K1 + 4);  /* total length */
00202     }
00203     void form_key(const string & key_) {
00204         string::size_type key_len = key_.length();
00205         if (key_len > FLINT_BTREE_MAX_KEY_LEN) {
00206             // We check term length when a term is added to a document but
00207             // flint doubles zero bytes, so this can still happen for terms
00208             // which contain one or more zero bytes.
00209             string msg("Key too long: length was ");
00210             msg += om_tostring(key_len);
00211             msg += " bytes, maximum length of a key is "
00212                    STRINGIZE(FLINT_BTREE_MAX_KEY_LEN) " bytes";
00213             throw Xapian::InvalidArgumentError(msg);
00214         }
00215 
00216         set_key_len(key_len + K1 + C2);
00217         memmove(p + I2 + K1, key_.data(), key_len);
00218         set_component_of(1);
00219     }
00220     // FIXME passing cd here is icky
00221     void set_tag(int cd, const char *start, int len, bool compressed) {
00222         memmove(p + cd, start, len);
00223         set_size(cd + len);
00224         if (compressed) *p |= 0x80;
00225     }
00226     void fake_root_item() {
00227         set_key_len(K1 + C2);   // null key length
00228         set_size(I2 + K1 + 2 * C2);   // length of the item
00229         set_component_of(1);
00230         set_components_of(1);
00231     }
00232 };
00233 
00234 // Allow for BTREE_CURSOR_LEVELS levels in the B-tree.
00235 // With 10, overflow is practically impossible
00236 // FIXME: but we want it to be completely impossible...
00237 #define BTREE_CURSOR_LEVELS 10
00238 
00259 class XAPIAN_VISIBILITY_DEFAULT FlintTable {
00260     friend class FlintCursor; /* Should probably fix this. */
00261     private:
00263         FlintTable(const FlintTable &);
00264 
00266         FlintTable & operator=(const FlintTable &);
00267 
00269         bool really_empty() const;
00270 
00271     public:
00287         FlintTable(const string & path_, bool readonly_,
00288                    int compress_strategy_ = DONT_COMPRESS, bool lazy = false);
00289 
00295         ~FlintTable();
00296 
00302         void close(bool permanent=false);
00303 
00306         bool exists() const;
00307 
00316         void open();
00317 
00335         bool open(flint_revision_number_t revision_);
00336 
00350         void commit(flint_revision_number_t revision);
00351 
00357         void cancel();
00358 
00372         bool get_exact_entry(const string & key, string & tag) const;
00373 
00385         bool key_exists(const string &key) const;
00386 
00399         bool find_tag(const string &key, string * tag) const;
00400 
00409         bool read_tag(Cursor_ * C_, string *tag, bool keep_compressed) const;
00410 
00432         bool add(const string &key, string tag, bool already_compressed = false);
00433 
00451         bool del(const string &key);
00452 
00454         void erase();
00455 
00460         void set_block_size(unsigned int block_size_);
00461 
00464         unsigned int get_block_size() const { return block_size; }
00465 
00489         void create_and_open(unsigned int blocksize);
00490 
00491         void set_full_compaction(bool parity);
00492 
00503         flint_revision_number_t get_latest_revision_number() const {
00504             return latest_revision_number;
00505         }
00506 
00515         flint_revision_number_t get_open_revision_number() const {
00516             return revision_number;
00517         }
00518 
00528         flint_tablesize_t get_entry_count() const {
00529             return item_count;
00530         }
00531 
00533         bool empty() const {
00534             // Prior to 1.1.4/1.0.18, item_count was stored in 32 bits, so we
00535             // can't trust it as there could be more than 1<<32 entries.
00536             //
00537             // In theory it should wrap, so if non-zero the table isn't empty,
00538             // but the table this was first noticed in wasn't off by a multiple
00539             // of 1<<32.
00540 
00541             // An empty table will always have level == 0, and most non-empty
00542             // tables will have more levels, so use that as a short-cut.
00543             return (level == 0) && really_empty();
00544         }
00545 
00551         FlintCursor * cursor_get() const;
00552 
00558         bool is_modified() const { return Btree_modified; }
00559 
00565         void set_max_item_size(size_t block_capacity) {
00566             if (block_capacity > 4) block_capacity = 4;
00567             max_item_size = (block_size - 11 /*DIR_START*/ - block_capacity * D2)
00568                 / block_capacity;
00569         }
00570 
00571     protected:
00572 
00577         bool do_open_to_read(bool revision_supplied, flint_revision_number_t revision_);
00578 
00583         bool do_open_to_write(bool revision_supplied,
00584                               flint_revision_number_t revision_,
00585                               bool create_db = false);
00586         bool basic_open(bool revision_supplied, flint_revision_number_t revision);
00587 
00588         bool find(Cursor_ *) const;
00589         int delete_kt();
00590         void read_block(uint4 n, byte *p) const;
00591         void write_block(uint4 n, const byte *p) const;
00592         XAPIAN_NORETURN(void set_overwritten() const);
00593         void block_to_cursor(Cursor_ *C_, int j, uint4 n) const;
00594         void alter();
00595         void compact(byte *p);
00596         void enter_key(int j, Key_ prevkey, Key_ newkey);
00597         int mid_point(byte *p);
00598         void add_item_to_block(byte *p, Item_wr_ kt, int c);
00599         void add_item(Item_wr_ kt, int j);
00600         void delete_item(int j, bool repeatedly);
00601         int add_kt(bool found);
00602         void read_root();
00603         void split_root(uint4 split_n);
00604         void form_key(const string & key) const;
00605 
00606         char other_base_letter() const {
00607            return (base_letter == 'A') ? 'B' : 'A';
00608         }
00609 
00611         void lazy_alloc_deflate_zstream() const;
00612 
00614         void lazy_alloc_inflate_zstream() const;
00615 
00617         flint_revision_number_t revision_number;
00618 
00620         uint4 item_count;
00621 
00623         unsigned int block_size;
00624 
00628         mutable flint_revision_number_t latest_revision_number;
00629 
00634         mutable bool both_bases;
00635 
00637         char base_letter;
00638 
00643         bool faked_root_block;
00644 
00648         bool sequential;
00649 
00651         int handle;
00652 
00654         int level;
00655 
00657         uint4 root;
00658 
00660         mutable Item_wr_ kt;
00661 
00663         byte * buffer;
00664 
00666         FlintTable_base base;
00667 
00669         string name;
00670 
00674         int seq_count;
00675 
00677         uint4 changed_n;
00678 
00681         int changed_c;
00682 
00684         size_t max_item_size;
00685 
00687         mutable bool Btree_modified;
00688 
00690         bool full_compaction;
00691 
00693         bool writable;
00694 
00696         mutable bool cursor_created_since_last_modification;
00697 
00699         unsigned long cursor_version;
00700 
00701         /* B-tree navigation functions */
00702         bool prev(Cursor_ *C_, int j) const {
00703             if (sequential) return prev_for_sequential(C_, j);
00704             return prev_default(C_, j);
00705         }
00706 
00707         bool next(Cursor_ *C_, int j) const {
00708             if (sequential) return next_for_sequential(C_, j);
00709             return next_default(C_, j);
00710         }
00711 
00712         /* Default implementations. */
00713         bool prev_default(Cursor_ *C_, int j) const;
00714         bool next_default(Cursor_ *C_, int j) const;
00715 
00716         /* Implementations for sequential mode. */
00717         bool prev_for_sequential(Cursor_ *C_, int dummy) const;
00718         bool next_for_sequential(Cursor_ *C_, int dummy) const;
00719 
00720         static int find_in_block(const byte * p, Key_ key, bool leaf, int c);
00721 
00725         static uint4 block_given_by(const byte * p, int c);
00726 
00727         mutable Cursor_ C[BTREE_CURSOR_LEVELS];
00728 
00734         byte * split_p;
00735 
00738         int compress_strategy;
00739 
00741         mutable z_stream *deflate_zstream;
00742 
00744         mutable z_stream *inflate_zstream;
00745 
00747         bool lazy;
00748 
00749         /* Debugging methods */
00750 //      void report_block_full(int m, int n, const byte * p);
00751 };
00752 
00753 #endif /* OM_HGUARD_FLINT_TABLE_H */

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.