backends/quartz/btree.cc

Go to the documentation of this file.
00001 /* btree.cc: Btree implementation
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2005,2006,2007,2008,2010 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <xapian/error.h>
00026 
00027 #include "safeerrno.h"
00028 #include "safefcntl.h"
00029 #ifdef __WIN32__
00030 # include "msvc_posix_wrapper.h"
00031 #endif
00032 
00033 #include "stringutils.h" // For STRINGIZE().
00034 
00035 // Define to use "dangerous" mode - in this mode we write modified btree
00036 // blocks back in place.  This is somewhat faster (especially once we're
00037 // I/O bound) but the database can't be safely searched during indexing
00038 // and if indexing is terminated uncleanly, the database may be corrupt.
00039 //
00040 // Despite the risks, it's useful for speeding up a full rebuild.
00041 //
00042 // FIXME: make this mode run-time selectable, and record that it is currently
00043 // in use somewhere on disk, so readers can check and refuse to open the
00044 // database.
00045 //
00046 // #define DANGEROUS
00047 
00048 #include <sys/types.h>
00049 
00050 // Trying to include the correct headers with the correct defines set to
00051 // get pread() and pwrite() prototyped on every platform without breaking any
00052 // other platform is a real can of worms.  So instead we probe for what
00053 // prototypes (if any) are required in configure and put them into
00054 // PREAD_PROTOTYPE and PWRITE_PROTOTYPE.
00055 #if defined HAVE_PREAD && defined PREAD_PROTOTYPE
00056 PREAD_PROTOTYPE
00057 #endif
00058 #if defined HAVE_PWRITE && defined PWRITE_PROTOTYPE
00059 PWRITE_PROTOTYPE
00060 #endif
00061 
00062 #include "safeunistd.h"
00063 
00064 #include <stdio.h>
00065 #include <string.h>   /* for memmove */
00066 #include <limits.h>   /* for CHAR_BIT */
00067 
00068 #include "btree.h"
00069 #include "btree_util.h"
00070 #include "btree_base.h"
00071 #include "bcursor.h"
00072 #include "quartz_utils.h"
00073 
00074 #include "omassert.h"
00075 #include "omdebug.h"
00076 #include <xapian/error.h>
00077 #include "utils.h"
00078 
00079 #include <algorithm>  // for std::min()
00080 #include <string>
00081 #include <vector>
00082 
00083 using std::min;
00084 using std::string;
00085 using std::vector;
00086 
00087 //#define BTREE_DEBUG_FULL 1
00088 #undef BTREE_DEBUG_FULL
00089 
00090 #ifdef BTREE_DEBUG_FULL
00091 /*------debugging aids from here--------*/
00092 
00093 static void print_key(const byte * p, int c, int j);
00094 static void print_tag(const byte * p, int c, int j);
00095 
00096 /*
00097 static void report_cursor(int N, Btree * B, Cursor * C)
00098 {
00099     int i;
00100     printf("%d)\n", N);
00101     for (i = 0; i <= B->level; i++)
00102         printf("p=%d, c=%d, n=[%d], rewrite=%d\n",
00103                 C[i].p, C[i].c, C[i].n, C[i].rewrite);
00104 }
00105 */
00106 
00107 /*------to here--------*/
00108 #endif /* BTREE_DEBUG_FULL */
00109 
00110 /* Input/output is defined with calls to the basic Unix system interface: */
00111 
00112 int sys_open_to_read_no_except(const string & name)
00113 {
00114 #ifdef __WIN32__
00115     int fd = msvc_posix_open(name.c_str(), O_RDONLY | O_BINARY);
00116 #else
00117     int fd = open(name.c_str(), O_RDONLY | O_BINARY);
00118 #endif
00119     return fd;
00120 }
00121 
00122 int sys_open_to_read(const string & name)
00123 {
00124     int fd = sys_open_to_read_no_except(name);
00125     if (fd < 0) {
00126         string message = string("Couldn't open ")
00127                 + name + " to read: " + strerror(errno);
00128         throw Xapian::DatabaseOpeningError(message);
00129     }
00130     return fd;
00131 }
00132 
00133 static int sys_open_to_write_no_except(const string & name)
00134 {
00135 #ifdef __WIN32__
00136     int fd = msvc_posix_open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY);
00137 #else
00138     int fd = open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
00139 #endif
00140     return fd;
00141 }
00142 
00143 int sys_open_to_write(const string & name)
00144 {
00145     int fd = sys_open_to_write_no_except(name);
00146     if (fd < 0) {
00147         string message = string("Couldn't open ")
00148                 + name + " to write: " + strerror(errno);
00149         throw Xapian::DatabaseOpeningError(message);
00150     }
00151     return fd;
00152 }
00153 
00154 static int sys_open_for_readwrite(const string & name)
00155 {
00156     int fd = open(name.c_str(), O_RDWR | O_BINARY);
00157     if (fd < 0) {
00158         string message = string("Couldn't open ")
00159                 + name + " read/write: " + strerror(errno);
00160         throw Xapian::DatabaseOpeningError(message);
00161     }
00162     return fd;
00163 }
00164 
00165 static void sys_write_bytes(int h, int n, const char * p)
00166 {
00167     while (true) {
00168         ssize_t bytes_written = write(h, p, n);
00169         if (bytes_written == n) {
00170             // normal case - write succeeded, so return.
00171             return;
00172         } else if (bytes_written == -1) {
00173             if (errno == EINTR) continue;
00174             string message = "Error writing block: ";
00175             message += strerror(errno);
00176             throw Xapian::DatabaseError(message);
00177         } else if (bytes_written == 0) {
00178             string message = "Error writing block: wrote no data";
00179             throw Xapian::DatabaseError(message);
00180         } else if (bytes_written < n) {
00181             /* Wrote part of the block, which is not an error.  We should
00182              * continue writing the rest of the block.
00183              */
00184             n -= bytes_written;
00185             p += bytes_written;
00186         }
00187     }
00188 }
00189 
00190 string sys_read_all_bytes(int h, size_t bytes_to_read)
00191 {
00192     ssize_t bytes_read;
00193     string retval;
00194     while (true) {
00195         char buf[1024];
00196         bytes_read = read(h, buf, min(sizeof(buf), bytes_to_read));
00197         if (bytes_read > 0) {
00198             // add byte to string, continue unless we reached max
00199             retval.append(buf, bytes_read);
00200             bytes_to_read -= bytes_read;
00201             if (bytes_to_read == 0) {
00202                 break;
00203             }
00204         } else if (bytes_read == 0) {
00205             // end of file, we're finished
00206             break;
00207         } else if (bytes_read == -1) {
00208             if (errno == EINTR) continue;
00209             string message = "Error reading all bytes: ";
00210             message += strerror(errno);
00211             throw Xapian::DatabaseError(message);
00212         }
00213     }
00214     return retval;
00215 }
00216 
00217 void
00218 sys_write_string(int h, const string &s)
00219 {
00220     sys_write_bytes(h, s.length(), s.data());
00221 }
00222 
00223 int sys_flush(int h) {
00224 #if defined HAVE_FDATASYNC
00225     return (fdatasync(h) != -1);
00226 #elif defined HAVE_FSYNC
00227     return (fsync(h) != -1);
00228 #elif defined __WIN32__
00229     return (_commit(h) != -1);
00230 #else
00231 #error "Have neither fsync() nor fdatasync() nor _commit() - cannot sync."
00232 #endif
00233 }
00234 
00235 static void sys_unlink(const string &filename)
00236 {
00237 #ifdef __WIN32__
00238     if (msvc_posix_unlink(filename.c_str()) == -1) {
00239 #else
00240     if (unlink(filename) == -1) {
00241 #endif
00242         string message = "Failed to unlink ";
00243         message += filename;
00244         message += ": ";
00245         message += strerror(errno);
00246         throw Xapian::DatabaseCorruptError(message);
00247     }
00248 }
00249 
00250 /* There are two bit maps in bit_map0 and bit_map. The nth bit of bitmap is 0
00251    if the nth block is free, otherwise 1. bit_map0 is the initial state of
00252    bitmap at the start of the current transaction.
00253 
00254    Note use of the limits.h values:
00255    UCHAR_MAX = 255, an unsigned with all bits set, and
00256    CHAR_BIT = 8, the number of bits per byte
00257 
00258    BYTE_PAIR_RANGE below is the smallest +ve number that can't be held in two
00259    bytes -- 64K effectively.
00260 */
00261 
00262 #define BYTE_PAIR_RANGE (1 << 2 * CHAR_BIT)
00263 
00265 void
00266 Btree::read_block(uint4 n, byte * p) const
00267 {
00268     // Log the value of p, not the contents of the block it points to...
00269     DEBUGCALL(DB, void, "Btree::read_block", n << ", " << (void*)p);
00270     /* Use the base bit_map_size not the bitmap's size, because
00271      * the latter is uninitialised in readonly mode.
00272      */
00273     Assert(n / CHAR_BIT < base.get_bit_map_size());
00274 
00275 #ifdef HAVE_PREAD
00276     off_t offset = off_t(block_size) * n;
00277     int m = block_size;
00278     while (true) {
00279         ssize_t bytes_read = pread(handle, reinterpret_cast<char *>(p), m,
00280                                    offset);
00281         // normal case - read succeeded, so return.
00282         if (bytes_read == m) return;
00283         if (bytes_read == -1) {
00284             if (errno == EINTR) continue;
00285             string message = "Error reading block " + om_tostring(n) + ": ";
00286             message += strerror(errno);
00287             throw Xapian::DatabaseError(message);
00288         } else if (bytes_read == 0) {
00289             string message = "Error reading block " + om_tostring(n) + ": got end of file";
00290             throw Xapian::DatabaseError(message);
00291         } else if (bytes_read < m) {
00292             /* Read part of the block, which is not an error.  We should
00293              * continue reading the rest of the block.
00294              */
00295             m -= bytes_read;
00296             p += bytes_read;
00297             offset += bytes_read;
00298         }
00299     }
00300 #else
00301     if (lseek(handle, off_t(block_size) * n, SEEK_SET) == -1) {
00302         string message = "Error seeking to block: ";
00303         message += strerror(errno);
00304         throw Xapian::DatabaseError(message);
00305     }
00306 
00307     int m = block_size;
00308     while (true) {
00309         ssize_t bytes_read = read(handle, reinterpret_cast<char *>(p), m);
00310         // normal case - read succeeded, so return.
00311         if (bytes_read == m) return;
00312         if (bytes_read == -1) {
00313             if (errno == EINTR) continue;
00314             string message = "Error reading block " + om_tostring(n) + ": ";
00315             message += strerror(errno);
00316             throw Xapian::DatabaseError(message);
00317         } else if (bytes_read == 0) {
00318             string message = "Error reading block " + om_tostring(n) + ": got end of file";
00319             throw Xapian::DatabaseError(message);
00320         } else if (bytes_read < m) {
00321             /* Read part of the block, which is not an error.  We should
00322              * continue reading the rest of the block.
00323              */
00324             m -= bytes_read;
00325             p += bytes_read;
00326         }
00327     }
00328 #endif
00329 }
00330 
00337 void
00338 Btree::write_block(uint4 n, const byte * p) const
00339 {
00340     DEBUGCALL(DB, void, "Btree::write_block", n << ", " << p);
00341     Assert(writable);
00342     /* Check that n is in range. */
00343     Assert(n / CHAR_BIT < base.get_bit_map_size());
00344 
00345     /* don't write to non-free */;
00346     AssertParanoid(base.block_free_at_start(n));
00347 
00348     /* write revision is okay */
00349     AssertEqParanoid(REVISION(p), latest_revision_number + 1);
00350 
00351     if (both_bases) {
00352         // Delete the old base before modifying the database.
00353         sys_unlink(name + "base" + other_base_letter);
00354         both_bases = false;
00355         latest_revision_number = revision_number;
00356     }
00357 
00358 #ifdef HAVE_PWRITE
00359     off_t offset = off_t(block_size) * n;
00360     int m = block_size;
00361     while (true) {
00362         ssize_t bytes_written = pwrite(handle, p, m, offset);
00363         if (bytes_written == m) {
00364             // normal case - write succeeded, so return.
00365             return;
00366         } else if (bytes_written == -1) {
00367             if (errno == EINTR) continue;
00368             string message = "Error writing block: ";
00369             message += strerror(errno);
00370             throw Xapian::DatabaseError(message);
00371         } else if (bytes_written == 0) {
00372             string message = "Error writing block: wrote no data";
00373             throw Xapian::DatabaseError(message);
00374         } else if (bytes_written < m) {
00375             /* Wrote part of the block, which is not an error.  We should
00376              * continue writing the rest of the block.
00377              */
00378             m -= bytes_written;
00379             p += bytes_written;
00380             offset += bytes_written;
00381         }
00382     }
00383 #else
00384     if (lseek(handle, (off_t)block_size * n, SEEK_SET) == -1) {
00385         string message = "Error seeking to block: ";
00386         message += strerror(errno);
00387         throw Xapian::DatabaseError(message);
00388     }
00389 
00390     sys_write_bytes(handle, block_size, (const char *)p);
00391 #endif
00392 }
00393 
00394 
00395 /* A note on cursors:
00396 
00397    Each B-tree level has a corresponding array element C[j] in a
00398    cursor, C. C[0] is the leaf (or data) level, and C[B->level] is the
00399    root block level. Within a level j,
00400 
00401        C[j].p  addresses the block
00402        C[j].c  is the offset into the directory entry in the block
00403        C[j].n  is the number of the block at C[j].p
00404 
00405    A look up in the B-tree causes navigation of the blocks starting
00406    from the root. In each block, p, we find an offset, c, to an item
00407    which gives the number, n, of the block for the next level. This
00408    leads to an array of values p,c,n which are held inside the cursor.
00409 
00410    Any Btree object B has a built-in cursor, at B->C. But other cursors may
00411    be created.  If BC is a created cursor, BC->C is the cursor in the
00412    sense given above, and BC->B is the handle for the B-tree again.
00413 */
00414 
00415 
00416 void
00417 Btree::set_overwritten() const
00418 {
00419     DEBUGCALL(DB, void, "Btree::set_overwritten", "");
00420     // If we're writable, there shouldn't be another writer who could cause
00421     // overwritten to be flagged, so that's a DatabaseCorruptError.
00422     if (writable)
00423         throw Xapian::DatabaseCorruptError("Db block overwritten - are there multiple writers?");
00424     throw Xapian::DatabaseModifiedError("The revision being read has been discarded - you should call Xapian::Database::reopen() and retry the operation");
00425 }
00426 
00427 /* block_to_cursor(C, j, n) puts block n into position C[j] of cursor
00428    C, writing the block currently at C[j] back to disk if necessary.
00429    Note that
00430 
00431        C[j].rewrite
00432 
00433    is true iff C[j].n is different from block n in file DB. If it is
00434    false no rewriting is necessary.
00435 */
00436 
00437 void
00438 Btree::block_to_cursor(Cursor * C_, int j, uint4 n) const
00439 {
00440     DEBUGCALL(DB, void, "Btree::block_to_cursor", (void*)C_ << ", " << j << ", " << n);
00441     if (n == C_[j].n) return;
00442     byte * p = C_[j].p;
00443     Assert(p);
00444 
00445     // FIXME: only needs to be done in write mode
00446     if (C_[j].rewrite) {
00447         Assert(writable);
00448         Assert(C == C_);
00449         write_block(C_[j].n, p);
00450         C_[j].rewrite = false;
00451     }
00452     // Check if the block is in the built-in cursor (potentially in
00453     // modified form).
00454     if (writable && n == C[j].n) {
00455         if (p != C[j].p)
00456             memcpy(p, C[j].p, block_size);
00457     } else {
00458         read_block(n, p);
00459     }
00460 
00461     C_[j].n = n;
00462     if (j < level) {
00463         /* unsigned comparison */
00464         if (REVISION(p) > REVISION(C_[j + 1].p)) {
00465             set_overwritten();
00466             return;
00467         }
00468     }
00469     AssertEq(j, GET_LEVEL(p));
00470 }
00471 
00493 void
00494 Btree::alter()
00495 {
00496     DEBUGCALL(DB, void, "Btree::alter", "");
00497     Assert(writable);
00498 #ifdef DANGEROUS
00499     C[0].rewrite = true;
00500 #else
00501     int j = 0;
00502     byte * p = C[j].p;
00503     while (true) {
00504         if (C[j].rewrite) return; /* all new, so return */
00505         C[j].rewrite = true;
00506 
00507         uint4 n = C[j].n;
00508         if (base.block_free_at_start(n)) {
00509             Assert(REVISION(p) == latest_revision_number + 1);
00510             return;
00511         }
00512         Assert(REVISION(p) < latest_revision_number + 1);
00513         base.free_block(n);
00514         n = base.next_free_block();
00515         C[j].n = n;
00516         SET_REVISION(p, latest_revision_number + 1);
00517 
00518         if (j == level) return;
00519         j++;
00520         p = C[j].p;
00521         Item_wr(p, C[j].c).set_block_given_by(n);
00522     }
00523 #endif
00524 }
00525 
00538 int Btree::find_in_block(const byte * p, Key key, bool leaf, int c)
00539 {
00540     DEBUGCALL_STATIC(DB, int, "Btree::find_in_block", reinterpret_cast<const void*>(p) << ", " << reinterpret_cast<const void*>(key.get_address()) << ", " << leaf << ", " << c);
00541     int i = DIR_START;
00542     if (leaf) i -= D2;
00543     int j = DIR_END(p);
00544 
00545     if (c != -1) {
00546         if (c < j && i < c && Item(p, c).key() <= key)
00547             i = c;
00548         c += D2;
00549         if (c < j && i < c && key < Item(p, c).key())
00550             j = c;
00551     }
00552 
00553     while (j - i > D2) {
00554         int k = i + ((j - i)/(D2 * 2))*D2; /* mid way */
00555         if (key < Item(p, k).key()) j = k; else i = k;
00556     }
00557     RETURN(i);
00558 }
00559 
00567 bool
00568 Btree::find(Cursor * C_) const
00569 {
00570     DEBUGCALL(DB, bool, "Btree::find", (void*)C_);
00571     // Note: the parameter is needed when we're called by BCursor
00572     const byte * p;
00573     int c;
00574     Key key = kt.key();
00575     for (int j = level; j > 0; --j) {
00576         p = C_[j].p;
00577         c = find_in_block(p, key, false, C_[j].c);
00578 #ifdef BTREE_DEBUG_FULL
00579         printf("Block in Btree:find - code position 1");
00580         report_block_full(j, C_[j].n, p);
00581 #endif /* BTREE_DEBUG_FULL */
00582         C_[j].c = c;
00583         block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
00584     }
00585     p = C_[0].p;
00586     c = find_in_block(p, key, true, C_[0].c);
00587 #ifdef BTREE_DEBUG_FULL
00588     printf("Block in Btree:find - code position 2");
00589     report_block_full(0, C_[0].n, p);
00590 #endif /* BTREE_DEBUG_FULL */
00591     C_[0].c = c;
00592     if (c < DIR_START) RETURN(false);
00593     RETURN(Item(p, c).key() == key);
00594 }
00595 
00601 void
00602 Btree::compact(byte * p)
00603 {
00604     DEBUGCALL(DB, void, "Btree::compact", (void*)p);
00605     Assert(writable);
00606     Assert(p);
00607     Assert(buffer);
00608     int e = block_size;
00609     byte * b = buffer;
00610     int dir_end = DIR_END(p);
00611     for (int c = DIR_START; c < dir_end; c += D2) {
00612         Item item(p, c);
00613         int l = item.size();
00614         Assert(e >= dir_end);
00615         e -= l;
00616         memmove(b + e, item.get_address(), l);
00617         SETD(p, c, e);  /* reform in b */
00618     }
00619     memmove(p + e, b + e, block_size - e);  /* copy back */
00620     e -= dir_end;
00621     SET_TOTAL_FREE(p, e);
00622     SET_MAX_FREE(p, e);
00623 }
00624 
00628 void
00629 Btree::split_root(uint4 split_n)
00630 {
00631     DEBUGCALL(DB, void, "Btree::split_root", split_n);
00632     /* gain a level */
00633     ++level;
00634 
00635     /* check level overflow - this isn't something that should ever happen
00636      * but deserves more than an Assert()... */
00637     if (level == BTREE_CURSOR_LEVELS) {
00638         throw Xapian::DatabaseCorruptError("Btree has grown impossibly large ("STRINGIZE(BTREE_CURSOR_LEVELS)" levels)");
00639     }
00640 
00641     byte * q = zeroed_new(block_size);
00642     if (q == 0) {
00643         throw std::bad_alloc();
00644     }
00645     C[level].p = q;
00646     C[level].c = DIR_START;
00647     C[level].n = base.next_free_block();
00648     C[level].rewrite = true;
00649     SET_REVISION(q, latest_revision_number + 1);
00650     SET_LEVEL(q, level);
00651     SET_DIR_END(q, DIR_START);
00652     compact(q);   /* to reset TOTAL_FREE, MAX_FREE */
00653 
00654     /* form a null key in b with a pointer to the old root */
00655     byte b[10]; /* 7 is exact */
00656     Item_wr item(b);
00657     item.form_null_key(split_n);
00658     add_item(item, level);
00659 }
00660 
00676 void
00677 Btree::enter_key(int j, Key prevkey, Key newkey)
00678 {
00679     Assert(writable);
00680     Assert(prevkey < newkey);
00681     Assert(j >= 1);
00682 
00683     uint4 blocknumber = C[j - 1].n;
00684 
00685     // FIXME update to use Key
00686     // Keys are truncated here: but don't truncate the count at the end away.
00687     const int newkey_len = newkey.length();
00688     int i;
00689 
00690     if (j == 1) {
00691         // Truncate the key to the minimal key which differs from prevkey,
00692         // the preceding key in the block.
00693         i = 0;
00694         const int min_len = min(newkey_len, prevkey.length());
00695         while (i < min_len && prevkey[i] == newkey[i]) {
00696             i++;
00697         }
00698 
00699         // Want one byte of difference.
00700         if (i < newkey_len) i++;
00701     } else {
00702         /* Can't truncate between branch levels, since the separated keys
00703          * are in at the leaf level, and truncating again will change the
00704          * branch point.
00705          */
00706         i = newkey_len;
00707     }
00708 
00709     byte b[UCHAR_MAX + 6];
00710     Item_wr item(b);
00711     Assert(i <= 256 - I2 - C2);
00712     Assert(i <= (int)sizeof(b) - I2 - C2 - 4);
00713     item.set_key_and_block(newkey, i, blocknumber);
00714 
00715     // When j > 1 we can make the first key of block p null.  This is probably
00716     // worthwhile as it trades a small amount of CPU and RAM use for a small
00717     // saving in disk use.  Other redundant keys will still creep in though.
00718     if (j > 1) {
00719         byte * p = C[j - 1].p;
00720         uint4 n = get_int4(newkey.get_address(), newkey_len + K1 + C2);
00721         int new_total_free = TOTAL_FREE(p) + newkey_len + C2;
00722         // FIXME: incredibly icky going from key to item like this...
00723         Item_wr(const_cast<byte *>(newkey.get_address()) - I2).form_null_key(n);
00724         SET_TOTAL_FREE(p, new_total_free);
00725     }
00726 
00727     C[j].c = find_in_block(C[j].p, item.key(), false, 0) + D2;
00728     C[j].rewrite = true; /* a subtle point: this *is* required. */
00729     add_item(item, j);
00730 }
00731 
00736 int
00737 Btree::mid_point(byte * p)
00738 {
00739     int n = 0;
00740     int dir_end = DIR_END(p);
00741     int size = block_size - TOTAL_FREE(p) - dir_end;
00742     for (int c = DIR_START; c < dir_end; c += D2) {
00743         int l = Item(p, c).size();
00744         n += 2 * l;
00745         if (n >= size) {
00746             if (l < n - size) return c;
00747             return c + D2;
00748         }
00749     }
00750 
00751     /* falling out of mid_point */
00752     Assert(false);
00753     return 0; /* Stop compiler complaining about end of method. */
00754 }
00755 
00764 void
00765 Btree::add_item_to_block(byte * p, Item_wr kt_, int c)
00766 {
00767     Assert(writable);
00768     int dir_end = DIR_END(p);
00769     int kt_len = kt_.size();
00770     int needed = kt_len + D2;
00771     int new_total = TOTAL_FREE(p) - needed;
00772     int new_max = MAX_FREE(p) - needed;
00773 
00774     Assert(new_total >= 0);
00775 
00776     if (new_max < 0) {
00777         compact(p);
00778         new_max = MAX_FREE(p) - needed;
00779         Assert(new_max >= 0);
00780     }
00781     Assert(dir_end >= c);
00782 
00783     memmove(p + c + D2, p + c, dir_end - c);
00784     dir_end += D2;
00785     SET_DIR_END(p, dir_end);
00786 
00787     int o = dir_end + new_max;
00788     SETD(p, c, o);
00789     memmove(p + o, kt_.get_address(), kt_len);
00790 
00791     SET_MAX_FREE(p, new_max);
00792 
00793     SET_TOTAL_FREE(p, new_total);
00794 }
00795 
00801 void
00802 Btree::add_item(Item_wr kt_, int j)
00803 {
00804     Assert(writable);
00805     byte * p = C[j].p;
00806     int c = C[j].c;
00807     uint4 n;
00808 
00809     int needed = kt_.size() + D2;
00810     if (TOTAL_FREE(p) < needed) {
00811         int m;
00812         // Prepare to split p. After splitting, the block is in two halves, the
00813         // lower half is split_p, the upper half p again. add_to_upper_half
00814         // becomes true when the item gets added to p, false when it gets added
00815         // to split_p.
00816 
00817         if (seq_count < 0) {
00818             // If we're not in sequential mode, we split at the mid point
00819             // of the node.
00820             m = mid_point(p);
00821         } else {
00822             // During sequential addition, split at the insert point
00823             m = c;
00824         }
00825 
00826         uint4 split_n = C[j].n;
00827         C[j].n = base.next_free_block();
00828 
00829         memcpy(split_p, p, block_size);  // replicate the whole block in split_p
00830         SET_DIR_END(split_p, m);
00831         compact(split_p);      /* to reset TOTAL_FREE, MAX_FREE */
00832 
00833         {
00834             int residue = DIR_END(p) - m;
00835             int new_dir_end = DIR_START + residue;
00836             memmove(p + DIR_START, p + m, residue);
00837             SET_DIR_END(p, new_dir_end);
00838         }
00839 
00840         compact(p);      /* to reset TOTAL_FREE, MAX_FREE */
00841 
00842         bool add_to_upper_half;
00843         if (seq_count < 0) {
00844             add_to_upper_half = (c >= m);
00845         } else {
00846             // And add item to lower half if split_p has room, otherwise upper
00847             // half
00848             add_to_upper_half = (TOTAL_FREE(split_p) < needed);
00849         }
00850 
00851         if (add_to_upper_half) {
00852             c -= (m - DIR_START);
00853             Assert(seq_count < 0 || c <= DIR_START + D2);
00854             Assert(c >= DIR_START);
00855             Assert(c <= DIR_END(p));
00856             add_item_to_block(p, kt_, c);
00857             n = C[j].n;
00858         } else {
00859             Assert(c >= DIR_START);
00860             Assert(c <= DIR_END(split_p));
00861             add_item_to_block(split_p, kt_, c);
00862             n = split_n;
00863         }
00864         write_block(split_n, split_p);
00865 
00866         // Check if we're splitting the root block.
00867         if (j == level) split_root(split_n);
00868 
00869         /* Enter a separating key at level j + 1 between */
00870         /* the last key of block split_p, and the first key of block p */
00871         enter_key(j + 1,
00872                   Item(split_p, DIR_END(split_p) - D2).key(),
00873                   Item(p, DIR_START).key());
00874     } else {
00875         add_item_to_block(p, kt_, c);
00876         n = C[j].n;
00877     }
00878     if (j == 0) {
00879         changed_n = n;
00880         changed_c = c;
00881     }
00882 }
00883 
00891 void
00892 Btree::delete_item(int j, bool repeatedly)
00893 {
00894     Assert(writable);
00895     byte * p = C[j].p;
00896     int c = C[j].c;
00897     int kt_len = Item(p, c).size(); /* size of the item to be deleted */
00898     int dir_end = DIR_END(p) - D2;   /* directory length will go down by 2 bytes */
00899 
00900     memmove(p + c, p + c + D2, dir_end - c);
00901     SET_DIR_END(p, dir_end);
00902     SET_MAX_FREE(p, MAX_FREE(p) + D2);
00903     SET_TOTAL_FREE(p, TOTAL_FREE(p) + kt_len + D2);
00904 
00905     if (!repeatedly) return;
00906     if (j < level) {
00907         if (dir_end == DIR_START) {
00908             base.free_block(C[j].n);
00909             C[j].rewrite = false;
00910             C[j].n = BLK_UNUSED;
00911             C[j + 1].rewrite = true;  /* *is* necessary */
00912             delete_item(j + 1, true);
00913         }
00914     } else {
00915         Assert(j == level);
00916         while (dir_end == DIR_START + D2 && level > 0) {
00917             /* single item in the root block, so lose a level */
00918             uint4 new_root = Item(p, DIR_START).block_given_by();
00919             delete [] p;
00920             C[level].p = 0;
00921             base.free_block(C[level].n);
00922             C[level].rewrite = false;
00923             C[level].n = BLK_UNUSED;
00924             level--;
00925 
00926             block_to_cursor(C, level, new_root);
00927 
00928             p = C[level].p;
00929             dir_end = DIR_END(p); /* prepare for the loop */
00930         }
00931     }
00932 }
00933 
00934 /* debugging aid:
00935 static addcount = 0;
00936 */
00937 
00963 int
00964 Btree::add_kt(bool found)
00965 {
00966     Assert(writable);
00967     int components = 0;
00968 
00969     /*
00970     {
00971         printf("%d) %s ", addcount++, (found ? "replacing" : "adding"));
00972         print_bytes(kt[I2] - K1 - C2, kt + I2 + K1); putchar('\n');
00973     }
00974     */
00975     alter();
00976 
00977     if (found) { /* replacement */
00978         seq_count = SEQ_START_POINT;
00979         sequential = false;
00980 
00981         byte * p = C[0].p;
00982         int c = C[0].c;
00983         Item item(p, c);
00984         int kt_size = kt.size();
00985         int needed = kt_size - item.size();
00986 
00987         components = Item(p, c).components_of();
00988 
00989         if (needed <= 0) {
00990             /* simple replacement */
00991             memmove(const_cast<byte *>(item.get_address()),
00992                     kt.get_address(), kt_size);
00993             SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
00994         } else {
00995             /* new item into the block's freespace */
00996             int new_max = MAX_FREE(p) - kt_size;
00997             if (new_max >= 0) {
00998                 int o = DIR_END(p) + new_max;
00999                 memmove(p + o, kt.get_address(), kt_size);
01000                 SETD(p, c, o);
01001                 SET_MAX_FREE(p, new_max);
01002                 SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
01003             } else {
01004                 /* do it the long way */
01005                 delete_item(0, false);
01006                 add_item(kt, 0);
01007             }
01008         }
01009     } else {
01010         /* addition */
01011         if (changed_n == C[0].n && changed_c == C[0].c) {
01012             if (seq_count < 0) seq_count++;
01013         } else {
01014             seq_count = SEQ_START_POINT;
01015             sequential = false;
01016         }
01017         C[0].c += D2;
01018         add_item(kt, 0);
01019     }
01020     return components;
01021 }
01022 
01023 /* delete_kt() corresponds to add_kt(found), but there are only
01024    two cases: if the key is not found nothing is done, and if it is
01025    found the corresponding item is deleted with delete_item.
01026 */
01027 
01028 int
01029 Btree::delete_kt()
01030 {
01031     Assert(writable);
01032 
01033     bool found = find(C);
01034 
01035     int components = 0;
01036     seq_count = SEQ_START_POINT;
01037     sequential = false;
01038 
01039     /*
01040     {
01041         printf("%d) %s ", addcount++, (found ? "deleting " : "ignoring "));
01042         print_bytes(B->kt[I2] - K1 - C2, B->kt + I2 + K1); putchar('\n');
01043     }
01044     */
01045     if (found) {
01046         components = Item(C[0].p, C[0].c).components_of();
01047         alter();
01048         delete_item(0, true);
01049     }
01050     return components;
01051 }
01052 
01053 /* Btree::form_key(key) treats address kt as an item holder and fills in
01054 the key part:
01055 
01056            (I) K key c (C tag)
01057 
01058 The bracketed parts are left blank. The key is filled in with key_len bytes and
01059 K set accordingly. c is set to 1.
01060 */
01061 
01062 void Btree::form_key(const string & key) const
01063 {
01064     kt.form_key(key);
01065 }
01066 
01067 /* Btree::add(key, tag) adds the key/tag item to the
01068    B-tree, replacing any existing item with the same key.
01069 
01070    For a long tag, we end up having to add m components, of the form
01071 
01072        key 1 m tag1
01073        key 2 m tag2
01074        ...
01075        key m m tagm
01076 
01077    and tag1+tag2+...+tagm are equal to tag. These in their turn may be replacing
01078    n components of the form
01079 
01080        key 1 n TAG1
01081        key 2 n TAG2
01082        ...
01083        key n n TAGn
01084 
01085    and n may be greater than, equal to, or less than m. These cases are dealt
01086    with in the code below. If m < n for example, we end up with a series of
01087    deletions.
01088 */
01089 
01090 bool
01091 Btree::add(const string &key, string tag)
01092 {
01093     DEBUGCALL(DB, bool, "Btree::add", key << ", " << tag);
01094     Assert(writable);
01095 
01096     if (key.size() > BTREE_MAX_KEY_LEN) {
01097         throw Xapian::InvalidArgumentError(
01098                 "Key too long: length was " +
01099                 om_tostring(key.size()) +
01100                 " bytes, maximum length of a key is " + 
01101                 STRINGIZE(BTREE_MAX_KEY_LEN) + " bytes");
01102     }
01103 
01104     form_key(key);
01105 
01106     // sort of matching kt.append_chunk(), but setting the chunk
01107     const size_t cd = kt.key().length() + K1 + I2 + C2 + C2;  // offset to the tag data
01108     const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
01109     size_t first_L = L;                  // - amount for tag1
01110     bool found = find(C);
01111     if (!found) {
01112         byte * p = C[0].p;
01113         size_t n = TOTAL_FREE(p) % (max_item_size + D2);
01114         if (n > D2 + cd) {
01115             n -= (D2 + cd);
01116             // if n >= last then fully filling this block won't produce
01117             // an extra item, so we might as well do this even if
01118             // full_compaction isn't active.
01119             //
01120             // In the full_compaction case, it turns out we shouldn't always
01121             // try to fill every last byte.  Doing so can actually increase the
01122             // total space required (I believe this effect is due to longer
01123             // dividing keys being required in the index blocks).  Empirically,
01124             // n >= key.size() + K appears a good criterion for K ~= 34.  This
01125             // seems to save about 0.2% in total database size over always
01126             // splitting the tag.  It'll also give be slightly faster retrieval
01127             // as we can avoid reading an extra block occasionally.
01128             size_t last = tag.length() % L;
01129             if (n >= last || (full_compaction && n >= key.size() + 34))
01130                 first_L = n;
01131         }
01132     }
01133 
01134     // a null tag must be added in of course
01135     int m = tag.empty() ? 1 : (tag.length() - first_L + L - 1) / L + 1;
01136                                       // there are m items to add
01137     /* FIXME: sort out this error higher up and turn this into
01138      * an assert.
01139      */
01140     if (m >= BYTE_PAIR_RANGE) RETURN(false);
01141 
01142     int n = 0; // initialise to shut off warning
01143                                       // - and there will be n to delete
01144     int o = 0;                        // Offset into the tag
01145     size_t residue = tag.length();    // Bytes of the tag remaining to add in
01146     int replacement = false;          // Has there been a replacement ?
01147     int i;
01148     kt.set_components_of(m);
01149     for (i = 1; i <= m; i++) {
01150         size_t l = (i == m ? residue : (i == 1 ? first_L : L));
01151         Assert(cd + l <= block_size);
01152         Assert(string::size_type(o + l) <= tag.length());
01153         kt.set_tag(cd, tag.data() + o, l);
01154         kt.set_component_of(i);
01155         
01156         o += l;
01157         residue -= l;
01158 
01159         if (i > 1) found = find(C);
01160         n = add_kt(found);
01161         if (n > 0) replacement = true;
01162     }
01163     /* o == tag.length() here, and n may be zero */
01164     for (i = m + 1; i <= n; i++) {
01165         kt.set_component_of(i);
01166         delete_kt();
01167     }
01168     if (!replacement) ++item_count;
01169     Btree_modified = true;
01170     if (cursor_created_since_last_modification) {
01171         cursor_created_since_last_modification = false;
01172         ++cursor_version;
01173     }
01174     RETURN(true);
01175 }
01176 
01177 /* Btree::del(key) returns false if the key is not in the B-tree,
01178    otherwise deletes it and returns true.
01179 
01180    Again, this is parallel to Btree::add, but simpler in form.
01181 */
01182 
01183 bool
01184 Btree::del(const string &key)
01185 {
01186     DEBUGCALL(DB, bool, "Btree::del", key);
01187     Assert(writable);
01188 
01189     // We can't delete a key which we is too long for us to store.
01190     if (key.size() > BTREE_MAX_KEY_LEN) RETURN(false);
01191 
01192     if (key.empty()) RETURN(false);
01193     form_key(key);
01194 
01195     int n = delete_kt();  /* there are n items to delete */
01196     if (n <= 0) RETURN(false);
01197 
01198     for (int i = 2; i <= n; i++) {
01199         kt.set_component_of(i);
01200         delete_kt();
01201     }
01202 
01203     item_count--;
01204     Btree_modified = true;
01205     if (cursor_created_since_last_modification) {
01206         cursor_created_since_last_modification = false;
01207         ++cursor_version;
01208     }
01209     RETURN(true);
01210 }
01211 
01212 bool
01213 Btree::get_exact_entry(const string &key, string & tag) const
01214 {
01215     DEBUGCALL(DB, bool, "Btree::get_exact_entry", key << ", " << tag);
01216     Assert(!key.empty());
01217 
01218     // An oversized key can't exist, so attempting to search for it should fail.
01219     if (key.size() > BTREE_MAX_KEY_LEN) RETURN(false);
01220 
01221     RETURN(find_tag(key, &tag));
01222 }
01223 
01224 bool
01225 Btree::find_tag(const string &key, string * tag) const
01226 {
01227     DEBUGCALL(DB, bool, "Btree::find_tag", key << ", &tag");
01228     form_key(key);
01229     if (!find(C)) RETURN(false);
01230 
01231     read_tag(C, tag);
01232     RETURN(true);
01233 }
01234 
01235 void
01236 Btree::read_tag(Cursor * C_, string *tag) const
01237 {
01238     Item item(C_[0].p, C_[0].c);
01239 
01240     /* n components to join */
01241     int n = item.components_of();
01242 
01243     tag->resize(0);
01244     // max_item_size also includes K1 + I2 + C2 + C2 bytes overhead and the key
01245     // (which is at least 1 byte long).
01246     if (n > 1) tag->reserve((max_item_size - (1 + K1 + I2 + C2 + C2)) * n);
01247 
01248     item.append_chunk(tag);
01249 
01250     for (int i = 2; i <= n; i++) {
01251         if (!next(C_, 0)) {
01252             throw Xapian::DatabaseCorruptError("Unexpected end of table when reading continuation of tag");
01253         }
01254         (void)Item(C_[0].p, C_[0].c).append_chunk(tag);
01255     }
01256     // At this point the cursor is on the last item - calling next will move
01257     // it to the next key (Bcursor::get_tag() relies on this).
01258 }
01259 
01260 void
01261 Btree::set_full_compaction(bool parity)
01262 {
01263     Assert(writable);
01264 
01265     if (parity) seq_count = 0;
01266     full_compaction = parity;
01267 }
01268 
01269 Bcursor * Btree::cursor_get() const {
01270     // FIXME Ick - casting away const is nasty
01271     return new Bcursor(const_cast<Btree *>(this));
01272 }
01273 
01274 /************ B-tree opening and closing ************/
01275 
01276 bool
01277 Btree::basic_open(bool revision_supplied, quartz_revision_number_t revision_)
01278 {
01279     int ch = 'X'; /* will be 'A' or 'B' */
01280 
01281     {
01282         const size_t BTREE_BASES = 2;
01283         string err_msg;
01284         static const char basenames[BTREE_BASES] = { 'A', 'B' };
01285 
01286         Btree_base bases[BTREE_BASES];
01287         bool base_ok[BTREE_BASES];
01288 
01289         both_bases = true;
01290         bool valid_base = false;
01291         {
01292             for (size_t i = 0; i < BTREE_BASES; ++i) {
01293                 bool ok = bases[i].read(name, basenames[i], err_msg);
01294                 base_ok[i] = ok;
01295                 if (ok) {
01296                     valid_base = true;
01297                 } else {
01298                     both_bases = false;
01299                 }
01300             }
01301         }
01302 
01303         if (!valid_base) {
01304             string message = "Error opening table `";
01305             message += name;
01306             message += "':\n";
01307             message += err_msg;
01308             throw Xapian::DatabaseOpeningError(message);
01309         }
01310 
01311         if (revision_supplied) {
01312             bool found_revision = false;
01313             for (size_t i = 0; i < BTREE_BASES; ++i) {
01314                 if (base_ok[i] && bases[i].get_revision() == revision_) {
01315                     ch = basenames[i];
01316                     found_revision = true;
01317                     break;
01318                 }
01319             }
01320             if (!found_revision) {
01321                 /* Couldn't open the revision that was asked for.
01322                  * This shouldn't throw an exception, but should just return
01323                  * false to upper levels.
01324                  */
01325                 return false;
01326             }
01327         } else {
01328             quartz_revision_number_t highest_revision = 0;
01329             for (size_t i = 0; i < BTREE_BASES; ++i) {
01330                 if (base_ok[i] && bases[i].get_revision() >= highest_revision) {
01331                     ch = basenames[i];
01332                     highest_revision = bases[i].get_revision();
01333                 }
01334             }
01335         }
01336 
01337         Btree_base *basep = 0;
01338         Btree_base *other_base = 0;
01339 
01340         for (size_t i = 0; i < BTREE_BASES; ++i) {
01341             DEBUGLINE(UNKNOWN, "Checking (ch == " << ch << ") against "
01342                       "basenames[" << i << "] == " << basenames[i]);
01343             DEBUGLINE(UNKNOWN, "bases[" << i << "].get_revision() == " <<
01344                       bases[i].get_revision());
01345             DEBUGLINE(UNKNOWN, "base_ok[" << i << "] == " << base_ok[i]);
01346             if (ch == basenames[i]) {
01347                 basep = &bases[i];
01348 
01349                 // FIXME: assuming only two bases for other_base
01350                 size_t otherbase_num = 1-i;
01351                 if (base_ok[otherbase_num]) {
01352                     other_base = &bases[otherbase_num];
01353                 }
01354                 break;
01355             }
01356         }
01357         Assert(basep);
01358 
01359         /* basep now points to the most recent base block */
01360 
01361         /* Avoid copying the bitmap etc. - swap contents with the base
01362          * object in the vector, since it'll be destroyed anyway soon.
01363          */
01364         base.swap(*basep);
01365 
01366         revision_number =  base.get_revision();
01367         block_size =       base.get_block_size();
01368         root =             base.get_root();
01369         level =            base.get_level();
01370         //bit_map_size =     basep->get_bit_map_size();
01371         item_count =       base.get_item_count();
01372         faked_root_block = base.get_have_fakeroot();
01373         sequential =       base.get_sequential();
01374 
01375         if (other_base != 0) {
01376             latest_revision_number = other_base->get_revision();
01377             if (revision_number > latest_revision_number)
01378                 latest_revision_number = revision_number;
01379         } else {
01380             latest_revision_number = revision_number;
01381         }
01382     }
01383 
01384     /* kt holds constructed items as well as keys */
01385     kt = Item_wr(zeroed_new(block_size));
01386     if (kt.get_address() == 0) {
01387         throw std::bad_alloc();
01388     }
01389 
01390     set_max_item_size(BLOCK_CAPACITY);
01391 
01392     base_letter = ch;
01393 
01394     /* ready to open the main file */
01395 
01396     return true;
01397 }
01398 
01399 void
01400 Btree::read_root()
01401 {
01402     if (faked_root_block) {
01403         /* root block for an unmodified database. */
01404         byte * p = C[0].p;
01405         Assert(p);
01406 
01407         /* clear block - shouldn't be neccessary, but is a bit nicer,
01408          * and means that the same operations should always produce
01409          * the same database. */
01410         memset(p, 0, block_size);
01411 
01412         int o = block_size - I2 - K1 - C2 - C2;
01413         Item_wr(p + o).fake_root_item();
01414 
01415         SETD(p, DIR_START, o);         // its directory entry
01416         SET_DIR_END(p, DIR_START + D2);// the directory size
01417 
01418         o -= (DIR_START + D2);
01419         SET_MAX_FREE(p, o);
01420         SET_TOTAL_FREE(p, o);
01421         SET_LEVEL(p, 0);
01422 
01423         if (!writable) {
01424             /* reading - revision number doesn't matter as long as
01425              * it's not greater than the current one. */
01426             SET_REVISION(p, 0);
01427             C[0].n = 0;
01428         } else {
01429             /* writing - */
01430             SET_REVISION(p, latest_revision_number + 1);
01431             C[0].n = base.next_free_block();
01432         }
01433     } else {
01434         /* using a root block stored on disk */
01435         block_to_cursor(C, level, root);
01436 
01437         if (REVISION(C[level].p) > revision_number) set_overwritten();
01438         /* although this is unlikely */
01439     }
01440 }
01441 
01442 bool
01443 Btree::do_open_to_write(bool revision_supplied, quartz_revision_number_t revision_)
01444 {
01445     /* FIXME: do the exception safety the right way, by making all the
01446      * parts into sensible objects.
01447      */
01448     if (!basic_open(revision_supplied, revision_)) {
01449         if (!revision_supplied) {
01450             throw Xapian::DatabaseOpeningError("Failed to open for writing");
01451         }
01452         /* When the revision is supplied, it's not an exceptional
01453          * case when open failed, so we just return false here.
01454          */
01455         return false;
01456     }
01457 
01458     writable = true;
01459 
01460     handle = sys_open_for_readwrite(name + "DB");
01461 
01462     prev_ptr = &Btree::prev_default;
01463     next_ptr = &Btree::next_default;
01464 
01465     for (int j = 0; j <= level; j++) {
01466         C[j].n = BLK_UNUSED;
01467         C[j].p = new byte[block_size];
01468         if (C[j].p == 0) {
01469             throw std::bad_alloc();
01470         }
01471     }
01472     split_p = new byte[block_size];
01473     if (split_p == 0) {
01474         throw std::bad_alloc();
01475     }
01476     read_root();
01477 
01478     buffer = zeroed_new(block_size);
01479     if (buffer == 0) {
01480         throw std::bad_alloc();
01481     }
01482 
01483     // swap for writing
01484     other_base_letter = base_letter == 'A' ? 'B' : 'A';
01485 
01486     changed_n = 0;
01487     changed_c = DIR_START;
01488     seq_count = SEQ_START_POINT;
01489 
01490     return true;
01491 }
01492 
01493 Btree::Btree(string path_, bool readonly_)
01494         : revision_number(0),
01495           item_count(0),
01496           block_size(0),
01497           latest_revision_number(0),
01498           both_bases(false),
01499           base_letter('A'),
01500           faked_root_block(true),
01501           sequential(true),
01502           handle(-1),
01503           level(0),
01504           root(0),
01505           kt(0),
01506           buffer(0),
01507           base(),
01508           other_base_letter(0),
01509           name(path_),
01510           seq_count(0),
01511           changed_n(0),
01512           changed_c(0),
01513           max_item_size(0),
01514           Btree_modified(false),
01515           full_compaction(false),
01516           writable(!readonly_),
01517           cursor_created_since_last_modification(false),
01518           cursor_version(0),
01519           dont_close_handle(false),
01520           split_p(0)
01521 {
01522     DEBUGCALL(DB, void, "Btree::Btree", path_ << ", " << readonly_);
01523 }
01524 
01525 bool
01526 Btree::exists() const {
01527     DEBUGCALL(DB, bool, "Btree::exists", "");
01528     return (file_exists(name + "DB") &&
01529             (file_exists(name + "baseA") || file_exists(name + "baseB")));
01530 }
01531 
01535 void
01536 sys_unlink_if_exists(const string & filename)
01537 {
01538     if (unlink(filename) == -1) {
01539         if (errno == ENOENT) return;
01540         throw Xapian::DatabaseError("Can't delete file: `" + filename +
01541                               "': " + strerror(errno));
01542     }
01543 }
01544 
01545 void
01546 Btree::create(unsigned int block_size_)
01547 {
01548     DEBUGCALL(DB, void, "Btree::create", block_size_);
01549     close();
01550 
01551     // Block size must in the range 2048..BYTE_PAIR_RANGE, and a power of two.
01552     if (block_size_ < 2048 || block_size_ > BYTE_PAIR_RANGE ||
01553         (block_size_ & (block_size_ - 1)) != 0) {
01554         block_size_ = 8192;
01555     }
01556 
01557     // FIXME: it would be good to arrange that this works such that there's
01558     // always a valid table in place...
01559 
01560     /* write initial values to files */
01561 
01562     /* create the base file */
01563     Btree_base base_;
01564     base_.set_block_size(block_size_);
01565     base_.set_have_fakeroot(true);
01566     base_.set_sequential(true);
01567     base_.write_to_file(name + "baseA");
01568 
01569     /* remove the alternative base file, if any */
01570     sys_unlink_if_exists(name + "baseB");
01571 
01572     /* create the main file */
01573     int h = sys_open_to_write(name + "DB");
01574     if (h == -1 || !sys_close(h)) {
01575         string message = "Error creating DB file: ";
01576         message += strerror(errno);
01577         throw Xapian::DatabaseOpeningError(message);
01578     }
01579 }
01580 
01581 Btree::~Btree() {
01582     DEBUGCALL(DB, void, "Btree::~Btree", "");
01583     Btree::close();
01584 }
01585 
01586 void Btree::close() {
01587     DEBUGCALL(DB, void, "Btree::close", "");
01588 
01589     if (handle != -1) {
01590         // If an error occurs here, we just ignore it, since we're just
01591         // trying to free everything.
01592         if (!dont_close_handle) (void)sys_close(handle);
01593         handle = -1;
01594     }
01595 
01596     for (int j = level; j >= 0; j--) {
01597         delete [] C[j].p;
01598         C[j].p = 0;
01599     }
01600     delete [] split_p;
01601     split_p = 0;
01602 
01603     delete [] kt.get_address();
01604     kt = 0;
01605     delete [] buffer;
01606     buffer = 0;
01607 }
01608 
01609 void
01610 Btree::commit(quartz_revision_number_t revision)
01611 {
01612     DEBUGCALL(DB, void, "Btree::commit", revision);
01613     Assert(writable);
01614 
01615     if (revision <= revision_number) {
01616         throw Xapian::DatabaseError("New revision too low");
01617     }
01618 
01619     // FIXME: this doesn't work (probably because the table revisions get
01620     // out of step) but it's wasteful to keep applying changes to value
01621     // and position if they're never used...
01622     //
01623     // if (!Btree_modified) return;
01624 
01625     for (int j = level; j >= 0; j--) {
01626         if (C[j].rewrite) {
01627             write_block(C[j].n, C[j].p);
01628         }
01629     }
01630 
01631     if (!sys_flush(handle)) {
01632         if (!dont_close_handle) (void)sys_close(handle);
01633         handle = -1;
01634         throw Xapian::DatabaseError("Can't commit new revision - failed to close DB");
01635     }
01636 
01637     if (Btree_modified) {
01638         faked_root_block = false;
01639     }
01640 
01641     if (faked_root_block) {
01642         /* We will use a dummy bitmap. */
01643         base.clear_bit_map();
01644     }
01645 
01646     base.set_revision(revision);
01647     base.set_root(C[level].n);
01648     base.set_level(level);
01649     base.set_item_count(item_count);
01650     base.set_have_fakeroot(faked_root_block);
01651     base.set_sequential(sequential);
01652 
01653     {
01654         int tmp = base_letter;
01655         base_letter = other_base_letter;
01656         other_base_letter = tmp;
01657     }
01658     both_bases = true;
01659     latest_revision_number = revision_number = revision;
01660     root = C[level].n;
01661 
01662     Btree_modified = false;
01663 
01664     for (int i = 0; i < BTREE_CURSOR_LEVELS; ++i) {
01665         C[i].n = BLK_UNUSED;
01666         C[i].c = -1;
01667         C[i].rewrite = false;
01668     }
01669 
01670     base.write_to_file(name + "base" + char(base_letter));
01671     // Save to "<table>.tmp" and then rename to "<table>.base<letter>" so that
01672     // a reader can't try to read a partially written base file.
01673     string tmp = name;
01674     tmp += "tmp";
01675     string basefile = name;
01676     basefile += "base";
01677     basefile += char(base_letter);
01678     base.write_to_file(tmp);
01679 #if defined __WIN32__
01680     if (msvc_posix_rename(tmp.c_str(), basefile.c_str()) < 0) {
01681 #else
01682     if (rename(tmp.c_str(), basefile.c_str()) < 0) {
01683 #endif
01684         // With NFS, rename() failing may just mean that the server crashed
01685         // after successfully renaming, but before reporting this, and then
01686         // the retried operation fails.  So we need to check if the source
01687         // file still exists, which we do by calling unlink(), since we want
01688         // to remove the temporary file anyway.
01689         int saved_errno = errno;
01690         if (unlink(tmp) == 0 || errno != ENOENT) {
01691             string msg("Couldn't update base file ");
01692             msg += basefile;
01693             msg += ": ";
01694             msg += strerror(saved_errno);
01695             throw Xapian::DatabaseError(msg);
01696         }
01697     }
01698     base.commit();
01699 
01700     read_root();
01701 
01702     changed_n = 0;
01703     changed_c = DIR_START;
01704     seq_count = SEQ_START_POINT;
01705 }
01706 
01707 void
01708 Btree::cancel()
01709 {
01710     DEBUGCALL(DB, void, "Btree::cancel", "");
01711     Assert(writable);
01712 
01713     // This causes problems: if (!Btree_modified) return;
01714 
01715     string err_msg;
01716     if (!base.read(name, base_letter, err_msg)) {
01717         throw Xapian::DatabaseCorruptError(string("Couldn't reread base ") + char(base_letter));
01718     }
01719 
01720     revision_number =  base.get_revision();
01721     block_size =       base.get_block_size();
01722     root =             base.get_root();
01723     level =            base.get_level();
01724     //bit_map_size =     basep->get_bit_map_size();
01725     item_count =       base.get_item_count();
01726     faked_root_block = base.get_have_fakeroot();
01727     sequential =       base.get_sequential();
01728 
01729     latest_revision_number = revision_number; // FIXME: we can end up reusing a revision if we opened a btree at an older revision, start to modify it, then cancel...
01730 
01731     prev_ptr = &Btree::prev_default;
01732     next_ptr = &Btree::next_default;
01733 
01734     for (int j = 0; j <= level; j++) {
01735         C[j].n = BLK_UNUSED;
01736         C[j].rewrite = false;
01737     }
01738     read_root();
01739 
01740     changed_n = 0;
01741     changed_c = DIR_START;
01742     seq_count = SEQ_START_POINT;
01743 }
01744 
01745 /************ B-tree reading ************/
01746 
01747 bool
01748 Btree::do_open_to_read(bool revision_supplied, quartz_revision_number_t revision_)
01749 {
01750     if (!basic_open(revision_supplied, revision_)) {
01751         if (revision_supplied) {
01752             // The requested revision was not available.
01753             // This could be because the database was modified underneath us, or
01754             // because a base file is missing.  Return false, and work out what
01755             // the problem was at a higher level.
01756             return false;
01757         }
01758         throw Xapian::DatabaseOpeningError("Failed to open table for reading");
01759     }
01760 
01761     handle = sys_open_to_read(name + "DB");
01762 
01763     if (sequential) {
01764         prev_ptr = &Btree::prev_for_sequential;
01765         next_ptr = &Btree::next_for_sequential;
01766     } else {
01767         prev_ptr = &Btree::prev_default;
01768         next_ptr = &Btree::next_default;
01769     }
01770 
01771     for (int j = 0; j <= level; j++) {
01772         C[j].n = BLK_UNUSED;
01773         C[j].p = new byte[block_size];
01774         if (C[j].p == 0) {
01775             throw std::bad_alloc();
01776         }
01777     }
01778 
01779     read_root();
01780     return true;
01781 }
01782 
01783 void
01784 Btree::open()
01785 {
01786     DEBUGCALL(DB, void, "Btree::open", "");
01787     DEBUGLINE(DB, "opening at path " << name);
01788     close();
01789 
01790     if (!writable) {
01791         // Any errors are thrown if revision_supplied is false
01792         (void)do_open_to_read(false, 0);
01793         return;
01794     }
01795 
01796     // Any errors are thrown if revision_supplied is false
01797     (void)do_open_to_write(false, 0);
01798 }
01799 
01800 bool
01801 Btree::open(quartz_revision_number_t revision)
01802 {
01803     DEBUGCALL(DB, bool, "Btree::open", revision);
01804     DEBUGLINE(DB, "opening for particular revision at path " << name);
01805     close();
01806 
01807     if (!writable) {
01808         if (do_open_to_read(true, revision)) {
01809             AssertEq(revision_number, revision);
01810             RETURN(true);
01811         } else {
01812             close();
01813             RETURN(false);
01814         }
01815     }
01816 
01817     if (!do_open_to_write(true, revision)) {
01818         // Can't open at the requested revision.
01819         close();
01820         RETURN(false);
01821     }
01822 
01823     AssertEq(revision_number, revision);
01824     RETURN(true);
01825 }
01826 
01827 bool
01828 Btree::prev_for_sequential(Cursor * C_, int /*dummy*/) const
01829 {
01830     int c = C_[0].c;
01831     if (c == DIR_START) {
01832         byte * p = C_[0].p;
01833         Assert(p);
01834         uint4 n = C_[0].n;
01835         while (true) {
01836             if (n == 0) return false;
01837             n--;
01838             if (writable) {
01839                 if (n == C[0].n) {
01840                     // Block is a leaf block in the built-in cursor
01841                     // (potentially in modified form).
01842                     memcpy(p, C[0].p, block_size);
01843                 } else {
01844                     // Blocks in the built-in cursor may not have been written
01845                     // to disk yet, so we have to check that the block number
01846                     // isn't in the built-in cursor or we'll read an
01847                     // uninitialised block (for which GET_LEVEL(p) will
01848                     // probably return 0).
01849                     int j;
01850                     for (j = 1; j <= level; ++j) {
01851                         if (n == C[j].n) break;
01852                     }
01853                     if (j <= level) continue;
01854 
01855                     // Block isn't in the built-in cursor, so the form on disk
01856                     // is valid, so read it to check if it's the next level 0
01857                     // block.
01858                     read_block(n, p);
01859                 }
01860             } else {
01861                 read_block(n, p);
01862             }
01863             if (REVISION(p) > 1) {
01864                 set_overwritten();
01865                 return false;
01866             }
01867             if (GET_LEVEL(p) == 0) break;
01868         }
01869         c = DIR_END(p);
01870         C_[0].n = n;
01871     }
01872     c -= D2;
01873     C_[0].c = c;
01874     return true;
01875 }
01876 
01877 bool
01878 Btree::next_for_sequential(Cursor * C_, int /*dummy*/) const
01879 {
01880     byte * p = C_[0].p;
01881     Assert(p);
01882     int c = C_[0].c;
01883     c += D2;
01884     if (c == DIR_END(p)) {
01885         uint4 n = C_[0].n;
01886         while (true) {
01887             n++;
01888             if (n > base.get_last_block()) return false;
01889             if (writable) {
01890                 if (n == C[0].n) {
01891                     // Block is a leaf block in the built-in cursor
01892                     // (potentially in modified form).
01893                     memcpy(p, C[0].p, block_size);
01894                 } else {
01895                     // Blocks in the built-in cursor may not have been written
01896                     // to disk yet, so we have to check that the block number
01897                     // isn't in the built-in cursor or we'll read an
01898                     // uninitialised block (for which GET_LEVEL(p) will
01899                     // probably return 0).
01900                     int j;
01901                     for (j = 1; j <= level; ++j) {
01902                         if (n == C[j].n) break;
01903                     }
01904                     if (j <= level) continue;
01905 
01906                     // Block isn't in the built-in cursor, so the form on disk
01907                     // is valid, so read it to check if it's the next level 0
01908                     // block.
01909                     read_block(n, p);
01910                 }
01911             } else {
01912                 read_block(n, p);
01913             }
01914             if (REVISION(p) > 1) {
01915                 set_overwritten();
01916                 return false;
01917             }
01918             if (GET_LEVEL(p) == 0) break;
01919         }
01920         c = DIR_START;
01921         C_[0].n = n;
01922     }
01923     C_[0].c = c;
01924     return true;
01925 }
01926 
01927 bool
01928 Btree::prev_default(Cursor * C_, int j) const
01929 {
01930     byte * p = C_[j].p;
01931     int c = C_[j].c;
01932     Assert(c >= DIR_START);
01933     Assert((unsigned)c < block_size);
01934     Assert(c <= DIR_END(p));
01935     if (c == DIR_START) {
01936         if (j == level) return false;
01937         if (!prev_default(C_, j + 1)) return false;
01938         c = DIR_END(p);
01939     }
01940     c -= D2;
01941     C_[j].c = c;
01942     if (j > 0) {
01943         block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
01944     }
01945     return true;
01946 }
01947 
01948 bool
01949 Btree::next_default(Cursor * C_, int j) const
01950 {
01951     byte * p = C_[j].p;
01952     int c = C_[j].c;
01953     Assert(c >= DIR_START);
01954     c += D2;
01955     Assert((unsigned)c < block_size);
01956     // Sometimes c can be DIR_END(p) + 2 here it appears...
01957     if (c > DIR_END(p)) c = DIR_END(p);
01958     Assert(c <= DIR_END(p));
01959     if (c == DIR_END(p)) {
01960         if (j == level) return false;
01961         if (!next_default(C_, j + 1)) return false;
01962         c = DIR_START;
01963     }
01964     C_[j].c = c;
01965     if (j > 0) {
01966         block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
01967 #ifdef BTREE_DEBUG_FULL
01968         printf("Block in Btree:next_default");
01969         report_block_full(j - 1, C_[j - 1].n, C_[j - 1].p);
01970 #endif /* BTREE_DEBUG_FULL */
01971     }
01972     return true;
01973 }
01974 
01990 bool Key::operator<(Key key2) const
01991 {
01992     DEBUGCALL(DB, bool, "Key::operator<", reinterpret_cast<const void*>(key2.p));
01993     int key1_len = length();
01994     int key2_len = key2.length();
01995     if (key1_len == key2_len) {
01996         // The keys are the same length, so we can compare the counts
01997         // in the same operation since they're stored as 2 byte
01998         // bigendian numbers.
01999         RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) < 0);
02000     }
02001 
02002     int k_smaller = (key2_len < key1_len ? key2_len : key1_len);
02003 
02004     // Compare the common part of the keys
02005     int diff = memcmp(p + K1, key2.p + K1, k_smaller);
02006     if (diff != 0) RETURN(diff < 0);
02007 
02008     // We dealt with the "same length" case above so we never need to check
02009     // the count here.
02010     RETURN(key1_len < key2_len);
02011 }
02012 
02013 bool Key::operator==(Key key2) const
02014 {
02015     DEBUGCALL(DB, bool, "Key::operator==", reinterpret_cast<const void*>(key2.p));
02016     int key1_len = length();
02017     if (key1_len != key2.length()) return false;
02018     // The keys are the same length, so we can compare the counts
02019     // in the same operation since they're stored as 2 byte
02020     // bigendian numbers.
02021     RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) == 0);
02022 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.