xapian-core  2.0.0
glass_version.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2008,2009,2010,2013,2014,2015,2016,2017,2024 Olly Betts
5  * Copyright (C) 2011 Dan Colish
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "glass_version.h"
25 
26 #include "debuglog.h"
27 #include "fd.h"
28 #include "glass_defs.h"
29 #include "io_utils.h"
30 #include "omassert.h"
31 #include "pack.h"
32 #include "posixy_wrapper.h"
33 #include "stringutils.h" // For STRINGIZE() and CONST_STRLEN().
34 
35 #include <cerrno>
36 #include <cstring> // For memcmp().
37 #include <string>
38 #include <sys/types.h>
39 #include "safesysstat.h"
40 #include "safefcntl.h"
41 #include "safeunistd.h"
42 #include "str.h"
43 #include "stringutils.h"
44 
45 #include "backends/uuids.h"
46 
47 #include "xapian/constants.h"
48 #include "xapian/error.h"
49 
50 using namespace std;
51 
/// Pack a calendar date into a version number: the year (as an offset from
/// 2014) goes in bit 9 and above, the month in bits 5-8 and the day of the
/// month in bits 0-4.
#define DATE_TO_VERSION(Y,M,D) \
    (((unsigned(Y) - 2014) << 9) | (unsigned(M) << 5) | unsigned(D))

/// Recover the year from a version number built by DATE_TO_VERSION().
#define VERSION_TO_YEAR(V) (2014 + (unsigned(V) >> 9))

/// Recover the month from a version number built by DATE_TO_VERSION().
#define VERSION_TO_MONTH(V) ((unsigned(V) & 0x1e0) >> 5)

/// Recover the day of the month from a version number built by
/// DATE_TO_VERSION().
#define VERSION_TO_DAY(V) (unsigned(V) % 32)

/// Glass format version (date of change):
#define GLASS_FORMAT_VERSION DATE_TO_VERSION(2016,03,14)
// 2016,03,14 1.3.5 compress_min in version file; partly eliminate component_of
// 2015,12,24 1.3.4 2 bytes "components_of" per item eliminated, and much more
// 2014,11,21 1.3.2 Brass renamed to Glass

/// Length in bytes of the magic string, not counting the format version.
#define GLASS_VERSION_MAGIC_LEN 14

/// Length in bytes of the magic string plus the format version.
#define GLASS_VERSION_MAGIC_AND_VERSION_LEN 16
67 
69  '\x0f', '\x0d', 'X', 'a', 'p', 'i', 'a', 'n', ' ', 'G', 'l', 'a', 's', 's',
70  char((GLASS_FORMAT_VERSION >> 8) & 0xff), char(GLASS_FORMAT_VERSION & 0xff)
71 };
72 
74  : rev(0), fd(fd_), offset(0), db_dir(), changes(NULL),
75  doccount(0), total_doclen(0), last_docid(0),
76  doclen_lbound(0), doclen_ubound(0),
77  wdf_ubound(0), spelling_wordfreq_ubound(0),
78  oldest_changeset(0)
79 {
80  offset = lseek(fd, 0, SEEK_CUR);
81  if (rare(offset < 0)) {
82  string msg = "lseek failed on file descriptor ";
83  msg += str(fd);
84  throw Xapian::DatabaseOpeningError(msg, errno);
85  }
86 }
87 
89 {
90  // Either this is a single-file database, or this fd is from opening a new
91  // version file in write(), but sync() was never called.
92  if (fd != -1)
93  (void)::close(fd);
94 }
95 
96 void
98 {
99  LOGCALL_VOID(DB, "GlassVersion::read", NO_ARGS);
100  FD close_fd(-1);
101  int fd_in;
102  if (single_file()) {
103  if (rare(lseek(fd, offset, SEEK_SET) < 0)) {
104  string msg = "Failed to rewind file descriptor ";
105  msg += str(fd);
106  throw Xapian::DatabaseOpeningError(msg, errno);
107  }
108  fd_in = fd;
109  } else {
110  string filename = db_dir;
111  filename += "/iamglass";
112  fd_in = posixy_open(filename.c_str(), O_RDONLY|O_BINARY);
113  if (rare(fd_in < 0)) {
114  string msg = filename;
115  msg += ": Failed to open glass revision file for reading";
116  if (errno == ENOENT || errno == ENOTDIR) {
117  throw Xapian::DatabaseNotFoundError(msg, errno);
118  }
119  throw Xapian::DatabaseOpeningError(msg, errno);
120  }
121  close_fd = fd_in;
122  }
123 
124  char buf[256];
125 
126  const char * p = buf;
127  const char * end = p + io_read(fd_in, buf, sizeof(buf), 33);
128 
129  if (memcmp(buf, GLASS_VERSION_MAGIC, GLASS_VERSION_MAGIC_LEN) != 0)
130  throw Xapian::DatabaseCorruptError("Rev file magic incorrect");
131 
132  unsigned version;
133  version = static_cast<unsigned char>(buf[GLASS_VERSION_MAGIC_LEN]);
134  version <<= 8;
135  version |= static_cast<unsigned char>(buf[GLASS_VERSION_MAGIC_LEN + 1]);
136  if (version != GLASS_FORMAT_VERSION) {
137  string msg;
138  if (!single_file()) {
139  msg = db_dir;
140  msg += ": ";
141  }
142  msg += "Database is format version ";
143  msg += str(VERSION_TO_YEAR(version) * 10000 +
144  VERSION_TO_MONTH(version) * 100 +
145  VERSION_TO_DAY(version));
146  msg += " but I only understand ";
147  msg += str(VERSION_TO_YEAR(GLASS_FORMAT_VERSION) * 10000 +
150  throw Xapian::DatabaseVersionError(msg);
151  }
152 
154  uuid.assign(p);
155  p += uuid.BINARY_SIZE;
156 
157  if (!unpack_uint(&p, end, &rev))
158  throw Xapian::DatabaseCorruptError("Rev file failed to decode revision");
159 
160  for (unsigned table_no = 0; table_no < Glass::MAX_; ++table_no) {
161  if (!root[table_no].unserialise(&p, end)) {
162  throw Xapian::DatabaseCorruptError("Rev file root_info missing");
163  }
164  old_root[table_no] = root[table_no];
165  }
166 
167  // For a single-file database, this will assign extra data. We read
168  // sizeof(buf) above, then skip GLASS_VERSION_MAGIC_AND_VERSION_LEN,
169  // then 16, then the size of the serialised root info.
170  serialised_stats.assign(p, end);
172 }
173 
174 void
176 {
177  serialised_stats.resize(0);
179  // last_docid must always be >= doccount.
183  // doclen_ubound should always be >= wdf_ubound, so we store the
184  // difference as it may encode smaller. wdf_ubound is likely to
185  // be larger than doclen_lbound.
190 }
191 
192 void
194 {
195  const char * p = serialised_stats.data();
196  const char * end = p + serialised_stats.size();
197  if (p == end) {
198  doccount = 0;
199  total_doclen = 0;
200  last_docid = 0;
201  doclen_lbound = 0;
202  doclen_ubound = 0;
203  wdf_ubound = 0;
204  oldest_changeset = 0;
206  return;
207  }
208 
209  if (!unpack_uint(&p, end, &doccount) ||
210  !unpack_uint(&p, end, &last_docid) ||
211  !unpack_uint(&p, end, &doclen_lbound) ||
212  !unpack_uint(&p, end, &wdf_ubound) ||
213  !unpack_uint(&p, end, &doclen_ubound) ||
214  !unpack_uint(&p, end, &oldest_changeset) ||
215  !unpack_uint(&p, end, &total_doclen) ||
217  const char * m = p ?
218  "Bad serialised DB stats (overflowed)" :
219  "Bad serialised DB stats (out of data)";
221  }
222 
223  // In the single-file DB case, there will be extra data in
224  // serialised_stats, so suppress this check.
225  if (p != end && !single_file())
226  throw Xapian::DatabaseCorruptError("Rev file has junk at end");
227 
228  // last_docid must always be >= doccount.
229  last_docid += doccount;
230  // doclen_ubound should always be >= wdf_ubound, so we store the
231  // difference as it may encode smaller. wdf_ubound is likely to
232  // be larger than doclen_lbound.
234 }
235 
236 void
238 {
239  doccount += o.get_doccount();
240  if (doccount < o.get_doccount()) {
241  throw Xapian::DatabaseError("doccount overflowed!");
242  }
243 
244  Xapian::termcount o_doclen_lbound = o.get_doclength_lower_bound();
245  if (o_doclen_lbound > 0) {
246  if (doclen_lbound == 0 || o_doclen_lbound < doclen_lbound)
247  doclen_lbound = o_doclen_lbound;
248  }
249 
253  if (total_doclen < o.get_total_doclen()) {
254  throw Xapian::DatabaseError("Total document length overflowed!");
255  }
256 
257  // The upper bounds might be on the same word, so we must sum them.
259 }
260 
261 void
263 {
264  LOGCALL_VOID(DB, "GlassVersion::cancel", NO_ARGS);
265  for (unsigned table_no = 0; table_no < Glass::MAX_; ++table_no) {
266  root[table_no] = old_root[table_no];
267  }
269 }
270 
271 const string
273 {
274  LOGCALL(DB, const string, "GlassVersion::write", new_rev|flags);
275 
277  s.append(uuid.data(), uuid.BINARY_SIZE);
278 
279  pack_uint(s, new_rev);
280 
281  for (unsigned table_no = 0; table_no < Glass::MAX_; ++table_no) {
282  root[table_no].serialise(s);
283  }
284 
285  // Serialise database statistics.
286  serialise_stats();
287  s += serialised_stats;
288 
289  string tmpfile;
290  if (!single_file()) {
291  tmpfile = db_dir;
292  // In dangerous mode, just write the new version file in place.
293  if (flags & Xapian::DB_DANGEROUS)
294  tmpfile += "/iamglass";
295  else
296  tmpfile += "/v.tmp";
297 
298 #ifdef __EMSCRIPTEN__
299  // Emscripten < 1.39.10 fails to create a file if O_TRUNC is specified
300  // and the filename is the previous name of a renamed file (which it
301  // will be the second time we write out the version file for a DB):
302  //
303  // https://github.com/emscripten-core/emscripten/issues/8187
304  //
305  // We avoid triggering this bug by not using O_TRUNC and instead
306  // truncating once the file is opened.
307  fd = posixy_open(tmpfile.c_str(),
308  O_CREAT|O_WRONLY|O_BINARY,
309  0666);
310  if (fd >= 0)
311  ftruncate(fd, 0);
312 #else
313  fd = posixy_open(tmpfile.c_str(),
314  O_CREAT|O_TRUNC|O_WRONLY|O_BINARY,
315  0666);
316 #endif
317 
318  if (rare(fd < 0))
319  throw Xapian::DatabaseOpeningError("Couldn't write new rev file: " + tmpfile,
320  errno);
321 
322  if (flags & Xapian::DB_DANGEROUS)
323  tmpfile = string();
324  }
325 
326  try {
327  io_write(fd, s.data(), s.size());
328  } catch (...) {
329  if (!single_file())
330  (void)close(fd);
331  throw;
332  }
333 
334  if (changes) {
335  string changes_buf;
336  changes_buf += '\xfe';
337  pack_uint(changes_buf, new_rev);
338  pack_uint(changes_buf, s.size());
339  changes->write_block(changes_buf);
340  changes->write_block(s);
341  }
342 
343  RETURN(tmpfile);
344 }
345 
346 bool
347 GlassVersion::sync(const string & tmpfile,
348  glass_revision_number_t new_rev, int flags)
349 {
350  Assert(new_rev > rev || rev == 0);
351 
352  if (single_file()) {
353  if ((flags & Xapian::DB_NO_SYNC) == 0 &&
354  ((flags & Xapian::DB_FULL_SYNC) ?
355  !io_full_sync(fd) :
356  !io_sync(fd))) {
357  // FIXME what to do?
358  }
359  } else {
360  int fd_to_close = fd;
361  fd = -1;
362  if ((flags & Xapian::DB_NO_SYNC) == 0 &&
363  ((flags & Xapian::DB_FULL_SYNC) ?
364  !io_full_sync(fd_to_close) :
365  !io_sync(fd_to_close))) {
366  int save_errno = errno;
367  (void)close(fd_to_close);
368  if (!tmpfile.empty())
369  (void)unlink(tmpfile.c_str());
370  errno = save_errno;
371  return false;
372  }
373 
374  if (close(fd_to_close) != 0) {
375  if (!tmpfile.empty()) {
376  int save_errno = errno;
377  (void)unlink(tmpfile.c_str());
378  errno = save_errno;
379  }
380  return false;
381  }
382 
383  if (!tmpfile.empty()) {
384  if (!io_tmp_rename(tmpfile, db_dir + "/iamglass")) {
385  return false;
386  }
387  }
388  }
389 
390  for (unsigned table_no = 0; table_no < Glass::MAX_; ++table_no) {
391  old_root[table_no] = root[table_no];
392  }
393 
394  rev = new_rev;
395  return true;
396 }
397 
398 /* Only try to compress tags strictly longer than this many bytes.
399  *
400  * This can theoretically usefully be set as low as 4, but in practical terms
401  * zlib can't compress in very many cases for short inputs and even when it can
402  * the savings are small, so we default to a higher threshold to save CPU time
403  * for marginal size reductions.
404  */
405 const size_t COMPRESS_MIN = 18;
406 
407 static const uint4 compress_min_tab[] = {
408  0, // POSTLIST
409  COMPRESS_MIN, // DOCDATA
410  COMPRESS_MIN, // TERMLIST
411  0, // POSITION
412  COMPRESS_MIN, // SPELLING
413  COMPRESS_MIN // SYNONYM
414 };
415 
416 void
417 GlassVersion::create(unsigned blocksize)
418 {
419  AssertRel(blocksize,>=,GLASS_MIN_BLOCKSIZE);
420  uuid.generate();
421  for (unsigned table_no = 0; table_no < Glass::MAX_; ++table_no) {
422  root[table_no].init(blocksize, compress_min_tab[table_no]);
423  }
424 }
425 
426 namespace Glass {
427 
428 void
429 RootInfo::init(unsigned blocksize_, uint4 compress_min_)
430 {
431  AssertRel(blocksize_,>=,GLASS_MIN_BLOCKSIZE);
432  root = 0;
433  level = 0;
434  num_entries = 0;
435  root_is_fake = true;
436  sequential = true;
437  blocksize = blocksize_;
438  compress_min = compress_min_;
439  fl_serialised.resize(0);
440 }
441 
442 void
443 RootInfo::serialise(string &s) const
444 {
445  pack_uint(s, root);
446  unsigned val = level << 2;
447  if (sequential) val |= 0x02;
448  if (root_is_fake) val |= 0x01;
449  pack_uint(s, val);
451  pack_uint(s, blocksize >> 11);
454 }
455 
456 bool
457 RootInfo::unserialise(const char ** p, const char * end)
458 {
459  unsigned val, b;
460  if (!unpack_uint(p, end, &root) ||
461  !unpack_uint(p, end, &val) ||
462  !unpack_uint(p, end, &num_entries) ||
463  !unpack_uint(p, end, &b) ||
464  !unpack_uint(p, end, &compress_min) ||
465  !unpack_string(p, end, fl_serialised)) return false;
466  auto level_ = val >> 2;
467  if (rare(level_ >= GLASS_BTREE_CURSOR_LEVELS))
468  throw Xapian::DatabaseCorruptError("Impossibly deep Btree");
469  level = level_;
470  sequential = val & 0x02;
471  root_is_fake = val & 0x01;
472 
473  if (root_is_fake && level > 0) {
474  throw Xapian::DatabaseCorruptError("Fake root but level > 0");
475  }
476 
477  b <<= 11;
478  if (rare(b < GLASS_MIN_BLOCKSIZE ||
479  b > GLASS_MAX_BLOCKSIZE ||
480  (b & (b - 1)) != 0)) {
481  throw Xapian::DatabaseCorruptError("Invalid block size");
482  }
483  blocksize = b;
484 
485  // Map old default to new default.
486  if (compress_min == 4) {
488  }
489 
490  return true;
491 }
492 
493 }
Definition: fd.h:30
void write_block(const char *p, size_t len)
The GlassVersion class manages the revision files.
Definition: glass_version.h:96
glass_revision_number_t oldest_changeset
Oldest changeset removed when max_changesets is set.
Xapian::termcount spelling_wordfreq_ubound
An upper bound on the spelling wordfreq in this database.
RootInfo old_root[Glass::MAX_]
Xapian::termcount doclen_ubound
An upper bound on the greatest document length in this database.
off_t offset
Offset into the file at which the version data starts.
const std::string write(glass_revision_number_t new_rev, int flags)
std::string db_dir
The database directory.
void serialise_stats()
Xapian::termcount get_doclength_lower_bound() const
bool single_file() const
Xapian::docid last_docid
Greatest document id ever used in this database.
Xapian::termcount get_spelling_wordfreq_upper_bound() const
std::string serialised_stats
The serialised database stats.
Uuid uuid
The UUID of this database.
glass_revision_number_t rev
Definition: glass_version.h:97
Xapian::totallength total_doclen
The total of the lengths of all documents in the database.
bool sync(const std::string &tmpfile, glass_revision_number_t new_rev, int flags)
Xapian::doccount get_doccount() const
void merge_stats(const GlassVersion &o)
Merge the database stats.
Xapian::termcount get_wdf_upper_bound() const
GlassVersion(std::string_view db_dir_)
Xapian::termcount doclen_lbound
A lower bound on the smallest document length in this database.
GlassChanges * changes
Xapian::termcount wdf_ubound
An upper bound on the greatest wdf in this database.
Xapian::doccount doccount
The number of documents in the database.
int fd
File descriptor.
void create(unsigned blocksize)
Create the version file.
RootInfo root[Glass::MAX_]
Definition: glass_version.h:99
void unserialise_stats()
void read()
Read the version file and check it's a version we understand.
Xapian::totallength get_total_doclen() const
Xapian::termcount get_doclength_upper_bound() const
void init(unsigned blocksize_, uint4 compress_min_)
void serialise(std::string &s) const
bool unserialise(const char **p, const char *end)
std::string fl_serialised
Definition: glass_version.h:50
unsigned blocksize
Definition: glass_version.h:47
glass_tablesize_t num_entries
Definition: glass_version.h:44
glass_block_t root
Definition: glass_version.h:42
uint4 compress_min
Should be >= 4 or 0 for no compression.
Definition: glass_version.h:49
void generate()
Definition: uuids.cc:63
static constexpr unsigned BINARY_SIZE
The size of a UUID in bytes.
Definition: uuids.h:31
void assign(const char *p)
Definition: uuids.h:64
const char * data() const
Definition: uuids.h:60
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
DatabaseError indicates some sort of database related error.
Definition: error.h:355
Indicates an attempt to access a database not present.
Definition: error.h:1043
DatabaseOpeningError indicates failure to open a database.
Definition: error.h:569
DatabaseVersionError indicates that a database is in an unsupported format.
Definition: error.h:620
#define rare(COND)
Definition: config.h:607
Constants in the Xapian namespace.
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Hierarchy of classes which Xapian can throw as exceptions.
Wrapper class around a file descriptor to avoid leaks.
int close(FD &fd)
Definition: fd.h:63
Definitions, types, etc for use inside glass.
#define GLASS_BTREE_CURSOR_LEVELS
Allow for this many levels in the B-tree.
Definition: glass_defs.h:43
#define GLASS_MIN_BLOCKSIZE
Minimum B-tree block size.
Definition: glass_defs.h:33
uint4 glass_revision_number_t
The revision number of a glass database.
Definition: glass_defs.h:68
#define GLASS_MAX_BLOCKSIZE
Maximum B-tree block size.
Definition: glass_defs.h:36
static const char GLASS_VERSION_MAGIC[GLASS_VERSION_MAGIC_AND_VERSION_LEN]
const size_t COMPRESS_MIN
#define GLASS_VERSION_MAGIC_AND_VERSION_LEN
#define VERSION_TO_MONTH(V)
#define VERSION_TO_DAY(V)
#define VERSION_TO_YEAR(V)
#define GLASS_FORMAT_VERSION
Glass format version (date of change):
#define GLASS_VERSION_MAGIC_LEN
static const uint4 compress_min_tab[]
GlassVersion class.
uint32_t uint4
Definition: internaltypes.h:31
void io_write(int fd, const char *p, size_t n)
Write n bytes from block pointed to by p to file descriptor fd.
Definition: io_utils.cc:263
size_t io_read(int fd, char *p, size_t n, size_t min)
Read n bytes (or until EOF) into block pointed to by p from file descriptor fd.
Definition: io_utils.cc:241
bool io_tmp_rename(const std::string &tmp_file, const std::string &real_file)
Rename a temporary file to its final position.
Definition: io_utils.cc:573
Wrappers for low-level POSIX I/O routines.
bool io_sync(int fd)
Ensure all data previously written to file descriptor fd has been written to disk.
Definition: io_utils.h:107
bool io_full_sync(int fd)
Definition: io_utils.h:122
@ MAX_
Definition: glass_defs.h:60
string str(int value)
Convert int to std::string.
Definition: str.cc:91
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
XAPIAN_REVISION_TYPE rev
Revision number of a database.
Definition: types.h:108
const int DB_NO_SYNC
Don't attempt to ensure changes have hit disk.
Definition: constants.h:65
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
const int DB_FULL_SYNC
Try to ensure changes are really written to disk.
Definition: constants.h:82
const int DB_DANGEROUS
Update the database in-place.
Definition: constants.h:102
Various assertion macros.
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Pack types into strings and unpack them again.
bool unpack_string(const char **p, const char *end, std::string &result)
Decode a std::string from a string.
Definition: pack.h:468
bool unpack_uint(const char **p, const char *end, U *result)
Decode an unsigned integer from a string.
Definition: pack.h:346
void pack_uint(std::string &s, U value)
Append an encoded unsigned integer to a string.
Definition: pack.h:315
void pack_string(std::string &s, std::string_view value)
Append an encoded std::string to a string.
Definition: pack.h:442
Provides wrappers with POSIXy semantics.
#define posixy_open
include <fcntl.h>, but working around broken platforms.
#define O_BINARY
Definition: safefcntl.h:80
include <sys/stat.h> with portability enhancements
<unistd.h>, but with compat.
Convert types to std::string.
Various handy string-related helpers.
Class for handling UUIDs.