xapian-core  2.0.0
compactor.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2003-2024 Olly Betts
5  * Copyright (C) 2008 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include <xapian/compactor.h>
25 
26 #include <algorithm>
27 #include <fstream>
28 #include <string_view>
29 #include <vector>
30 
31 #include <cerrno>
32 #include <cstring>
33 #include <ctime>
34 #include "safesysstat.h"
35 #include <sys/types.h>
36 
37 #include "safeunistd.h"
38 #include "safefcntl.h"
39 
40 #include "backends/backends.h"
42 #include "backends/postlist.h"
43 #include "debuglog.h"
44 #include "omassert.h"
45 #include "filetests.h"
46 #include "fileutils.h"
47 #include "io_utils.h"
48 #include "stringutils.h"
49 #include "str.h"
50 
51 #ifdef XAPIAN_HAS_GLASS_BACKEND
54 #endif
55 
56 #ifdef XAPIAN_HAS_HONEY_BACKEND
59 #endif
60 
62 
63 #include <xapian/constants.h>
64 #include <xapian/database.h>
65 #include <xapian/error.h>
66 
67 using namespace std;
68 
70  const vector<pair<Xapian::docid, Xapian::docid>>& used_ranges;
71 
72  public:
73  explicit
74  CmpByFirstUsed(const vector<pair<Xapian::docid, Xapian::docid>>& ur)
75  : used_ranges(ur) { }
76 
78  return used_ranges[a].first < used_ranges[b].first;
79  }
80 };
81 
82 namespace Xapian {
83 
84 Compactor::~Compactor() { }
85 
86 void
87 Compactor::set_status(const string & table, const string & status)
88 {
89  (void)table;
90  (void)status;
91 }
92 
93 string
94 Compactor::resolve_duplicate_metadata(const string & key,
95  size_t num_tags, const std::string tags[])
96 {
97  (void)key;
98  (void)num_tags;
99  return tags[0];
100 }
101 
102 }
103 
104 [[noreturn]]
105 static void
107  const string &dbpath2, int backend2)
108 {
109  string dbpath1;
110  db->get_backend_info(&dbpath1);
111  string msg = "All databases must be the same type ('";
112  msg += dbpath1;
113  msg += "' is ";
114  msg += backend_name(backend1);
115  msg += ", but '";
116  msg += dbpath2;
117  msg += "' is ";
118  msg += backend_name(backend2);
119  msg += ')';
120  throw Xapian::InvalidArgumentError(msg);
121 }
122 
123 namespace Xapian {
124 
/** Compact this database (merging its shards) to a path or file descriptor.
 *
 *  Outline of the visible code:
 *   1. Work out the destination: a path from @a output_ptr (which may name
 *      an existing stub file or a stub directory containing "XAPIANDB"), or
 *      else @a fd, which implies DBCOMPACT_SINGLE_FILE.
 *   2. Collect the shard internals, check every shard is glass or honey and
 *      that they all use the same backend, and compute per-shard docid
 *      offsets and used docid ranges.
 *   3. Without renumbering and with more than one shard, sort the shards by
 *      first used docid and check their used ranges do not overlap.
 *   4. Create the destination directory (a fresh timestamp-suffixed one when
 *      compacting to a stub).
 *   5. Dispatch to GlassDatabase::compact() or HoneyDatabase::compact().
 *   6. When compacting to a stub, write a new stub file and rename it into
 *      place.
 *
 *  @param output_ptr  Destination path, or NULL to write to @a fd.
 *  @param fd          File descriptor used when @a output_ptr is NULL.
 *  @param flags       DBCOMPACT_* flags plus an optional output backend
 *                     selected via DB_BACKEND_MASK_.
 *  @param block_size  Passed to GlassDatabase::compact(); unused for honey.
 *  @param compactor   Passed through to the backend compact() call.
 *
 *  @throws InvalidArgumentError     Destination equals a source database
 *                                   (and isn't a stub), or the backends of
 *                                   the sources differ.
 *  @throws DatabaseError            Unsupported source backend, mkdir
 *                                   failure, or stub rename failure.
 *  @throws FeatureUnavailableError  Required backend disabled at build time.
 *  @throws UnimplementedError       Unsupported source/output backend
 *                                   combination.
 *
 *  NOTE(review): this listing has gaps in its line numbering inside this
 *  function (see the notes below), so some statements are not visible here.
 */
125 void
126 Database::compact_(const string_view* output_ptr, int fd, unsigned flags,
127  int block_size,
128  Xapian::Compactor * compactor) const
129 {
130  LOGCALL_VOID(API, "Database::compact_", output_ptr | fd | flags | block_size | compactor);
131 
132  bool renumber = !(flags & DBCOMPACT_NO_RENUMBER);
133 
// STUB_NO is the zero enumerator, so compact_to_stub can be tested as a
// boolean below.
134  enum { STUB_NO, STUB_FILE, STUB_DIR } compact_to_stub = STUB_NO;
135  string destdir;
136  if (output_ptr) {
137  // We need a modifiable destdir in this function.
138  destdir = *output_ptr;
139  if (!(flags & DBCOMPACT_SINGLE_FILE)) {
140  if (file_exists(destdir)) {
141  // Stub file.
142  compact_to_stub = STUB_FILE;
143  } else if (file_exists(destdir + "/XAPIANDB")) {
144  // Stub directory.
145  compact_to_stub = STUB_DIR;
146  }
147  }
148  } else {
149  // Single file is implied when writing to a file descriptor.
150  flags |= DBCOMPACT_SINGLE_FILE;
151  }
152 
153  auto n_shards = internal->size();
// tot_off is the running docid offset applied to each successive shard when
// renumbering; last_docid ends up as the last docid passed to the backend.
154  Xapian::docid tot_off = 0;
155  Xapian::docid last_docid = 0;
156 
// Parallel vectors with one entry per shard.
157  vector<Xapian::docid> offset;
158  vector<pair<Xapian::docid, Xapian::docid>> used_ranges;
159  vector<const Xapian::Database::Internal*> internals;
160  offset.reserve(n_shards);
161  used_ranges.reserve(n_shards);
162  internals.reserve(n_shards);
163 
164  if (n_shards > 1) {
// With more than one shard the internal object is treated as a
// MultiDatabase whose shards are compacted individually.
165  auto multi_db = static_cast<MultiDatabase*>(internal.get());
166  for (auto&& db : multi_db->shards) {
167  internals.push_back(db);
168  }
169  } else {
170  internals.push_back(internal.get());
171  }
172 
173  int backend = BACKEND_UNKNOWN;
174  for (auto&& shard : internals) {
175  string srcdir;
176  int type = shard->get_backend_info(&srcdir);
177  // Check destdir isn't the same as any source directory, unless it
178  // is a stub database or we're compacting to an fd.
179  if (!compact_to_stub && !destdir.empty() && srcdir == destdir) {
180  throw InvalidArgumentError("destination may not be the same as "
181  "any source database, unless it is a "
182  "stub database");
183  }
// All shards must share one backend; the first shard seen fixes it.
184  switch (type) {
185  case BACKEND_GLASS:
186  if (backend != type && backend != BACKEND_UNKNOWN) {
187  backend_mismatch(internals[0], backend, srcdir, type);
188  }
189  backend = type;
190  break;
191  case BACKEND_HONEY:
192  if (backend != type && backend != BACKEND_UNKNOWN) {
193  backend_mismatch(internals[0], backend, srcdir, type);
194  }
195  backend = type;
196  break;
197  default:
198  throw DatabaseError("Only glass and honey databases can be "
199  "compacted");
200  }
201 
// (0, 0) is recorded as the used range of a shard with no documents.
202  Xapian::docid first = 0, last = 0;
203 
204  // "Empty" databases might have spelling or synonym data so can't
205  // just be completely ignored.
206  Xapian::doccount num_docs = shard->get_doccount();
207  if (num_docs != 0) {
208  shard->get_used_docid_range(first, last);
209 
210  if (renumber && first) {
211  // Prune any unused docids off the start of this source
212  // database.
213  //
214  // tot_off could wrap here, but it's unsigned, so that's
215  // OK.
216  UNSIGNED_OVERFLOW_OK(tot_off -= (first - 1));
217  }
218 
// Assertion-only cross-check that the reported used range matches the
// first and last entries of the shard's postlist of all documents.
219 #ifdef XAPIAN_ASSERTIONS
220  PostList* pl = shard->open_post_list({});
221  // We don't do this for an empty shard.
222  Assert(pl);
223  pl->next();
224  // This test should never fail, since shard->get_doccount() is
225  // non-zero!
226  Assert(!pl->at_end());
227  AssertEq(pl->get_docid(), first);
228  AssertRel(last,>=,first);
229  pl->skip_to(last);
230  Assert(!pl->at_end());
231  AssertEq(pl->get_docid(), last);
232  pl->next();
233  Assert(pl->at_end());
234  delete pl;
235 #endif
236  }
237 
238  offset.push_back(tot_off);
// When renumbering, the next shard starts after this shard's last used
// docid; otherwise just track the highest last docid over all shards.
239  if (renumber)
240  UNSIGNED_OVERFLOW_OK(tot_off += last);
241  else if (last_docid < shard->get_lastdocid())
242  last_docid = shard->get_lastdocid();
243  used_ranges.push_back(make_pair(first, last));
244  }
245 
246  if (renumber)
247  last_docid = tot_off;
248 
249  if (!renumber && n_shards > 1) {
250  // We want to process the sources in ascending order of first
251  // docid. So we create a vector "order" with ascending integers
252  // and then sort so the indirected order is right.
253  vector<Xapian::doccount> order;
254  order.reserve(n_shards);
255  for (Xapian::doccount i = 0; i < n_shards; ++i)
256  order.push_back(i);
257 
258  sort(order.begin(), order.end(), CmpByFirstUsed(used_ranges));
259 
260  // Now use order to reorder internals to be in ascending order by first
261  // docid, and while we're at it check the ranges are disjoint.
262  vector<const Xapian::Database::Internal*> internals_;
263  internals_.reserve(n_shards);
264  vector<pair<Xapian::docid, Xapian::docid>> used_ranges_;
265  used_ranges_.reserve(n_shards);
266 
267  Xapian::docid last_start = 0, last_end = 0;
268  for (Xapian::doccount j = 0; j != order.size(); ++j) {
269  Xapian::doccount n = order[j];
270 
271  internals_.push_back(internals[n]);
272  used_ranges_.push_back(used_ranges[n]);
273 
274  const pair<Xapian::docid, Xapian::docid> p = used_ranges[n];
275  // Skip empty databases.
276  if (p.first == 0 && p.second == 0)
277  continue;
278  // Check for overlap with the previous database's range.
279  if (p.first <= last_end) {
280  string tmp;
281  string msg = "when merging databases, --no-renumber is only currently supported if the databases have disjoint ranges of used document ids: ";
282  internals_[j - 1]->get_backend_info(&tmp);
283  msg += tmp;
284  msg += " has range ";
285  msg += str(last_start);
286  msg += '-';
287  msg += str(last_end);
288  msg += ", ";
289  internals_[j]->get_backend_info(&tmp);
290  msg += tmp;
291  msg += " has range ";
292  msg += str(p.first);
293  msg += '-';
294  msg += str(p.second);
// NOTE(review): listing line 295 is missing here; presumably it throws the
// assembled msg (InvalidOperationError?) - confirm against the real source.
296  }
297  last_start = p.first;
298  last_end = p.second;
299  }
300 
301  swap(internals, internals_);
302  swap(used_ranges, used_ranges_);
303  }
304 
305  string stub_file;
306  if (compact_to_stub) {
// Compacting to a stub: create a new uniquely named directory beside (stub
// file) or inside (stub directory) the stub, using the current time as the
// initial suffix.
307  stub_file = destdir;
308  if (compact_to_stub == STUB_DIR) {
309  stub_file += "/XAPIANDB";
310  destdir += '/';
311  } else {
312  destdir += '_';
313  }
314  size_t sfx = destdir.size();
315  time_t now = time(NULL);
// Retry with an incremented suffix while the candidate name already exists.
316  while (true) {
317  destdir.resize(sfx);
318  destdir += str(now++);
319  if (mkdir(destdir.c_str(), 0755) == 0)
320  break;
321  if (errno != EEXIST) {
322  string msg = destdir;
323  msg += ": mkdir failed";
324  throw Xapian::DatabaseError(msg, errno);
325  }
326  }
327  } else if (!(flags & Xapian::DBCOMPACT_SINGLE_FILE)) {
328  // If the destination database directory doesn't exist, create it.
329  if (mkdir(destdir.c_str(), 0755) < 0) {
330  // Check why mkdir failed. It's ok if the directory already
331  // exists, but we also get EEXIST if there's an existing file with
332  // that name.
333  int mkdir_errno = errno;
334  if (mkdir_errno != EEXIST || !dir_exists(destdir)) {
335  string msg = destdir;
336  msg += ": cannot create directory";
337  throw Xapian::DatabaseError(msg, mkdir_errno);
338  }
339  }
340  }
341 
342 #if defined XAPIAN_HAS_GLASS_BACKEND || defined XAPIAN_HAS_HONEY_BACKEND
// NOTE(review): listing lines 343-344 are missing here; `compaction`, used
// in the compact() calls below, is presumably declared on them - confirm.
345 #else
346  (void)compactor;
347  (void)block_size;
348 #endif
349 
// A zero output backend appears to mean "same backend as the source", since
// `case 0` is handled together with the source backend's own output case.
350  auto output_backend = flags & Xapian::DB_BACKEND_MASK_;
351  if (backend == BACKEND_GLASS) {
352  switch (output_backend) {
353  case 0:
// NOTE(review): listing line 354 is missing; presumably a second case label
// selecting glass output explicitly - confirm.
355 #ifdef XAPIAN_HAS_GLASS_BACKEND
356  if (output_ptr) {
357  GlassDatabase::compact(compactor, destdir.c_str(), 0,
358  internals, offset,
359  block_size, compaction, flags,
360  last_docid);
361  } else {
362  GlassDatabase::compact(compactor, NULL, fd,
363  internals, offset,
364  block_size, compaction, flags,
365  last_docid);
366  }
367  break;
368 #else
369  (void)fd;
370  (void)last_docid;
371  throw Xapian::FeatureUnavailableError("Glass backend disabled "
372  "at build time");
373 #endif
// NOTE(review): listing line 374 is missing; presumably the case label for
// honey output from a glass source - confirm.
375  // Honey isn't block based.
376  (void)block_size;
377 #ifdef XAPIAN_HAS_HONEY_BACKEND
378  if (output_ptr) {
379  HoneyDatabase::compact(compactor, destdir.c_str(), 0,
// NOTE(review): listing line 380 is missing; presumably the source backend
// argument of HoneyDatabase::compact() - confirm.
381  internals, offset,
382  compaction, flags,
383  last_docid);
384  } else {
385  HoneyDatabase::compact(compactor, NULL, fd,
// NOTE(review): listing line 386 is missing; presumably the source backend
// argument - confirm.
387  internals, offset,
388  compaction, flags,
389  last_docid);
390  }
391  break;
392 #else
393  (void)fd;
394  (void)last_docid;
395  throw Xapian::FeatureUnavailableError("Honey backend disabled "
396  "at build time");
397 #endif
398  default:
399  throw Xapian::UnimplementedError("Glass can only be "
400  "compacted to itself or "
401  "honey");
402  }
403  } else if (backend == BACKEND_HONEY) {
404  switch (output_backend) {
405  case 0:
// NOTE(review): listing line 406 is missing; presumably a second case label
// selecting honey output explicitly - confirm.
407 #ifdef XAPIAN_HAS_HONEY_BACKEND
408  // Honey isn't block based.
409  (void)block_size;
410  if (output_ptr) {
411  HoneyDatabase::compact(compactor, destdir.c_str(), 0,
// NOTE(review): listing line 412 is missing; presumably the source backend
// argument - confirm.
413  internals, offset,
414  compaction, flags,
415  last_docid);
416  } else {
417  HoneyDatabase::compact(compactor, NULL, fd,
// NOTE(review): listing line 418 is missing; presumably the source backend
// argument - confirm.
419  internals, offset,
420  compaction, flags,
421  last_docid);
422  }
423  break;
424 #else
425  (void)fd;
426  (void)last_docid;
427  throw Xapian::FeatureUnavailableError("Honey backend disabled "
428  "at build time");
429 #endif
430  default:
431  throw Xapian::UnimplementedError("Honey can only be "
432  "compacted to itself");
433  }
434  }
435 
436  if (compact_to_stub) {
// Write a replacement stub naming the new directory by its final path
// component, then rename it over the stub file.
437  string new_stub_file = destdir;
438  new_stub_file += "/new_stub.tmp";
439  {
// The inner scope destroys the ofstream before the rename below.
440  ofstream new_stub(new_stub_file.c_str());
441  size_t slash = destdir.find_last_of(DIR_SEPS);
442  new_stub << "auto " << destdir.substr(slash + 1) << '\n';
443  }
444  if (!io_tmp_rename(new_stub_file, stub_file)) {
445  string msg = "Cannot rename '";
446  msg += new_stub_file;
447  msg += "' to '";
448  msg += stub_file;
449  msg += '\'';
450  throw Xapian::DatabaseError(msg, errno);
451  }
452  }
453 }
454 
455 }
BACKEND_* constants.
const char * backend_name(int code)
Definition: backends.h:34
@ BACKEND_GLASS
Definition: backends.h:29
@ BACKEND_UNKNOWN
Definition: backends.h:26
@ BACKEND_HONEY
Definition: backends.h:30
const vector< pair< Xapian::docid, Xapian::docid > > & used_ranges
Definition: compactor.cc:70
bool operator()(Xapian::doccount a, Xapian::doccount b) const
Definition: compactor.cc:77
CmpByFirstUsed(const vector< pair< Xapian::docid, Xapian::docid >> &ur)
Definition: compactor.cc:74
static void compact(Xapian::Compactor *compactor, const char *destdir, int fd, const std::vector< const Xapian::Database::Internal * > &sources, const std::vector< Xapian::docid > &offset, unsigned block_size, Xapian::Compactor::compaction_level compaction, unsigned flags, Xapian::docid last_docid)
static void compact(Xapian::Compactor *compactor, const char *destdir, int fd, int source_backend, const std::vector< const Xapian::Database::Internal * > &sources, const std::vector< Xapian::docid > &offset, Xapian::Compactor::compaction_level compaction, unsigned flags, Xapian::docid last_docid)
Sharded database backend.
void push_back(Xapian::Database::Internal *shard)
Compact a database, or merge and compact several.
Definition: compactor.h:39
compaction_level
Compaction level.
Definition: compactor.h:42
@ FULL
Split items whenever it saves space (the default).
Definition: compactor.h:46
@ FULLER
Allow oversize items to save more space (not recommended if you ever plan to update the compacted database).
Definition: compactor.h:52
@ STANDARD
Don't split items unnecessarily.
Definition: compactor.h:44
DatabaseError indicates some sort of database related error.
Definition: error.h:355
Virtual base class for Database internals.
virtual int get_backend_info(std::string *path) const =0
Get backend information about this database.
Indicates an attempt to use a feature which is unavailable.
Definition: error.h:707
Abstract base class for postlists.
Definition: postlist.h:40
virtual PostList * skip_to(Xapian::docid did, double w_min)=0
Skip forward to the specified docid.
virtual PostList * next(double w_min)=0
Advance the current position to the next document in the postlist.
virtual Xapian::docid get_docid() const =0
Return the current docid.
virtual bool at_end() const =0
Return true if the current position is past the last entry in this list.
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
static void backend_mismatch(const Xapian::Database::Internal *db, int backend1, const string &dbpath2, int backend2)
Definition: compactor.cc:106
Compact a database, or merge and compact several.
#define UNSIGNED_OVERFLOW_OK(X)
Definition: config.h:626
#define DIR_SEPS
Definition: config.h:8
Constants in the Xapian namespace.
An indexed database of documents.
PositionList * p
Virtual base class for Database internals.
Debug logging macros.
#define LOGCALL_VOID(CATEGORY, FUNC, PARAMS)
Definition: debuglog.h:479
Hierarchy of classes which Xapian can throw as exceptions.
Utility functions for testing files.
bool dir_exists(const char *path)
Test if a directory exists.
Definition: filetests.h:145
bool file_exists(const char *path)
Test if a file exists.
Definition: filetests.h:40
File and path manipulation routines.
C++ class definition for glass database.
GlassVersion class.
Database using honey backend.
HoneyVersion class.
bool io_tmp_rename(const std::string &tmp_file, const std::string &real_file)
Rename a temporary file to its final position.
Definition: io_utils.cc:573
Wrappers for low-level POSIX I/O routines.
Sharded database backend.
void sort(_RandomAccessIterator first, _RandomAccessIterator last, _Compare comp)
Definition: heap.h:277
double now()
Return the current time.
Definition: realtime.h:49
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
const int DB_BACKEND_HONEY
Use the honey backend.
Definition: constants.h:197
const int DB_BACKEND_GLASS
Use the glass backend.
Definition: constants.h:157
const int DBCOMPACT_NO_RENUMBER
Use the same document ids in the output as in the input(s).
Definition: constants.h:251
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
const int DBCOMPACT_SINGLE_FILE
Produce a single-file database.
Definition: constants.h:263
Various assertion macros.
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
Abstract base class for postlists.
include <fcntl.h>, but working around broken platforms.
include <sys/stat.h> with portability enhancements
<unistd.h>, but with compat.
static string srcdir
Definition: stemtest.cc:44
Convert types to std::string.
Various handy string-related helpers.
static bool tags