xapian-core  2.0.0
io_utils.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004-2025 Olly Betts
5  * Copyright (C) 2010 Richard Boulton
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include "io_utils.h"
25 #include "posixy_wrapper.h"
26 
27 #include "safeunistd.h"
28 
29 #include <cerrno>
30 #include <cstring>
31 #include <limits>
32 #include <string>
33 
34 #include <xapian/error.h>
35 
36 #include "omassert.h"
37 #include "str.h"
38 
39 #ifdef __WIN32__
40 # include "safewindows.h"
41 #endif
42 
43 // Trying to include the correct headers with the correct defines set to
44 // get pread() and pwrite() prototyped on every platform without breaking any
45 // other platform is a real can of worms. So instead we probe for what
46 // prototypes (if any) are required in configure and put them into
47 // PREAD_PROTOTYPE and PWRITE_PROTOTYPE.
48 #if defined HAVE_PREAD && defined PREAD_PROTOTYPE
49 PREAD_PROTOTYPE
50 #endif
51 #if defined HAVE_PWRITE && defined PWRITE_PROTOTYPE
52 PWRITE_PROTOTYPE
53 #endif
54 
55 bool
56 io_unlink(const std::string & filename)
57 {
58  if (posixy_unlink(filename.c_str()) == 0) {
59  return true;
60  }
61  if (errno != ENOENT) {
62  throw Xapian::DatabaseError(filename + ": delete failed", errno);
63  }
64  return false;
65 }
66 
// The smallest fd we want to use for a writable handle.
//
// We want to avoid using fd < MIN_WRITE_FD (i.e. fds 0, 1 and 2), in case
// some other code in the same process tries to write to stdout or stderr,
// which would end up corrupting our database.
const int MIN_WRITE_FD = 3;
73 
74 static int
76 {
77  int badfd = fd;
78 #ifdef F_DUPFD_CLOEXEC
79  // dup to the first unused fd >= MIN_WRITE_FD.
80  fd = fcntl(badfd, F_DUPFD_CLOEXEC, MIN_WRITE_FD);
81  // F_DUPFD_CLOEXEC may not be supported.
82  if (fd < 0 && errno == EINVAL)
83 #endif
84 #ifdef F_DUPFD
85  {
86  fd = fcntl(badfd, F_DUPFD, MIN_WRITE_FD);
87 # ifdef FD_CLOEXEC
88  if (fd >= 0)
89  (void)fcntl(fd, F_SETFD, FD_CLOEXEC);
90 # endif
91  }
92  int save_errno = errno;
93  (void)close(badfd);
94  errno = save_errno;
95 #else
96  {
97  char toclose[MIN_WRITE_FD];
98  memset(toclose, 0, sizeof(toclose));
99  fd = badfd;
100  do {
101  toclose[fd] = 1;
102  fd = dup(fd);
103  } while (fd >= 0 && fd < MIN_WRITE_FD);
104  int save_errno = errno;
105  for (badfd = 0; badfd != MIN_WRITE_FD; ++badfd)
106  if (toclose[badfd])
107  close(badfd);
108  if (fd < 0) {
109  errno = save_errno;
110  } else {
111 # ifdef FD_CLOEXEC
112  (void)fcntl(fd, F_SETFD, FD_CLOEXEC);
113 # endif
114  }
115  }
116 #endif
117  Assert(fd >= MIN_WRITE_FD || fd < 0);
118  return fd;
119 }
120 
121 static inline int
123 {
124  if (usual(fd >= MIN_WRITE_FD || fd < 0)) return fd;
125  return move_to_higher_fd_(fd);
126 }
127 
/** Protect against stray writes to fds we use pwrite() on.
 *
 *  Where both pread() and pwrite() are available, this seeks @a fd to a very
 *  large file offset chosen per platform (presumably so that an accidental
 *  write() through the fd doesn't land on real data - NOTE(review): confirm
 *  intent).  On other platforms it does nothing.  Failures from lseek() are
 *  deliberately ignored.
 *
 *  @param fd  The file descriptor to protect.
 */
static inline void
protect_from_write(int fd)
{
#if !defined HAVE_PREAD || !defined HAVE_PWRITE
    // No point setting the file position high here as it'll just get reset
    // by the first block read or write.
    (void)fd;
#elif defined __linux__
    // The maximum off_t value works for at least btrfs.
    if (lseek(fd, std::numeric_limits<off_t>::max(), SEEK_SET) < 0) {
        if constexpr (sizeof(off_t) > 4) {
            // Try the actual maximum for ext4 (which matches the documented
            // maximum filesize) since ext4 is very widely used.
            (void)lseek(fd, off_t(0xffffffff000), SEEK_SET);
        }
    }
#elif defined _AIX
    // It seems prudent to try the maximum off_t value first.
    if (lseek(fd, std::numeric_limits<off_t>::max(), SEEK_SET) < 0) {
        if constexpr (sizeof(off_t) > 4) {
            // Actual maximum seen in testing AIX 7.1 and 7.3 on JFS.
            (void)lseek(fd, off_t(0xffffffff000), SEEK_SET);
        }
    }
#elif defined __CYGWIN__ || \
      defined __DragonFly__ || \
      defined __FreeBSD__ || \
      defined __APPLE__ || \
      defined __NetBSD__ || \
      defined __OpenBSD__ || \
      defined __sun__
    // The maximum off_t value worked in testing on:
    // * Cygwin 3.6.5
    // * DragonFlyBSD 6.4.2
    // * FreeBSD 14.0 and 15.0
    // * macOS 10.10 and 12.6
    // * NetBSD 10.0
    // * OpenBSD 7.5
    // * Solaris 10 and 11.4
    (void)lseek(fd, std::numeric_limits<off_t>::max(), SEEK_SET);
#elif defined __EMSCRIPTEN__
    if constexpr (sizeof(off_t) > 4) {
        // Anything larger fails with EOVERFLOW (tested with Emscripten SDK
        // 4.0.19).
        (void)lseek(fd, off_t(0x20000000000000), SEEK_SET);
    }
#elif defined __WIN32__
    // For Microsoft Windows we open the file with CreateFile() and
    // FILE_FLAG_OVERLAPPED so write() will always fail with EINVAL which
    // protects us from accidental writes.  Tested on Microsoft Windows Server
    // 2025 10.0.26100.
    (void)fd;
#else
    // Unknown platform: no known-good offset, so do nothing.
    (void)fd;
#endif
}
190 
191 int
192 io_open_block_wr(const char* filename, bool anew)
193 {
194 #ifndef __WIN32__
195  // Use auto because on AIX O_CLOEXEC may be a 64-bit integer constant.
196  auto flags = O_RDWR | O_BINARY | O_CLOEXEC;
197  if (anew) flags |= O_CREAT | O_TRUNC;
198  int fd = ::open(filename, flags, 0666);
199 #else
200  // Microsoft Windows lacks the POSIX standard function pwrite().
201  // We can still protect the fd from accidental writes by opening the file
202  // with CreateFile() and specifying FILE_FLAG_OVERLAPPED, then wrapping the
203  // returned handle in a file descriptor to give a file descriptor for which
204  // write() fails with EINVAL. We implement an equivalent to pwrite() using
205  // WriteFile().
206  HANDLE handleWin =
207  CreateFile(filename,
208  GENERIC_READ | GENERIC_WRITE,
209  /* Subsequent operations may open this file to read, write
210  * or delete it */
211  FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
212  NULL,
213  anew ? CREATE_ALWAYS : OPEN_EXISTING,
214  FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED,
215  NULL);
216  if (handleWin == INVALID_HANDLE_VALUE) {
217  return posixy_set_errno_from_getlasterror();
218  }
219 
220  // Wrap in a standard file descriptor.
221  int fd = _open_osfhandle((intptr_t)(handleWin), O_RDWR|O_BINARY);
222 #endif
223  if (fd >= 0) {
224  fd = move_to_higher_fd(fd);
225  protect_from_write(fd);
226  }
227  return fd;
228 }
229 
230 int
231 io_open_stream_wr(const char* filename, bool anew)
232 {
233  // Use auto because on AIX O_CLOEXEC may be a 64-bit integer constant.
234  auto flags = O_RDWR | O_BINARY | O_CLOEXEC;
235  if (anew) flags |= O_CREAT | O_TRUNC;
236  int fd = ::open(filename, flags, 0666);
237  return move_to_higher_fd(fd);
238 }
239 
240 size_t
241 io_read(int fd, char * p, size_t n, size_t min)
242 {
243  size_t total = 0;
244  while (n) {
245  ssize_t c = read(fd, p, n);
246  if (c <= 0) {
247  if (c == 0) {
248  if (total >= min) break;
249  throw Xapian::DatabaseCorruptError("Couldn't read enough (EOF)");
250  }
251  if (errno == EINTR) continue;
252  throw Xapian::DatabaseError("Error reading from file", errno);
253  }
254  p += c;
255  total += c;
256  n -= c;
257  }
258  return total;
259 }
260 
262 void
263 io_write(int fd, const char * p, size_t n)
264 {
265  while (n) {
266  ssize_t c = write(fd, p, n);
267  if (c < 0) {
268  if (errno == EINTR) continue;
269  throw Xapian::DatabaseError("Error writing to file", errno);
270  }
271  p += c;
272  n -= c;
273  }
274 }
275 
276 size_t
277 io_pread(int fd, char * p, size_t n, off_t o, size_t min)
278 {
279 #ifdef HAVE_PREAD
280  size_t total = 0;
281  while (true) {
282  ssize_t c = pread(fd, p, n, o);
283  // We should get a full read most of the time, so streamline that case.
284  if (usual(c == ssize_t(n)))
285  return total + n;
286  // -1 is error, 0 is EOF
287  if (c <= 0) {
288  if (c == 0) {
289  if (min == 0)
290  return total;
291  throw Xapian::DatabaseError("EOF reading database");
292  }
293  // We get EINTR if the syscall was interrupted by a signal.
294  // In this case we should retry the read.
295  if (errno == EINTR) continue;
296  throw Xapian::DatabaseError("Error reading database", errno);
297  }
298  total += c;
299  if (total >= min)
300  return total;
301  p += c;
302  n -= c;
303  o += c;
304  }
305 #elif defined __WIN32__
306  HANDLE h = (HANDLE)_get_osfhandle(fd);
307  if (h == INVALID_HANDLE_VALUE) {
308  // _get_osfhandle() sets errno to EBADF.
309  throw Xapian::DatabaseError("Error reading database", errno);
310  }
311 
312  OVERLAPPED overlapped;
313  memset(&overlapped, 0, sizeof(overlapped));
314  overlapped.Offset = (DWORD)o;
315  if constexpr (sizeof(off_t) > 4) {
316  overlapped.OffsetHigh = o >> 32;
317  }
318  DWORD c;
319  if (!ReadFile(h, p, n, &c, &overlapped)) {
320  if (GetLastError() != ERROR_IO_PENDING ||
321  !GetOverlappedResult(h,
322  &overlapped,
323  &c,
324  TRUE)) {
325  throw Xapian::DatabaseError("Error reading database",
326  -GetLastError());
327  }
328  }
329  if (c < min) {
330  throw Xapian::DatabaseError("EOF reading database");
331  }
332  return c;
333 #else
334  size_t total = 0;
335  if (rare(lseek(fd, o, SEEK_SET) < 0))
336  throw Xapian::DatabaseError("Error seeking database", errno);
337  while (true) {
338  ssize_t c = read(fd, p, n);
339  // We should get a full read most of the time, so streamline that case.
340  if (usual(c == ssize_t(n)))
341  return total + n;
342  if (c <= 0) {
343  if (c == 0) {
344  if (min == 0)
345  return total;
346  throw Xapian::DatabaseError("EOF reading database");
347  }
348  // We get EINTR if the syscall was interrupted by a signal.
349  // In this case we should retry the read.
350  if (errno == EINTR) continue;
351  throw Xapian::DatabaseError("Error reading database", errno);
352  }
353  total += c;
354  if (total >= min)
355  return total;
356  p += c;
357  n -= c;
358  }
359 #endif
360 }
361 
362 void
363 io_pwrite(int fd, const char * p, size_t n, off_t o)
364 {
365 #ifdef HAVE_PWRITE
366  while (n) {
367  ssize_t c = pwrite(fd, p, n, o);
368  // We should get a full write most of the time, so streamline that
369  // case.
370  if (usual(c == ssize_t(n)))
371  return;
372  if (c < 0) {
373  if (errno == EINTR) continue;
374  throw Xapian::DatabaseError("Error writing database", errno);
375  }
376  p += c;
377  n -= c;
378  o += c;
379  }
380 #elif defined __WIN32__
381  HANDLE h = (HANDLE)_get_osfhandle(fd);
382  if (h == INVALID_HANDLE_VALUE) {
383  // _get_osfhandle() sets errno to EBADF.
384  throw Xapian::DatabaseError("Error writing database", errno);
385  }
386 
387  OVERLAPPED overlapped;
388  memset(&overlapped, 0, sizeof(overlapped));
389  overlapped.Offset = (DWORD)o;
390  if constexpr (sizeof(off_t) > 4) {
391  overlapped.OffsetHigh = o >> 32;
392  }
393  DWORD c;
394  if (!WriteFile(h, p, n, &c, &overlapped)) {
395  if (GetLastError() != ERROR_IO_PENDING ||
396  !GetOverlappedResult(h,
397  &overlapped,
398  &c,
399  TRUE)) {
400  throw Xapian::DatabaseError("Error writing database",
401  -GetLastError());
402  }
403  }
404 #else
405  if (rare(lseek(fd, o, SEEK_SET) < 0))
406  throw Xapian::DatabaseError("Error seeking database", errno);
407  io_write(fd, p, n);
408 #endif
409 }
410 
411 [[noreturn]]
412 static void
413 throw_block_error(const char * s, off_t b, int e = 0)
414 {
415  std::string m = s;
416  m += str(b);
417  throw Xapian::DatabaseError(m, e);
418 }
419 
#ifdef HAVE_POSIX_FADVISE
/** Readahead block b size n bytes from file descriptor fd.
 *
 *  @param fd  File descriptor the block will be read from.
 *  @param n   Block size in bytes.
 *  @param b   Block number.
 *  @param o   Byte offset at which block 0 starts.
 *
 *  @return true if posix_fadvise() reported success, false otherwise.
 */
bool
io_readahead_block(int fd, size_t n, off_t b, off_t o)
{
    const off_t block_start = o + b * n;
    // A false return lets the caller assume that further calls for the same
    // fd are likely to fail too.
    const int rc = posix_fadvise(fd, block_start, n, POSIX_FADV_WILLNEED);
    return rc == 0;
}
#endif
430 
431 void
432 io_read_block(int fd, char * p, size_t n, off_t b, off_t o)
433 {
434  o += b * n;
435  // Prefer pread if available since it's typically implemented as a
436  // separate syscall, and that eliminates the overhead of an extra syscall
437  // per block read.
438 #ifdef HAVE_PREAD
439  while (true) {
440  ssize_t c = pread(fd, p, n, o);
441  // We should get a full read most of the time, so streamline that case.
442  if (usual(c == ssize_t(n)))
443  return;
444  // -1 is error, 0 is EOF
445  if (c <= 0) {
446  if (c == 0)
447  throw_block_error("EOF reading block ", b);
448  // We get EINTR if the syscall was interrupted by a signal.
449  // In this case we should retry the read.
450  if (errno == EINTR) continue;
451  throw_block_error("Error reading block ", b, errno);
452  }
453  p += c;
454  n -= c;
455  o += c;
456  }
457 #elif defined __WIN32__
458  // Using ReadFile() seems to be faster than lseek() and read().
459  HANDLE h = (HANDLE)_get_osfhandle(fd);
460  if (h == INVALID_HANDLE_VALUE) {
461  // _get_osfhandle() sets errno to EBADF.
462  throw_block_error("Error reading block ", b, errno);
463  }
464 
465  OVERLAPPED overlapped;
466  memset(&overlapped, 0, sizeof(overlapped));
467  overlapped.Offset = (DWORD)o;
468  if constexpr (sizeof(off_t) > 4) {
469  overlapped.OffsetHigh = o >> 32;
470  }
471  DWORD c;
472  if (!ReadFile(h, p, n, &c, &overlapped)) {
473  if (GetLastError() != ERROR_IO_PENDING ||
474  !GetOverlappedResult(h,
475  &overlapped,
476  &c,
477  TRUE)) {
478  throw_block_error("Error reading block ", b, -GetLastError());
479  }
480  }
481  if (c != n) {
482  throw_block_error("EOF reading block ", b);
483  }
484 #else
485  if (rare(lseek(fd, o, SEEK_SET) < 0))
486  throw_block_error("Error seeking to block ", b, errno);
487  while (true) {
488  ssize_t c = read(fd, p, n);
489  // We should get a full read most of the time, so streamline that case.
490  if (usual(c == ssize_t(n)))
491  return;
492  if (c <= 0) {
493  if (c == 0)
494  throw_block_error("EOF reading block ", b);
495  // We get EINTR if the syscall was interrupted by a signal.
496  // In this case we should retry the read.
497  if (errno == EINTR) continue;
498  throw_block_error("Error reading block ", b, errno);
499  }
500  p += c;
501  n -= c;
502  }
503 #endif
504 }
505 
506 void
507 io_write_block(int fd, const char * p, size_t n, off_t b, off_t o)
508 {
509  o += b * n;
510  // Prefer pwrite if available since it's typically implemented as a
511  // separate syscall, and that eliminates the overhead of an extra syscall
512  // per block write.
513 #ifdef HAVE_PWRITE
514  while (true) {
515  ssize_t c = pwrite(fd, p, n, o);
516  // We should get a full write most of the time, so streamline that case.
517  if (usual(c == ssize_t(n)))
518  return;
519  if (c < 0) {
520  // We get EINTR if the syscall was interrupted by a signal.
521  // In this case we should retry the write.
522  if (errno == EINTR) continue;
523  throw_block_error("Error writing block ", b, errno);
524  }
525  p += c;
526  n -= c;
527  o += c;
528  }
529 #elif defined __WIN32__
530  HANDLE h = (HANDLE)_get_osfhandle(fd);
531  if (h == INVALID_HANDLE_VALUE) {
532  // _get_osfhandle() sets errno to EBADF.
533  throw_block_error("Error writing block ", b, errno);
534  }
535 
536  OVERLAPPED overlapped;
537  memset(&overlapped, 0, sizeof(overlapped));
538  overlapped.Offset = (DWORD)o;
539  if constexpr (sizeof(off_t) > 4) {
540  overlapped.OffsetHigh = o >> 32;
541  }
542  DWORD c;
543  if (!WriteFile(h, p, n, &c, &overlapped)) {
544  if (GetLastError() != ERROR_IO_PENDING ||
545  !GetOverlappedResult(h,
546  &overlapped,
547  &c,
548  TRUE)) {
549  throw_block_error("Error writing block ", b, -GetLastError());
550  }
551  }
552 #else
553  if (rare(lseek(fd, o, SEEK_SET) < 0))
554  throw_block_error("Error seeking to block ", b, errno);
555  while (true) {
556  ssize_t c = write(fd, p, n);
557  // We should get a full write most of the time, so streamline that case.
558  if (usual(c == ssize_t(n)))
559  return;
560  if (c < 0) {
561  // We get EINTR if the syscall was interrupted by a signal.
562  // In this case we should retry the write.
563  if (errno == EINTR) continue;
564  throw_block_error("Error writing block ", b, errno);
565  }
566  p += c;
567  n -= c;
568  }
569 #endif
570 }
571 
572 bool
573 io_tmp_rename(const std::string & tmp_file, const std::string & real_file)
574 {
575 #ifdef EXDEV
576  // We retry on EXDEV a few times as some older Linux kernels are buggy and
577  // fail with EXDEV when the two files are on the same device (as they
578  // always ought to be when this function is used). Don't retry forever in
579  // case someone calls this with files on different devices.
580  //
581  // We're not sure exactly which kernels are buggy in this way, but there's
582  // discussion here: https://www.spinics.net/lists/linux-nfs/msg17306.html
583  //
584  // Reported at: https://trac.xapian.org/ticket/698
585  int retries = 5;
586 retry:
587 #endif
588  if (posixy_rename(tmp_file.c_str(), real_file.c_str()) < 0) {
589 #ifdef EXDEV
590  if (errno == EXDEV && --retries > 0) goto retry;
591 #endif
592  // With NFS, rename() failing may just mean that the server crashed
593  // after successfully renaming, but before reporting this, and then
594  // the retried operation fails. So we need to check if the source
595  // file still exists, which we do by calling unlink(), since we want
596  // to remove the temporary file anyway.
597  int saved_errno = errno;
598  if (unlink(tmp_file.c_str()) == 0 || errno != ENOENT) {
599  errno = saved_errno;
600  return false;
601  }
602  }
603  return true;
604 }
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
DatabaseError indicates some sort of database related error.
Definition: error.h:355
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
int close(FD &fd)
Definition: fd.h:63
void io_read_block(int fd, char *p, size_t n, off_t b, off_t o)
Read block b size n bytes into buffer p from file descriptor fd, offset o.
Definition: io_utils.cc:432
int io_open_block_wr(const char *filename, bool anew)
Open a block-based file for writing.
Definition: io_utils.cc:192
bool io_unlink(const std::string &filename)
Delete a file.
Definition: io_utils.cc:56
void io_write(int fd, const char *p, size_t n)
Write n bytes from block pointed to by p to file descriptor fd.
Definition: io_utils.cc:263
const int MIN_WRITE_FD
Definition: io_utils.cc:72
void io_pwrite(int fd, const char *p, size_t n, off_t o)
Write n bytes from block pointed to by p to file descriptor fd starting at position o.
Definition: io_utils.cc:363
size_t io_read(int fd, char *p, size_t n, size_t min)
Read n bytes (or until EOF) into block pointed to by p from file descriptor fd.
Definition: io_utils.cc:241
static int move_to_higher_fd_(int fd)
Definition: io_utils.cc:75
static void protect_from_write(int fd)
Protect against stray writes to fds we use pwrite() on.
Definition: io_utils.cc:135
int io_open_stream_wr(const char *filename, bool anew)
Open a stream-based file for writing.
Definition: io_utils.cc:231
size_t io_pread(int fd, char *p, size_t n, off_t o, size_t min)
Read n bytes (or until EOF) into block pointed to by p from file descriptor fd starting at position o...
Definition: io_utils.cc:277
bool io_tmp_rename(const std::string &tmp_file, const std::string &real_file)
Rename a temporary file to its final position.
Definition: io_utils.cc:573
void io_write_block(int fd, const char *p, size_t n, off_t b, off_t o)
Write block b size n bytes from buffer p to file descriptor fd, offset o.
Definition: io_utils.cc:507
static void throw_block_error(const char *s, off_t b, int e=0)
Definition: io_utils.cc:413
static int move_to_higher_fd(int fd)
Definition: io_utils.cc:122
Wrappers for low-level POSIX I/O routines.
bool io_readahead_block(int, size_t, off_t, off_t=0)
Readahead block b size n bytes from file descriptor fd.
Definition: io_utils.h:190
string str(int value)
Convert int to std::string.
Definition: str.cc:91
Database open(std::string_view host, unsigned int port, unsigned timeout=10000, unsigned connect_timeout=10000)
Construct a Database object for read-only access to a remote database accessed via a TCP connection.
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Provides wrappers with POSIXy semantics.
#define posixy_rename(F, T)
#define posixy_unlink(F)
#define O_BINARY
Definition: safefcntl.h:80
#define O_CLOEXEC
Definition: safefcntl.h:89
<unistd.h>, but with compat.
include <windows.h> without all the bloat and damage.
Convert types to std::string.