diff options
-rw-r--r-- | docs/ENVIRONMENT.md | 5 | ||||
-rw-r--r-- | src/basic/missing_loop.h | 4 | ||||
-rw-r--r-- | src/shared/loop-util.c | 137 |
3 files changed, 140 insertions, 6 deletions
diff --git a/docs/ENVIRONMENT.md b/docs/ENVIRONMENT.md index aba9ede259..9a824820da 100644 --- a/docs/ENVIRONMENT.md +++ b/docs/ENVIRONMENT.md @@ -369,6 +369,11 @@ disk images with `--image=` or similar: directories in `/usr/lib/`, `/run`, …) or passed to the kernel for validation against its built-in certificates. +* `$SYSTEMD_LOOP_DIRECT_IO` – takes a boolean, which controls whether to enable + LO_FLAGS_DIRECT_IO (i.e. direct IO + asynchronous IO) on loopback block + devices when opening them. Defaults to on, set this to "0" to disable this + feature. + `systemd-cryptsetup`: * `$SYSTEMD_CRYPTSETUP_USE_TOKEN_MODULE` – takes a boolean, which controls diff --git a/src/basic/missing_loop.h b/src/basic/missing_loop.h index 99e90543ff..5fe63ad1ca 100644 --- a/src/basic/missing_loop.h +++ b/src/basic/missing_loop.h @@ -18,3 +18,7 @@ struct loop_config { #ifndef BLKGETDISKSEQ #define BLKGETDISKSEQ _IOR(0x12,128,__u64) #endif + +#ifndef LOOP_SET_STATUS_SETTABLE_FLAGS +#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR | LO_FLAGS_PARTSCAN) +#endif diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c index 933756cf80..072acc8c47 100644 --- a/src/shared/loop-util.c +++ b/src/shared/loop-util.c @@ -18,6 +18,7 @@ #include "alloc-util.h" #include "blockdev-util.h" #include "device-util.h" +#include "env-util.h" #include "errno-util.h" #include "fd-util.h" #include "fileio.h" @@ -159,6 +160,7 @@ static int loop_configure( _cleanup_(sd_device_unrefp) sd_device *d = NULL; _cleanup_free_ char *sysname = NULL; _cleanup_close_ int lock_fd = -1; + struct loop_info64 info_copy; uint64_t seqnum; usec_t timestamp; int r; @@ -305,8 +307,13 @@ static int loop_configure( if (ioctl(fd, LOOP_SET_FD, c->fd) < 0) return -errno; + /* Only some of the flags LOOP_CONFIGURE can set are also settable via LOOP_SET_STATUS64, hence mask + * them out. 
*/ + info_copy = c->info; + info_copy.lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; + for (unsigned n_attempts = 0;;) { - if (ioctl(fd, LOOP_SET_STATUS64, &c->info) >= 0) + if (ioctl(fd, LOOP_SET_STATUS64, &info_copy) >= 0) break; if (errno != EAGAIN || ++n_attempts >= 64) { r = log_debug_errno(errno, "Failed to configure loopback device: %m"); @@ -319,6 +326,29 @@ static int loop_configure( random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); } + /* Work around a kernel bug, where changing offset/size of the loopback device doesn't correctly + * invalidate the buffer cache. For details see: + * + * https://android.googlesource.com/platform/system/apex/+/bef74542fbbb4cd629793f4efee8e0053b360570 + * + * This was fixed in kernel 5.0, see: + * + * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5db470e229e22b7eda6e23b5566e532c96fb5bc3 + * + * We'll run the work-around here in the legacy LOOP_SET_STATUS64 codepath. In the LOOP_CONFIGURE + * codepath above it should not be necessary. */ + if (c->info.lo_offset != 0 || c->info.lo_sizelimit != 0) + if (ioctl(fd, BLKFLSBUF, 0) < 0) + log_debug_errno(errno, "Failed to issue BLKFLSBUF ioctl, ignoring: %m"); + + /* LO_FLAGS_DIRECT_IO is a flag we need to configure via explicit ioctls. 
*/ + if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO)) { + unsigned long b = 1; + + if (ioctl(fd, LOOP_SET_DIRECT_IO, b) < 0) + log_debug_errno(errno, "Failed to enable direct IO mode on loopback device /dev/loop%i, ignoring: %m", nr); + } + if (ret_seqnum_not_before) *ret_seqnum_not_before = seqnum; if (ret_timestamp_not_before) @@ -369,7 +399,7 @@ static int attach_empty_file(int loop, int nr) { return 0; } -int loop_device_make( +static int loop_device_make_internal( int fd, int open_flags, uint64_t offset, @@ -377,14 +407,15 @@ int loop_device_make( uint32_t loop_flags, LoopDevice **ret) { + _cleanup_close_ int direct_io_fd = -1; _cleanup_free_ char *loopdev = NULL; bool try_loop_configure = true; struct loop_config config; LoopDevice *d = NULL; uint64_t seqnum = UINT64_MAX; usec_t timestamp = USEC_INFINITY; + int nr = -1, r, f_flags; struct stat st; - int nr = -1, r; assert(fd >= 0); assert(ret); @@ -444,6 +475,30 @@ int loop_device_make( return r; } + f_flags = fcntl(fd, F_GETFL); + if (f_flags < 0) + return -errno; + + if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) != FLAGS_SET(f_flags, O_DIRECT)) { + /* If LO_FLAGS_DIRECT_IO is requested, then make sure we have the fd open with O_DIRECT, as + * that's required. Conversely, if it's off require that O_DIRECT is off too (that's because + * new kernels will implicitly enable LO_FLAGS_DIRECT_IO if O_DIRECT is set). + * + * Our intention here is that LO_FLAGS_DIRECT_IO is the primary knob, and O_DIRECT derived + * from that automatically. */ + + direct_io_fd = fd_reopen(fd, (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0)|O_CLOEXEC|O_NONBLOCK|open_flags); + if (direct_io_fd < 0) { + if (!FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) + return log_debug_errno(errno, "Failed to reopen file descriptor without O_DIRECT: %m"); + + /* Some file systems might not support O_DIRECT, let's gracefully continue without it then. 
*/ + log_debug_errno(errno, "Failed to enable O_DIRECT for backing file descriptor for loopback device. Continuing without."); + loop_flags &= ~LO_FLAGS_DIRECT_IO; + } else + fd = direct_io_fd; /* From now on, operate on our new O_DIRECT fd */ + } + _cleanup_close_ int control = -1; _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -1; @@ -505,6 +560,28 @@ int loop_device_make( UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); } + if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) { + struct loop_info64 info; + + if (ioctl(loop_with_fd, LOOP_GET_STATUS64, &info) < 0) + return -errno; + +#if HAVE_VALGRIND_MEMCHECK_H + VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info)); +#endif + + /* On older kernels (<= 5.3) it was necessary to set the block size of the loopback block + * device to the logical block size of the underlying file system. Since there was no nice + * way to query the value, we are not bothering to do this however. On newer kernels the + * block size is propagated automatically and does not require intervention from us. We'll + * check here if enabling direct IO worked, to make this easily debuggable however. + * + * (Should anyone really care and actually wants direct IO on old kernels: it might be worth + * enabling direct IO with iteratively larger block sizes until it eventually works.) */ + if (!FLAGS_SET(info.lo_flags, LO_FLAGS_DIRECT_IO)) + log_debug("Could not enable direct IO mode, proceeding in buffered IO mode."); + } + if (fstat(loop_with_fd, &st) < 0) return -errno; assert(S_ISBLK(st.st_mode)); @@ -531,14 +608,48 @@ int loop_device_make( return d->fd; } +static uint32_t loop_flags_mangle(uint32_t loop_flags) { + int r; + + r = getenv_bool("SYSTEMD_LOOP_DIRECT_IO"); + if (r < 0 && r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_LOOP_DIRECT_IO, ignoring: %m"); + + SET_FLAG(loop_flags, LO_FLAGS_DIRECT_IO, r != 0); /* Turn on LO_FLAGS_DIRECT_IO by default, unless explicitly configured to off. 
*/ + return loop_flags; +} + +int loop_device_make( + int fd, + int open_flags, + uint64_t offset, + uint64_t size, + uint32_t loop_flags, + LoopDevice **ret) { + + assert(fd >= 0); + assert(ret); + assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); + + loop_flags = loop_flags_mangle(loop_flags); + + return loop_device_make_internal( + fd, + open_flags, + offset, + size, + loop_flags, + ret); +} + int loop_device_make_by_path( const char *path, int open_flags, uint32_t loop_flags, LoopDevice **ret) { + int r, basic_flags, direct_flags, rdwr_flags; _cleanup_close_ int fd = -1; - int r; assert(path); assert(ret); @@ -547,7 +658,18 @@ int loop_device_make_by_path( /* Passing < 0 as open_flags here means we'll try to open the device writable if we can, retrying * read-only if we cannot. */ - fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|(open_flags >= 0 ? open_flags : O_RDWR)); + loop_flags = loop_flags_mangle(loop_flags); + + /* Let's open with O_DIRECT if we can. But not all file systems support that, hence fall back to + * non-O_DIRECT mode automatically, if it fails. */ + + basic_flags = O_CLOEXEC|O_NONBLOCK|O_NOCTTY; + direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0; + rdwr_flags = open_flags >= 0 ? 
open_flags : O_RDWR; + + fd = open(path, basic_flags|direct_flags|rdwr_flags); + if (fd < 0 && direct_flags != 0) /* If we had O_DIRECT on, and things failed with that, let's immediately try again without */ + fd = open(path, basic_flags|rdwr_flags); if (fd < 0) { r = -errno; @@ -555,7 +677,9 @@ int loop_device_make_by_path( if (open_flags >= 0 || !(ERRNO_IS_PRIVILEGE(r) || r == -EROFS)) return r; - fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY); + fd = open(path, basic_flags|direct_flags|O_RDONLY); + if (fd < 0 && direct_flags != 0) /* as above */ + fd = open(path, basic_flags|O_RDONLY); if (fd < 0) return r; /* Propagate original error */ @@ -626,6 +750,7 @@ int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) { int nr; assert(loop_path); + assert(IN_SET(open_flags, O_RDWR, O_RDONLY)); assert(ret); loop_fd = open(loop_path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags); |