summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Lennart Poettering <lennart@poettering.net> 2021-10-19 14:56:49 +0200
committer: Lennart Poettering <lennart@poettering.net> 2021-10-20 11:35:15 +0200
commit: 4a4654e0241fbeabecb8587fd3520b6b39264b9c (patch)
tree: 2e4366a646eb12e254fc631e344a832987aa27c6
parent: 231c7645ca761f0347c98fa48c68b3fde00fbc15 (diff)
download: systemd-4a4654e0241fbeabecb8587fd3520b6b39264b9c.tar.gz
nspawn: add --suppress-sync=yes mode for turning sync() and friends into NOPs via seccomp
This is supposed to be used by package/image builders such as mkosi to speed up building, since it allows us to suppress sync() inside a container. This does what Debian's eatmydata tool does, but for a container, and via seccomp (instead of LD_PRELOAD).
-rw-r--r-- docs/ENVIRONMENT.md | 6
-rw-r--r-- man/systemd-nspawn.xml | 18
-rw-r--r-- man/systemd.nspawn.xml | 10
-rw-r--r-- shell-completion/bash/systemd-nspawn | 2
-rw-r--r-- src/nspawn/nspawn-gperf.gperf | 1
-rw-r--r-- src/nspawn/nspawn-settings.h | 8
-rw-r--r-- src/nspawn/nspawn.c | 30
-rw-r--r-- src/shared/seccomp-util.c | 95
-rw-r--r-- src/shared/seccomp-util.h | 2
9 files changed, 167 insertions, 5 deletions
diff --git a/docs/ENVIRONMENT.md b/docs/ENVIRONMENT.md
index 9a824820da..328934cd17 100644
--- a/docs/ENVIRONMENT.md
+++ b/docs/ENVIRONMENT.md
@@ -138,6 +138,12 @@ All tools:
* `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
container with a tmpfs, but leave the directory from the image in place.
+* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls issued
+ by the container payload (e.g. `sync()`, `fsync()`, `syncfs()`, …) are turned
+ into no-ops, and the `O_SYNC`/`O_DSYNC` flags are made unavailable to `open()`
+ and friends. This is equivalent to passing `--suppress-sync=yes` on the
+ `systemd-nspawn` command line.
+
`systemd-logind`:
* `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index e84ac6ae42..aec0b0e129 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -570,6 +570,24 @@
before sending its own to systemd. For more details about notifications
see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
</varlistentry>
+
+ <varlistentry>
+ <term><option>--suppress-sync=</option></term>
+
+ <listitem><para>Expects a boolean argument. If true, turns off any form of on-disk file system
+ synchronization for the container payload. This means all system calls such as <citerefentry
+ project='man-pages'><refentrytitle>sync</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
+ <function>fsync()</function>, <function>syncfs()</function>, … will execute no operation, and the
+ <constant>O_SYNC</constant>/<constant>O_DSYNC</constant> flags to <citerefentry
+ project='man-pages'><refentrytitle>open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
+ related calls will be made unavailable. This is potentially dangerous, as assumed data integrity
+ guarantees to the container payload are not actually enforced (i.e. data assumed to have been written
+ to disk might be lost if the system is shut down abnormally). However, this can dramatically improve
+ container runtime performance – as long as these guarantees are not required or desirable, for
+ example because any data written by the container is of temporary, redundant nature, or just an
+ intermediary artifact that will be further processed and finalized by a later step in a
+ pipeline. Defaults to false.</para></listitem>
+ </varlistentry>
</variablelist>
</refsect2><refsect2>
diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml
index dc0e2f9fd2..bb9bf4b5d9 100644
--- a/man/systemd.nspawn.xml
+++ b/man/systemd.nspawn.xml
@@ -365,6 +365,16 @@
details.</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>SuppressSync=</varname></term>
+
+ <listitem><para>Configures whether to suppress disk synchronization for the container payload. This
+ is equivalent to the <option>--suppress-sync=</option> command line switch, and takes the same
+ parameter. See
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+ for details.</para></listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
diff --git a/shell-completion/bash/systemd-nspawn b/shell-completion/bash/systemd-nspawn
index f367c7d14c..3b6d65d96a 100644
--- a/shell-completion/bash/systemd-nspawn
+++ b/shell-completion/bash/systemd-nspawn
@@ -63,7 +63,7 @@ _systemd_nspawn() {
local -A OPTS=(
[STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
- --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U'
+ --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes'
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
-M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
--register --network-interface --network-bridge --personality -i --image --tmpfs
diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf
index ea15e27148..4af00c8d95 100644
--- a/src/nspawn/nspawn-gperf.gperf
+++ b/src/nspawn/nspawn-gperf.gperf
@@ -59,6 +59,7 @@ Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0
Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf)
Exec.LinkJournal, config_parse_link_journal, 0, 0
Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone)
+Exec.SuppressSync, config_parse_bool, 0, offsetof(Settings, suppress_sync)
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index 939e1c757b..1b3ace5f8f 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -127,9 +127,10 @@ typedef enum SettingsMask {
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
SETTING_CREDENTIALS = UINT64_C(1) << 30,
SETTING_BIND_USER = UINT64_C(1) << 31,
- SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */
- SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
- _SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
+ SETTING_SUPPRESS_SYNC = UINT64_C(1) << 32,
+ SETTING_RLIMIT_FIRST = UINT64_C(1) << 33, /* we define one bit per resource limit here */
+ SETTING_RLIMIT_LAST = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
+ _SETTINGS_MASK_ALL = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
} SettingsMask;
@@ -189,6 +190,7 @@ typedef struct Settings {
LinkJournal link_journal;
bool link_journal_try;
TimezoneMode timezone;
+ bool suppress_sync;
/* [Files] */
int read_only;
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 7b767fb296..7dbc84369b 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -229,6 +229,7 @@ static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
+static bool arg_suppress_sync = false;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
@@ -342,7 +343,9 @@ static int help(void) {
" -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
" -u --user=USER Run the command under specified user or UID\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
- " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
+ " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
+ " --suppress-sync=BOOLEAN\n"
+ " Suppress any form of disk data synchronization\n\n"
"%3$sSystem Identity:%4$s\n"
" -M --machine=NAME Set the machine name for the container\n"
" --hostname=NAME Override the hostname for the container\n"
@@ -654,6 +657,12 @@ static int parse_environment(void) {
if (e)
arg_container_service_name = e;
+ r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
+ if (r >= 0)
+ arg_suppress_sync = r;
+ else if (r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
+
return detect_unified_cgroup_hierarchy_from_environment();
}
@@ -713,6 +722,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_SET_CREDENTIAL,
ARG_LOAD_CREDENTIAL,
ARG_BIND_USER,
+ ARG_SUPPRESS_SYNC,
};
static const struct option options[] = {
@@ -785,6 +795,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
+ { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
{}
};
@@ -1668,6 +1679,14 @@ static int parse_argv(int argc, char *argv[]) {
arg_settings_mask |= SETTING_BIND_USER;
break;
+ case ARG_SUPPRESS_SYNC:
+ r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
+ if (r < 0)
+ return r;
+
+ arg_settings_mask |= SETTING_SUPPRESS_SYNC;
+ break;
+
case '?':
return -EINVAL;
@@ -3385,6 +3404,12 @@ static int inner_child(
return r;
}
+ if (arg_suppress_sync) {
+ r = seccomp_suppress_sync();
+ if (r < 0)
+ log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
+ }
+
#if HAVE_SELINUX
if (arg_selinux_context)
if (setexeccon(arg_selinux_context) < 0)
@@ -4552,6 +4577,9 @@ static int merge_settings(Settings *settings, const char *path) {
arg_console_mode = settings->console_mode;
}
+ if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0)
+ arg_suppress_sync = settings->suppress_sync;
+
/* The following properties can only be set through the OCI settings logic, not from the command line, hence we
* don't consult arg_settings_mask for them. */
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index 31d6b542c0..ff90af538b 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -2205,3 +2205,98 @@ int parse_syscall_and_errno(const char *in, char **name, int *error) {
return 0;
}
+
+static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
+ bool any = false; /* set once at least one of the rules below was added successfully */
+ int r;
+
+ /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
+ * EINVAL, in the hope the client code will retry without O_SYNC then.
+ *
+ * The rules are only added to the filter context passed in as 'seccomp'; nothing is loaded here.
+ * Each rule compares the flags argument (argument 1 of open(), argument 2 of openat()) using
+ * SCMP_CMP_MASKED_EQ with 'flag' as both mask and value. NOTE(review): presumably this matches
+ * whenever all bits of 'flag' are set in the argument — confirm against the libseccomp docs.
+ *
+ * The open() rule is only compiled in if SCMP_SYS(open) evaluates to a positive number, and the
+ * openat2() rule only if __SNR_openat2 is defined.
+ *
+ * Failures to add an individual rule are logged via log_debug_errno() and otherwise ignored.
+ * Returns 0 if at least one rule could be added, otherwise the result of the last attempt. */
+
+#if SCMP_SYS(open) > 0
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EINVAL),
+ SCMP_SYS(open),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for open: %m");
+ else
+ any = true;
+#endif
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EINVAL),
+ SCMP_SYS(openat),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for openat: %m");
+ else
+ any = true;
+
+#if defined(__SNR_openat2)
+ /* The new openat2() system call can't be filtered sensibly, see above. Hence the rule below
+ * carries no argument comparison at all and makes every openat2() call fail with ENOSYS.
+ * NOTE(review): presumably so that callers fall back to openat() — confirm. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(ENOSYS),
+ SCMP_SYS(openat2),
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for openat2: %m");
+ else
+ any = true;
+#endif
+
+ return any ? 0 : r; /* success if any rule went in, else the result of the last rule attempt */
+}
+
+int seccomp_suppress_sync(void) {
+ uint32_t arch;
+ int r;
+
+ /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
+ * manageable, and also masks O_SYNC/O_DSYNC
+ *
+ * For each architecture enumerated by SECCOMP_FOREACH_LOCAL_ARCH() a separate filter with default
+ * action SCMP_ACT_ALLOW is built and loaded. In it, every system call named in the
+ * SYSCALL_FILTER_SET_SYNC set gets a rule with action SCMP_ACT_ERRNO(0), and open()/openat() with
+ * O_SYNC/O_DSYNC are refused via block_open_flag().
+ *
+ * Returns 0 on success. Returns the negative error of seccomp_init_for_arch() if a filter cannot
+ * be initialized, and the result of seccomp_load() if ERRNO_IS_SECCOMP_FATAL() classifies it as
+ * fatal. All other failures are only logged at debug level and skipped. */
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; /* one fresh filter context per architecture */
+ const char *c;
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
+ int id;
+
+ id = seccomp_syscall_resolve_name(c);
+ if (id == __NR_SCMP_ERROR) {
+ log_debug("System call %s is not known, ignoring.", c);
+ continue; /* names that cannot be resolved get no rule */
+ }
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
+ id,
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
+ }
+
+ (void) block_open_flag(seccomp, O_SYNC); /* best effort: the result is deliberately ignored */
+#if O_DSYNC != O_SYNC
+ (void) block_open_flag(seccomp, O_DSYNC); /* only needed if O_DSYNC is a distinct value */
+#endif
+
+ r = seccomp_load(seccomp);
+ if (ERRNO_IS_SECCOMP_FATAL(r))
+ return r;
+ if (r < 0) /* non-fatal load failure: log it and carry on with the next architecture */
+ log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index b3d25c9f3f..4f4bc48431 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -150,3 +150,5 @@ static inline const char *seccomp_errno_or_action_to_string(int num) {
}
int parse_syscall_and_errno(const char *in, char **name, int *error);
+
+int seccomp_suppress_sync(void);