From b33c333bcb88557ad23a9bc5be0d619d537984e9 Mon Sep 17 00:00:00 2001 From: Simon McVittie Date: Tue, 22 Mar 2022 17:12:33 +0000 Subject: Add an option to disable nested user namespaces by setting limit to 1 Some use-cases of bubblewrap want to ensure that the subprocess can't further re-arrange the filesystem namespace, or do other more complex namespace modification. For example, Flatpak wants to prevent sandboxed processes from altering their /proc/$pid/root/.flatpak-info, so that /.flatpak-info can safely be used as an indicator that a process is part of a Flatpak app. This approach was suggested by lukts30 on containers/bubblewrap#452. The sysctl-controlled maximum numbers of namespaces are themselves namespaced, so we can disable nested user namespaces by setting the limit to 1 and then entering a new, nested user namespace. The resulting process loses its privileges in the namespace where the limit was set to 1, so it is unable to move the limit back up. Co-authored-by: Alexander Larsson Signed-off-by: Simon McVittie --- bubblewrap.c | 54 ++++++++++++++++++++++++++++++++++++++++++++------ bwrap.xml | 14 +++++++++++++ completions/bash/bwrap | 1 + completions/zsh/_bwrap | 1 + tests/test-run.sh | 10 +++++++++- 5 files changed, 73 insertions(+), 7 deletions(-) diff --git a/bubblewrap.c b/bubblewrap.c index eece9d8..bcfbe9d 100644 --- a/bubblewrap.c +++ b/bubblewrap.c @@ -73,6 +73,7 @@ static const char *opt_file_label = NULL; static bool opt_as_pid_1; const char *opt_chdir_path = NULL; +bool opt_disable_userns = FALSE; bool opt_unshare_user = FALSE; bool opt_unshare_user_try = FALSE; bool opt_unshare_pid = FALSE; @@ -311,6 +312,7 @@ usage (int ecode, FILE *out) " --unshare-cgroup-try Create new cgroup namespace if possible else continue by skipping it\n" " --userns FD Use this user namespace (cannot combine with --unshare-user)\n" " --userns2 FD After setup switch to this user namespace, only useful with --userns\n" + " --disable-userns Disable further use of 
user namespaces inside sandbox\n" " --pidns FD Use this pid namespace (as parent namespace if using --unshare-pid)\n" " --uid UID Custom uid in the sandbox (requires --unshare-user or --userns)\n" " --gid GID Custom gid in the sandbox (requires --unshare-user or --userns)\n" @@ -1777,6 +1779,10 @@ parse_args_recurse (int *argcp, argv++; argc--; } + else if (strcmp (arg, "--disable-userns") == 0) + { + opt_disable_userns = TRUE; + } else if (strcmp (arg, "--remount-ro") == 0) { if (argc < 2) @@ -2677,6 +2683,12 @@ main (int argc, if (opt_userns_fd != -1 && opt_unshare_user_try) die ("--userns not compatible --unshare-user-try"); + if (opt_disable_userns && !opt_unshare_user) + die ("--disable-userns requires --unshare-user"); + + if (opt_disable_userns && opt_userns_block_fd != -1) + die ("--disable-userns is not compatible with --userns-block-fd"); + /* Technically using setns() is probably safe even in the privileged * case, because we got passed in a file descriptor to the * namespace, and that can only be gotten if you have ptrace @@ -3155,13 +3167,34 @@ main (int argc, if (opt_userns2_fd > 0 && setns (opt_userns2_fd, CLONE_NEWUSER) != 0) die_with_error ("Setting userns2 failed"); - if (opt_unshare_user && - (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid) && - opt_userns_block_fd == -1) + if (opt_unshare_user && opt_userns_block_fd == -1 && + (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid || + opt_disable_userns)) { - /* Now that devpts is mounted and we've no need for mount - permissions we can create a new userspace and map our uid - 1:1 */ + /* Here we create a second level userns inside the first one. This is + used for one or more of these reasons: + + * The 1st level namespace has a different uid/gid than the + requested due to requirements of being root in the first + level for mounting devpts (opt_needs_devpts). 
+ + * To disable user namespaces we set max_user_namespaces and then + create the second namespace so that the sandbox cannot undo this + change. + */ + + if (opt_disable_userns) + { + cleanup_fd int sysctl_fd = -1; + + sysctl_fd = openat (proc_fd, "sys/user/max_user_namespaces", O_WRONLY); + + if (sysctl_fd < 0) + die_with_error ("cannot open /proc/sys/user/max_user_namespaces"); + + if (write_to_fd (sysctl_fd, "1", 1) < 0) + die_with_error ("sysctl user.max_user_namespaces = 1"); + } if (unshare (CLONE_NEWUSER)) die_with_error ("unshare user ns"); @@ -3169,6 +3202,15 @@ main (int argc, /* We're in a new user namespace, we got back the bounding set, clear it again */ drop_cap_bounding_set (FALSE); + if (opt_disable_userns) + { + /* Verify that we can't make a new userns again */ + res = unshare (CLONE_NEWUSER); + + if (res == 0) + die ("unable to disable creation of new user namespaces"); + } + write_uid_gid_map (opt_sandbox_uid, ns_uid, opt_sandbox_gid, ns_gid, -1, FALSE, FALSE); diff --git a/bwrap.xml b/bwrap.xml index 46e2478..8690d64 100644 --- a/bwrap.xml +++ b/bwrap.xml @@ -144,6 +144,20 @@ After setting up the new namespace, switch into the specified namespace. For this to work the specified namespace must be a descendant of the user namespace used for the setup, so this is only useful in combination with --userns. This is useful because sometimes bubblewrap itself creates nested user namespaces (to work around some kernel issues) and --userns2 can be used to enter these. + + + + Prevent the process in the sandbox from creating further user namespaces, + so that it cannot rearrange the filesystem namespace or do other more + complex namespace modification. + This is currently implemented by setting the + user.max_user_namespaces sysctl to 1, and then + entering a nested user namespace which is unable to raise that limit + in the outer namespace. + This option requires --unshare-user, and doesn't work + in the setuid version of bubblewrap. 
+ + Use an existing pid namespace instead of creating one. This is often used with --userns, because the pid namespace must be owned by the same user namespace that bwrap uses. diff --git a/completions/bash/bwrap b/completions/bash/bwrap index e796be3..962d04c 100644 --- a/completions/bash/bwrap +++ b/completions/bash/bwrap @@ -11,6 +11,7 @@ _bwrap() { local boolean_options=" --as-pid-1 --clearenv + --disable-userns --help --new-session --unshare-all diff --git a/completions/zsh/_bwrap b/completions/zsh/_bwrap index f81ffaf..7488727 100644 --- a/completions/zsh/_bwrap +++ b/completions/zsh/_bwrap @@ -41,6 +41,7 @@ _bwrap_args=( '--dev-bind[Bind mount the host path SRC on DEST, allowing device access]:source:_files:destination:_files' '--dev[Mount new dev on DEST]:mount point for /dev:_files -/' "--die-with-parent[Kills with SIGKILL child process (COMMAND) when bwrap or bwrap's parent dies.]" + '--disable-userns[Disable further use of user namespaces inside sandbox]' '--exec-label[Exec label for the sandbox]:SELinux label:_selinux_contexts' '--file-label[File label for temporary sandbox content]:SELinux label:_selinux_contexts' '--gid[Custom gid in the sandbox (requires --unshare-user or --userns)]: :_guard "[0-9]#" "numeric group ID"' diff --git a/tests/test-run.sh b/tests/test-run.sh index a08998b..f1506bb 100755 --- a/tests/test-run.sh +++ b/tests/test-run.sh @@ -8,7 +8,7 @@ srcd=$(cd $(dirname "$0") && pwd) bn=$(basename "$0") -echo "1..57" +echo "1..58" # Test help ${BWRAP} --help > help.txt @@ -112,6 +112,7 @@ echo "ok exec failure doesn't include exit-code in json-status" if test -n "${bwrap_is_suid:-}"; then echo "ok - # SKIP no --cap-add support" echo "ok - # SKIP no --cap-add support" + echo "ok - # SKIP no --disable-userns" else BWRAP_RECURSE="$BWRAP --unshare-user --uid 0 --gid 0 --cap-add ALL --bind / / --bind /proc /proc" @@ -123,6 +124,13 @@ else $BWRAP_RECURSE -- /proc/self/exe --unshare-all ${BWRAP_RO_HOST_ARGS} findmnt > recursive-newroot.txt 
assert_file_has_content recursive-newroot.txt "/usr" echo "ok - can pivot to new rootfs recursively" + + $BWRAP --dev-bind / / -- true + $BWRAP --unshare-user --disable-userns --dev-bind / / -- true + ! $BWRAP --unshare-user --disable-userns --dev-bind / / -- $BWRAP --dev-bind / / -- true + $BWRAP --unshare-user --disable-userns --dev-bind / / -- sh -c "echo 2 > /proc/sys/user/max_user_namespaces || true; ! $BWRAP --dev-bind / / -- true" + $BWRAP --unshare-user --disable-userns --dev-bind / / -- sh -c "echo 100 > /proc/sys/user/max_user_namespaces || true; ! $BWRAP --dev-bind / / -- true" + echo "ok - can disable nested userns" fi # Test error prefixing -- cgit v1.2.1