From b33c333bcb88557ad23a9bc5be0d619d537984e9 Mon Sep 17 00:00:00 2001
From: Simon McVittie <smcv@collabora.com>
Date: Tue, 22 Mar 2022 17:12:33 +0000
Subject: Add an option to disable nested user namespaces by setting limit to 1

Some use-cases of bubblewrap want to ensure that the subprocess can't
further re-arrange the filesystem namespace, or make other more complex
namespace modifications. For example, Flatpak wants to prevent sandboxed
processes from altering their /proc/$pid/root/.flatpak-info, so that
/.flatpak-info can safely be used as an indicator that a process is part
of a Flatpak app.

This approach was suggested by lukts30 on containers/bubblewrap#452.
The sysctl-controlled maximum numbers of namespaces are themselves
namespaced, so we can disable nested user namespaces by setting the
limit to 1 and then entering a new, nested user namespace. The resulting
process loses its privileges in the namespace where the limit was set
to 1, so it is unable to move the limit back up.

Co-authored-by: Alexander Larsson <alexl@redhat.com>
Signed-off-by: Simon McVittie <smcv@collabora.com>
---
 bubblewrap.c           | 54 ++++++++++++++++++++++++++++++++++++++++++++------
 bwrap.xml              | 14 +++++++++++++
 completions/bash/bwrap |  1 +
 completions/zsh/_bwrap |  1 +
 tests/test-run.sh      | 10 +++++++++-
 5 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/bubblewrap.c b/bubblewrap.c
index eece9d8..bcfbe9d 100644
--- a/bubblewrap.c
+++ b/bubblewrap.c
@@ -73,6 +73,7 @@ static const char *opt_file_label = NULL;
 static bool opt_as_pid_1;
 
 const char *opt_chdir_path = NULL;
+bool opt_disable_userns = FALSE;
 bool opt_unshare_user = FALSE;
 bool opt_unshare_user_try = FALSE;
 bool opt_unshare_pid = FALSE;
@@ -311,6 +312,7 @@ usage (int ecode, FILE *out)
            "    --unshare-cgroup-try         Create new cgroup namespace if possible else continue by skipping it\n"
            "    --userns FD                  Use this user namespace (cannot combine with --unshare-user)\n"
            "    --userns2 FD                 After setup switch to this user namespace, only useful with --userns\n"
+           "    --disable-userns             Disable further use of user namespaces inside sandbox\n"
            "    --pidns FD                   Use this pid namespace (as parent namespace if using --unshare-pid)\n"
            "    --uid UID                    Custom uid in the sandbox (requires --unshare-user or --userns)\n"
            "    --gid GID                    Custom gid in the sandbox (requires --unshare-user or --userns)\n"
@@ -1777,6 +1779,10 @@ parse_args_recurse (int          *argcp,
           argv++;
           argc--;
         }
+      else if (strcmp (arg, "--disable-userns") == 0)
+        {
+          opt_disable_userns = TRUE;
+        }
       else if (strcmp (arg, "--remount-ro") == 0)
         {
           if (argc < 2)
@@ -2677,6 +2683,12 @@ main (int    argc,
   if (opt_userns_fd != -1 && opt_unshare_user_try)
     die ("--userns not compatible --unshare-user-try");
 
+  if (opt_disable_userns && !opt_unshare_user)
+    die ("--disable-userns requires --unshare-user");
+
+  if (opt_disable_userns && opt_userns_block_fd != -1)
+    die ("--disable-userns is not compatible with  --userns-block-fd");
+
   /* Technically using setns() is probably safe even in the privileged
    * case, because we got passed in a file descriptor to the
    * namespace, and that can only be gotten if you have ptrace
@@ -3155,13 +3167,34 @@ main (int    argc,
   if (opt_userns2_fd > 0 && setns (opt_userns2_fd, CLONE_NEWUSER) != 0)
     die_with_error ("Setting userns2 failed");
 
-  if (opt_unshare_user &&
-      (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid) &&
-      opt_userns_block_fd == -1)
+  if (opt_unshare_user && opt_userns_block_fd == -1 &&
+      (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid ||
+       opt_disable_userns))
     {
-      /* Now that devpts is mounted and we've no need for mount
-         permissions we can create a new userspace and map our uid
-         1:1 */
+      /* Here we create a second level userns inside the first one. This is
+         used for one or more of these reasons:
+
+         * The 1st level namespace has a different uid/gid than the
+           requested one, because we are required to be root in the
+           first level in order to mount devpts (opt_needs_devpts).
+
+         * To disable user namespaces we set max_user_namespaces and then
+           create the second namespace so that the sandbox cannot undo this
+           change.
+      */
+
+      if (opt_disable_userns)
+        {
+          cleanup_fd int sysctl_fd = -1;
+
+          sysctl_fd = openat (proc_fd, "sys/user/max_user_namespaces", O_WRONLY);
+
+          if (sysctl_fd < 0)
+            die_with_error ("cannot open /proc/sys/user/max_user_namespaces");
+
+          if (write_to_fd (sysctl_fd, "1", 1) < 0)
+            die_with_error ("sysctl user.max_user_namespaces = 1");
+        }
 
       if (unshare (CLONE_NEWUSER))
         die_with_error ("unshare user ns");
@@ -3169,6 +3202,15 @@ main (int    argc,
       /* We're in a new user namespace, we got back the bounding set, clear it again */
       drop_cap_bounding_set (FALSE);
 
+      if (opt_disable_userns)
+        {
+          /* Verify that we can't make a new userns again */
+          res = unshare (CLONE_NEWUSER);
+
+          if (res == 0)
+            die ("unable to disable creation of new user namespaces");
+        }
+
       write_uid_gid_map (opt_sandbox_uid, ns_uid,
                          opt_sandbox_gid, ns_gid,
                          -1, FALSE, FALSE);
diff --git a/bwrap.xml b/bwrap.xml
index 46e2478..8690d64 100644
--- a/bwrap.xml
+++ b/bwrap.xml
@@ -144,6 +144,20 @@
       <listitem><para>After setting up the new namespace, switch into the specified namespace. For this to work the specified namespace must be a descendant of the user namespace used for the setup, so this is only useful in combination with --userns.</para>
       <para>This is useful because sometimes bubblewrap itself creates nested user namespaces (to work around some kernel issues) and --userns2 can be used to enter these.</para></listitem>
     </varlistentry>
+    <varlistentry>
+      <term><option>--disable-userns</option></term>
+      <listitem><para>
+        Prevent the process in the sandbox from creating further user namespaces,
+        so that it cannot rearrange the filesystem namespace or make other more
+        complex namespace modifications.
+        This is currently implemented by setting the
+        <literal>user.max_user_namespaces</literal> sysctl to 1, and then
+        entering a nested user namespace which is unable to raise that limit
+        in the outer namespace.
+        This option requires <option>--unshare-user</option>, and doesn't work
+        in the setuid version of bubblewrap.
+      </para></listitem>
+    </varlistentry>
     <varlistentry>
       <term><option>--pidns <arg choice="plain">FD</arg></option></term>
       <listitem><para>Use an existing pid namespace instead of creating one. This is often used with --userns, because the pid namespace must be owned by the same user namespace that bwrap uses. </para>
diff --git a/completions/bash/bwrap b/completions/bash/bwrap
index e796be3..962d04c 100644
--- a/completions/bash/bwrap
+++ b/completions/bash/bwrap
@@ -11,6 +11,7 @@ _bwrap() {
 	local boolean_options="
 		--as-pid-1
 		--clearenv
+		--disable-userns
 		--help
 		--new-session
 		--unshare-all
diff --git a/completions/zsh/_bwrap b/completions/zsh/_bwrap
index f81ffaf..7488727 100644
--- a/completions/zsh/_bwrap
+++ b/completions/zsh/_bwrap
@@ -41,6 +41,7 @@ _bwrap_args=(
     '--dev-bind[Bind mount the host path SRC on DEST, allowing device access]:source:_files:destination:_files'
     '--dev[Mount new dev on DEST]:mount point for /dev:_files -/'
     "--die-with-parent[Kills with SIGKILL child process (COMMAND) when bwrap or bwrap's parent dies.]"
+    '--disable-userns[Disable further use of user namespaces inside sandbox]'
     '--exec-label[Exec label for the sandbox]:SELinux label:_selinux_contexts'
     '--file-label[File label for temporary sandbox content]:SELinux label:_selinux_contexts'
     '--gid[Custom gid in the sandbox (requires --unshare-user or --userns)]: :_guard "[0-9]#" "numeric group ID"'
diff --git a/tests/test-run.sh b/tests/test-run.sh
index a08998b..f1506bb 100755
--- a/tests/test-run.sh
+++ b/tests/test-run.sh
@@ -8,7 +8,7 @@ srcd=$(cd $(dirname "$0") && pwd)
 
 bn=$(basename "$0")
 
-echo "1..57"
+echo "1..58"
 
 # Test help
 ${BWRAP} --help > help.txt
@@ -112,6 +112,7 @@ echo "ok exec failure doesn't include exit-code in json-status"
 if test -n "${bwrap_is_suid:-}"; then
     echo "ok - # SKIP no --cap-add support"
     echo "ok - # SKIP no --cap-add support"
+    echo "ok - # SKIP no --disable-userns"
 else
     BWRAP_RECURSE="$BWRAP --unshare-user --uid 0 --gid 0 --cap-add ALL --bind / / --bind /proc /proc"
 
@@ -123,6 +124,13 @@ else
     $BWRAP_RECURSE -- /proc/self/exe --unshare-all ${BWRAP_RO_HOST_ARGS} findmnt > recursive-newroot.txt
     assert_file_has_content recursive-newroot.txt "/usr"
     echo "ok - can pivot to new rootfs recursively"
+
+    $BWRAP --dev-bind / / -- true
+    $BWRAP --unshare-user --disable-userns --dev-bind / / -- true
+    ! $BWRAP --unshare-user --disable-userns --dev-bind / / -- $BWRAP --dev-bind / / -- true
+    $BWRAP --unshare-user --disable-userns --dev-bind / / -- sh -c "echo 2 > /proc/sys/user/max_user_namespaces || true; ! $BWRAP --dev-bind / / -- true"
+    $BWRAP --unshare-user --disable-userns --dev-bind / / -- sh -c "echo 100 > /proc/sys/user/max_user_namespaces || true; ! $BWRAP --dev-bind / / -- true"
+    echo "ok - can disable nested userns"
 fi
 
 # Test error prefixing
-- 
cgit v1.2.1