diff options
Diffstat (limited to 'src/nspawn')
-rw-r--r-- | src/nspawn/nspawn-mount.c | 65 |
-rw-r--r-- | src/nspawn/nspawn-mount.h | 2 |
-rw-r--r-- | src/nspawn/nspawn.c | 50 |
3 files changed, 107 insertions, 10 deletions
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index a54f1464ba..0e8aaa1e3c 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -13,6 +13,7 @@ #include "mkdir-label.h" #include "mount-util.h" #include "mountpoint-util.h" +#include "namespace-util.h" #include "nspawn-mount.h" #include "parse-util.h" #include "path-util.h" @@ -510,6 +511,9 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); } +#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV) +#define SYS_DEFAULT_MOUNT_FLAGS (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV) + int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, @@ -538,7 +542,7 @@ int mount_all(const char *dest, static const MountPoint mount_table[] = { /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */ - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. 
*/ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, @@ -576,7 +580,7 @@ int mount_all(const char *dest, MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR }, { "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR }, - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, + { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */ @@ -1336,3 +1340,60 @@ done: return r; } + +#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc" +#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys" + +int pin_fully_visible_fs(void) { + int r; + + (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755); + (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755); + + r = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL); + if (r < 0) + return r; + + r = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL); + if (r < 0) + return r; + + return 0; +} + +static int do_wipe_fully_visible_fs(void) { + if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH) < 0) + return log_error_errno(errno, "Failed to unmount temporary proc: %m"); + + if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS) < 0) + return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m"); + + if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH) < 0) + return log_error_errno(errno, "Failed to unmount temporary sys: %m"); + + if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS) < 0) + return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m"); + + return 0; +} + +int wipe_fully_visible_fs(int mntns_fd) { + _cleanup_close_ int orig_mntns_fd = -EBADF; 
+ int r, rr; + + r = namespace_open(0, NULL, &orig_mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to pin originating mount namespace: %m"); + + r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF); + if (r < 0) + return log_error_errno(r, "Failed to enter mount namespace: %m"); + + rr = do_wipe_fully_visible_fs(); + + r = namespace_enter(-EBADF, orig_mntns_fd, -EBADF, -EBADF, -EBADF); + if (r < 0) + return log_error_errno(r, "Failed to enter original mount namespace: %m"); + + return rr; +} diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index 6bedbf9b3f..bf5e47dce4 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -67,3 +67,5 @@ int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old); int tmpfs_patch_options(const char *options,uid_t uid_shift, const char *selinux_apifs_context, char **ret); +int pin_fully_visible_fs(void); +int wipe_fully_visible_fs(int mntns_fd); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 5844674d95..392336dfa5 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3632,7 +3632,7 @@ static int outer_child( _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL; _cleanup_strv_free_ char **os_release_pairs = NULL; - _cleanup_close_ int fd = -1; + _cleanup_close_ int fd = -1, mntns_fd = -EBADF; bool idmap = false; const char *p; pid_t pid; @@ -3697,6 +3697,15 @@ static int outer_child( return r; if (arg_userns_mode != USER_NAMESPACE_NO) { + r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to pin outer mount namespace: %m"); + + l = send_one_fd(notify_socket, mntns_fd, 0); + if (l < 0) + return log_error_errno(l, "Failed to send outer mount namespace fd: %m"); + mntns_fd = safe_close(mntns_fd); + /* Let the parent know which UID shift we 
read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); if (l < 0) @@ -3974,6 +3983,20 @@ static int outer_child( if (r < 0) return log_error_errno(r, "Failed to move root directory: %m"); + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* In order to mount procfs and sysfs in an unprivileged container the kernel + * requires that a fully visible instance is already present in the target mount + * namespace. Mount one here so the inner child can mount its own instances. Later + * we umount the temporary instances created here before we actually exec the + * payload. Since the rootfs is shared the umount will propagate into the container. + * Note, the inner child wouldn't be able to unmount the instances on its own since + * it doesn't own the originating mount namespace. IOW, the outer child needs to do + * this. */ + r = pin_fully_visible_fs(); + if (r < 0) + return r; + } + fd = setup_notify_child(); if (fd < 0) return fd; @@ -4731,12 +4754,12 @@ static int run_container( rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, - notify_socket_pair[2] = { -1, -1 }, + fd_socket_pair[2] = { -EBADF, -EBADF }, uid_shift_socket_pair[2] = { -1, -1 }, master_pty_socket_pair[2] = { -1, -1 }, unified_cgroup_hierarchy_socket_pair[2] = { -1, -1}; - _cleanup_close_ int notify_socket = -1; + _cleanup_close_ int notify_socket = -1, mntns_fd = -EBADF; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; @@ -4783,7 +4806,7 @@ static int run_container( if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) return log_error_errno(errno, "Failed to create id socket pair: %m"); - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_socket_pair) < 
0) return log_error_errno(errno, "Failed to create notify socket pair: %m"); if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0) @@ -4836,7 +4859,7 @@ static int run_container( rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); pid_socket_pair[0] = safe_close(pid_socket_pair[0]); uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); - notify_socket_pair[0] = safe_close(notify_socket_pair[0]); + fd_socket_pair[0] = safe_close(fd_socket_pair[0]); master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]); @@ -4850,7 +4873,7 @@ static int run_container( secondary, pid_socket_pair[1], uuid_socket_pair[1], - notify_socket_pair[1], + fd_socket_pair[1], kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], @@ -4872,12 +4895,16 @@ static int run_container( rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); pid_socket_pair[1] = safe_close(pid_socket_pair[1]); uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); - notify_socket_pair[1] = safe_close(notify_socket_pair[1]); + fd_socket_pair[1] = safe_close(fd_socket_pair[1]); master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]); if (arg_userns_mode != USER_NAMESPACE_NO) { + mntns_fd = receive_one_fd(fd_socket_pair[0], 0); + if (mntns_fd < 0) + return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m"); + /* The child just let us know the UID shift it might have read from the image. 
*/ l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0); if (l < 0) @@ -4954,7 +4981,7 @@ static int run_container( return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID."); /* We also retrieve the socket used for notifications generated by outer child */ - notify_socket = receive_one_fd(notify_socket_pair[0], 0); + notify_socket = receive_one_fd(fd_socket_pair[0], 0); if (notify_socket < 0) return log_error_errno(notify_socket, "Failed to receive notification socket from the outer child: %m"); @@ -5139,6 +5166,13 @@ static int run_container( if (r < 0) return r; + if (arg_userns_mode != USER_NAMESPACE_NO) { + r = wipe_fully_visible_fs(mntns_fd); + if (r < 0) + return r; + mntns_fd = safe_close(mntns_fd); + } + /* Let the child know that we are ready and wait that the child is completely ready now. */ if (!barrier_place_and_sync(&barrier)) /* #5 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early."); |