summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--man/systemd-nspawn.xml36
-rw-r--r--src/core/mount-setup.c2
-rw-r--r--src/core/namespace.c2
-rw-r--r--src/nspawn/nspawn.c1528
-rw-r--r--src/shared/base-filesystem.c16
-rw-r--r--src/shared/base-filesystem.h4
-rw-r--r--src/shared/dev-setup.c29
-rw-r--r--src/shared/dev-setup.h4
-rw-r--r--src/shared/switch-root.c2
-rw-r--r--src/udev/udevd.c2
10 files changed, 971 insertions, 654 deletions
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index 3a2af2711c..6a5db86cec 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -321,6 +321,42 @@
</varlistentry>
<varlistentry>
+ <term><option>--private-users=</option></term>
+
+ <listitem><para>Enables user namespacing. If enabled the
+ container will run with its own private set of Unix user and
+ group ids (UIDs and GIDs). Takes none, one or two
+ colon-separated parameters: the first parameter specifies the
+ first host UID to assign to the container, the second
+ parameter specifies the number of host UIDs to assign to the
+ container. If the second parameter is omitted, 65536 UIDs are
+ assigned. If the first parameter is also ommitted (and hence
+ no parameter passed at all), the first UID assigned to the
+ container is read from the owner of the root directory of the
+ container's directory tree. By default no user namespacing is
+ applied.</para>
+
+ <para>Note that user namespacing currently requires OS trees
+ that are prepared for the UID shift that is being applied:
+ UIDs and GIDs used for file ownership or in file ACL entries
+ must be shifted to the container UID base that is
+ used during container runtime.</para>
+
+ <para>It is recommended to assign as least 65536 UIDs to each
+ container, so that the usable UID range in the container
+ covers 16bit. For best security do not assign overlapping UID
+ ranges to multiple containers. It is hence a good idea to use
+ the upper 16bit of the host 32bit UIDs as container
+ identifier, while the lower 16bit encode the container UID
+ used.</para>
+
+ <para>When user namespaces are used the GID range assigned to
+ each container is always chosen identical to the UID
+ range.</para></listitem>
+ </varlistentry>
+
+
+ <varlistentry>
<term><option>--private-network</option></term>
<listitem><para>Disconnect networking of the container from
diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c
index 23fb0a1e38..ba96741e95 100644
--- a/src/core/mount-setup.c
+++ b/src/core/mount-setup.c
@@ -373,7 +373,7 @@ int mount_setup(bool loaded_policy) {
/* Create a few default symlinks, which are normally created
* by udevd, but some scripts might need them before we start
* udevd. */
- dev_setup(NULL);
+ dev_setup(NULL, UID_INVALID, GID_INVALID);
/* Mark the root directory as shared in regards to mount
* propagation. The kernel defaults to "private", but we think
diff --git a/src/core/namespace.c b/src/core/namespace.c
index c0c64fd706..7d0b7e7e84 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -224,7 +224,7 @@ static int mount_dev(BindMount *m) {
}
}
- dev_setup(temporary_mount);
+ dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
/* Create the /dev directory if missing. It is more likely to be
* missing when the service is started with RootDirectory. This is
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 1f919c082b..d6b24c6f8f 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -222,6 +222,8 @@ static void help(void) {
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
+ " --private-users[=UIDBASE[:NUIDS]]\n"
+ " Run within user namespace\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
@@ -238,8 +240,6 @@ static void help(void) {
" Add a virtual ethernet connection between host\n"
" and container and add it to an existing bridge on\n"
" the host\n"
- " --private-users[=UIDBASE[:NUIDS]]\n"
- " Run within user namespace\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
" Expose a container IP port on the host\n"
" -Z --selinux-context=SECLABEL\n"
@@ -1020,7 +1020,44 @@ static int parse_argv(int argc, char *argv[]) {
return 1;
}
-static int mount_all(const char *dest) {
+static int tmpfs_patch_options(const char *options, char **ret) {
+ char *buf = NULL;
+
+ if (arg_userns && arg_uid_shift != 0) {
+
+ if (options)
+ asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
+ else
+ asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
+ if (!buf)
+ return -ENOMEM;
+
+ options = buf;
+ }
+
+#ifdef HAVE_SELINUX
+ if (arg_selinux_apifs_context) {
+ char *t;
+
+ if (options)
+ t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
+ else
+ t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
+ if (!t) {
+ free(buf);
+ return -ENOMEM;
+ }
+
+ free(buf);
+ buf = t;
+ }
+#endif
+
+ *ret = buf;
+ return !!buf;
+}
+
+static int mount_all(const char *dest, bool userns) {
typedef struct MountPoint {
const char *what;
@@ -1029,88 +1066,63 @@ static int mount_all(const char *dest) {
const char *options;
unsigned long flags;
bool fatal;
+ bool userns;
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
- { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
- { "devpts", "/dev/pts", "devpts", "newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true, true }, /* Then, make it r/o */
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
+ { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false, false }, /* Then, make it r/o */
#endif
};
unsigned k;
- int r = 0;
+ int r;
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
const char *o;
- int t;
- where = strjoin(dest, "/", mount_table[k].where, NULL);
+ if (userns != mount_table[k].userns)
+ continue;
+
+ where = prefix_root(dest, mount_table[k].where);
if (!where)
return log_oom();
- t = path_is_mount_point(where, true);
- if (t < 0 && t != -ENOENT) {
- log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
-
- if (r == 0)
- r = t;
-
- continue;
- }
+ r = path_is_mount_point(where, true);
+ if (r < 0 && r != -ENOENT)
+ return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
/* Skip this entry if it is not a remount. */
- if (mount_table[k].what && t > 0)
+ if (mount_table[k].what && r > 0)
continue;
- t = mkdir_p(where, 0755);
- if (t < 0) {
- if (mount_table[k].fatal) {
- log_error_errno(t, "Failed to create directory %s: %m", where);
-
- if (r == 0)
- r = t;
- } else
- log_warning_errno(t, "Failed to create directory %s: %m", where);
+ r = mkdir_p(where, 0755);
+ if (r < 0) {
+ if (mount_table[k].fatal)
+ return log_error_errno(r, "Failed to create directory %s: %m", where);
+ log_warning_errno(r, "Failed to create directory %s: %m", where);
continue;
}
-#ifdef HAVE_SELINUX
- if (arg_selinux_apifs_context &&
- (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
- options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
- if (!options)
- return log_oom();
-
- o = options;
- } else
-#endif
- o = mount_table[k].options;
-
- if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
- char *uid_options = NULL;
-
- if (o)
- asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
- else
- asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
- if (!uid_options)
+ o = mount_table[k].options;
+ if (streq_ptr(mount_table[k].type, "tmpfs")) {
+ r = tmpfs_patch_options(o, &options);
+ if (r < 0)
return log_oom();
-
- free(options);
- o = options = uid_options;
+ if (r > 0)
+ o = options;
}
if (mount(mount_table[k].what,
@@ -1119,34 +1131,29 @@ static int mount_all(const char *dest) {
mount_table[k].flags,
o) < 0) {
- if (mount_table[k].fatal) {
- log_error_errno(errno, "mount(%s) failed: %m", where);
+ if (mount_table[k].fatal)
+ return log_error_errno(errno, "mount(%s) failed: %m", where);
- if (r == 0)
- r = -errno;
- } else
- log_warning_errno(errno, "mount(%s) failed: %m", where);
+ log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
}
}
- return r;
+ return 0;
}
static int mount_bind(const char *dest, CustomMount *m) {
struct stat source_st, dest_st;
- char *where;
+ const char *where;
int r;
- assert(dest);
assert(m);
if (stat(m->source, &source_st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", m->source);
- where = strjoina(dest, m->destination);
+ where = prefix_roota(dest, m->destination);
- r = stat(where, &dest_st);
- if (r >= 0) {
+ if (stat(where, &dest_st) >= 0) {
if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
log_error("Cannot bind mount directory %s on file %s.", m->source, where);
return -EINVAL;
@@ -1190,19 +1197,25 @@ static int mount_bind(const char *dest, CustomMount *m) {
}
static int mount_tmpfs(const char *dest, CustomMount *m) {
- char *where;
+ const char *where, *options;
+ _cleanup_free_ char *buf = NULL;
int r;
assert(dest);
assert(m);
- where = strjoina(dest, m->destination);
+ where = prefix_roota(dest, m->destination);
- r = mkdir_label(where, 0755);
+ r = mkdir_p_label(where, 0755);
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
- if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, m->options) < 0)
+ r = tmpfs_patch_options(m->options, &buf);
+ if (r < 0)
+ return log_oom();
+ options = r > 0 ? buf : m->options;
+
+ if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
return 0;
@@ -1210,13 +1223,13 @@ static int mount_tmpfs(const char *dest, CustomMount *m) {
static int mount_overlay(const char *dest, CustomMount *m) {
_cleanup_free_ char *lower = NULL;
- char *where, *options;
+ const char *where, *options;
int r;
assert(dest);
assert(m);
- where = strjoina(dest, m->destination);
+ where = prefix_roota(dest, m->destination);
r = mkdir_label(where, 0755);
if (r < 0 && r != -EEXIST)
@@ -1227,7 +1240,6 @@ static int mount_overlay(const char *dest, CustomMount *m) {
strv_reverse(m->lower);
lower = strv_join(m->lower, ":");
strv_reverse(m->lower);
-
if (!lower)
return log_oom();
@@ -1310,8 +1322,7 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
static int mount_cgroup(const char *dest) {
_cleanup_set_free_free_ Set *controllers = NULL;
- _cleanup_free_ char *own_cgroup_path = NULL;
- const char *cgroup_root, *systemd_root, *systemd_own;
+ const char *cgroup_root;
int r;
controllers = set_new(&string_hash_ops);
@@ -1322,10 +1333,6 @@ static int mount_cgroup(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to determine cgroup controllers: %m");
- r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
- if (r < 0)
- return log_error_errno(r, "Failed to determine our own cgroup path: %m");
-
for (;;) {
_cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
@@ -1333,7 +1340,7 @@ static int mount_cgroup(const char *dest) {
if (!controller)
break;
- origin = strappend("/sys/fs/cgroup/", controller);
+ origin = prefix_root("/sys/fs/cgroup/", controller);
if (!origin)
return log_oom();
@@ -1350,7 +1357,7 @@ static int mount_cgroup(const char *dest) {
else {
_cleanup_free_ char *target = NULL;
- target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
+ target = prefix_root(dest, origin);
if (!target)
return log_oom();
@@ -1379,25 +1386,82 @@ static int mount_cgroup(const char *dest) {
if (r < 0)
return r;
+ cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+ if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
+ return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
+
+ return 0;
+}
+
+static int mount_systemd_cgroup_writable(const char *dest) {
+ _cleanup_free_ char *own_cgroup_path = NULL;
+ const char *systemd_root, *systemd_own;
+ int r;
+
+ assert(dest);
+
+ r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+
/* Make our own cgroup a (writable) bind mount */
systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
/* And then remount the systemd cgroup root read-only */
- systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
+ systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
- cgroup_root = strjoina(dest, "/sys/fs/cgroup");
- if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
- return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
+ return 0;
+}
+
+static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
+ assert(p);
+
+ if (!arg_userns)
+ return 0;
+
+ if (uid == UID_INVALID && gid == GID_INVALID)
+ return 0;
+
+ if (uid != UID_INVALID) {
+ uid += arg_uid_shift;
+
+ if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
+ return -EOVERFLOW;
+ }
+
+ if (gid != GID_INVALID) {
+ gid += (gid_t) arg_uid_shift;
+
+ if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
+ return -EOVERFLOW;
+ }
+
+ if (lchown(p, uid, gid) < 0)
+ return -errno;
return 0;
}
+static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
+ const char *q;
+
+ q = prefix_roota(root, path);
+ if (mkdir(q, mode) < 0) {
+ if (errno == EEXIST)
+ return 0;
+ return -errno;
+ }
+
+ return userns_lchown(q, uid, gid);
+}
+
static int setup_timezone(const char *dest) {
- _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
+ _cleanup_free_ char *p = NULL, *q = NULL;
+ const char *where, *check, *what;
char *z, *y;
int r;
@@ -1418,10 +1482,7 @@ static int setup_timezone(const char *dest) {
return 0;
}
- where = strappend(dest, "/etc/localtime");
- if (!where)
- return log_oom();
-
+ where = prefix_roota(dest, "/etc/localtime");
r = readlink_malloc(where, &q);
if (r >= 0) {
y = path_startswith(q, "../usr/share/zoneinfo/");
@@ -1433,43 +1494,34 @@ static int setup_timezone(const char *dest) {
return 0;
}
- check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
- if (!check)
- return log_oom();
-
- if (access(check, F_OK) < 0) {
+ check = strjoina("/usr/share/zoneinfo/", z);
+ check = prefix_root(dest, check);
+ if (laccess(check, F_OK) < 0) {
log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
return 0;
}
- what = strappend("../usr/share/zoneinfo/", z);
- if (!what)
- return log_oom();
-
- r = mkdir_parents(where, 0755);
- if (r < 0) {
- log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
-
- return 0;
- }
-
r = unlink(where);
if (r < 0 && errno != ENOENT) {
log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
-
return 0;
}
+ what = strjoina("../usr/share/zoneinfo/", z);
if (symlink(what, where) < 0) {
log_error_errno(errno, "Failed to correct timezone of container: %m");
return 0;
}
+ r = userns_lchown(where, 0, 0);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
+
return 0;
}
static int setup_resolv_conf(const char *dest) {
- _cleanup_free_ char *where = NULL;
+ const char *where = NULL;
int r;
assert(dest);
@@ -1478,31 +1530,24 @@ static int setup_resolv_conf(const char *dest) {
return 0;
/* Fix resolv.conf, if possible */
- where = strappend(dest, "/etc/resolv.conf");
- if (!where)
- return log_oom();
-
- /* We don't really care for the results of this really. If it
- * fails, it fails, but meh... */
- r = mkdir_parents(where, 0755);
- if (r < 0) {
- log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
-
- return 0;
- }
+ where = prefix_roota(dest, "/etc/resolv.conf");
r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
if (r < 0) {
log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
-
return 0;
}
+ r = userns_lchown(where, 0, 0);
+ if (r < 0)
+ log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
+
return 0;
}
static int setup_volatile_state(const char *directory) {
- const char *p;
+ _cleanup_free_ char *buf = NULL;
+ const char *p, *options;
int r;
assert(directory);
@@ -1517,12 +1562,19 @@ static int setup_volatile_state(const char *directory) {
if (r < 0)
return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
- p = strjoina(directory, "/var");
+ p = prefix_roota(directory, "/var");
r = mkdir(p, 0755);
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create %s: %m", directory);
- if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
+ options = "mode=755";
+ r = tmpfs_patch_options(options, &buf);
+ if (r < 0)
+ return log_oom();
+ if (r > 0)
+ options = buf;
+
+ if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
return 0;
@@ -1531,7 +1583,8 @@ static int setup_volatile_state(const char *directory) {
static int setup_volatile(const char *directory) {
bool tmpfs_mounted = false, bind_mounted = false;
char template[] = "/tmp/nspawn-volatile-XXXXXX";
- const char *f, *t;
+ _cleanup_free_ char *buf = NULL;
+ const char *f, *t, *options;
int r;
assert(directory);
@@ -1545,27 +1598,31 @@ static int setup_volatile(const char *directory) {
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
- if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
- log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
- r = -errno;
+ options = "mode=755";
+ r = tmpfs_patch_options(options, &buf);
+ if (r < 0)
+ return log_oom();
+ if (r > 0)
+ options = buf;
+
+ if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
+ r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
goto fail;
}
tmpfs_mounted = true;
- f = strjoina(directory, "/usr");
- t = strjoina(template, "/usr");
+ f = prefix_roota(directory, "/usr");
+ t = prefix_roota(template, "/usr");
r = mkdir(t, 0755);
if (r < 0 && errno != EEXIST) {
- log_error_errno(errno, "Failed to create %s: %m", t);
- r = -errno;
+ r = log_error_errno(errno, "Failed to create %s: %m", t);
goto fail;
}
if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
- log_error_errno(errno, "Failed to create /usr bind mount: %m");
- r = -errno;
+ r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
goto fail;
}
@@ -1578,25 +1635,26 @@ static int setup_volatile(const char *directory) {
}
if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
- log_error_errno(errno, "Failed to move root mount: %m");
- r = -errno;
+ r = log_error_errno(errno, "Failed to move root mount: %m");
goto fail;
}
- rmdir(template);
+ (void) rmdir(template);
return 0;
fail:
if (bind_mounted)
- umount(t);
+ (void) umount(t);
+
if (tmpfs_mounted)
- umount(template);
- rmdir(template);
+ (void) umount(template);
+ (void) rmdir(template);
return r;
}
static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
+ assert(s);
snprintf(s, 37,
"%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
@@ -1606,23 +1664,19 @@ static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
}
static int setup_boot_id(const char *dest) {
- _cleanup_free_ char *from = NULL, *to = NULL;
+ const char *from, *to;
sd_id128_t rnd = {};
char as_uuid[37];
int r;
- assert(dest);
-
if (arg_share_system)
return 0;
/* Generate a new randomized boot ID, so that each boot-up of
* the container gets a new one */
- from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
- to = strappend(dest, "/proc/sys/kernel/random/boot_id");
- if (!from || !to)
- return log_oom();
+ from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
+ to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
r = sd_id128_randomize(&rnd);
if (r < 0)
@@ -1634,10 +1688,9 @@ static int setup_boot_id(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
- if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
- log_error_errno(errno, "Failed to bind mount boot id: %m");
- r = -errno;
- } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
+ if (mount(from, to, NULL, MS_BIND, NULL) < 0)
+ r = log_error_errno(errno, "Failed to bind mount boot id: %m");
+ else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
log_warning_errno(errno, "Failed to make boot id read-only: %m");
unlink(from);
@@ -1663,14 +1716,16 @@ static int copy_devnodes(const char *dest) {
u = umask(0000);
+ /* Create /dev/net, so that we can create /dev/net/tun in it */
+ if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
+ return log_error_errno(r, "Failed to create /dev/net directory: %m");
+
NULSTR_FOREACH(d, devnodes) {
_cleanup_free_ char *from = NULL, *to = NULL;
struct stat st;
from = strappend("/dev/", d);
- to = strjoin(dest, "/dev/", d, NULL);
- if (!from || !to)
- return log_oom();
+ to = prefix_root(dest, from);
if (stat(from, &st) < 0) {
@@ -1679,16 +1734,10 @@ static int copy_devnodes(const char *dest) {
} else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
- log_error("%s is not a char or block device, cannot copy", from);
+ log_error("%s is not a char or block device, cannot copy.", from);
return -EIO;
} else {
- r = mkdir_parents(to, 0775);
- if (r < 0) {
- log_error_errno(r, "Failed to create parent directory of %s: %m", to);
- return -r;
- }
-
if (mknod(to, st.st_mode, st.st_rdev) < 0) {
if (errno != EPERM)
return log_error_errno(errno, "mknod(%s) failed: %m", to);
@@ -1702,28 +1751,56 @@ static int copy_devnodes(const char *dest) {
return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
}
- if (arg_userns && arg_uid_shift != UID_INVALID)
- if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
- return log_error_errno(errno, "chown() of device node %s failed: %m", to);
+ r = userns_lchown(to, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "chown() of device node %s failed: %m", to);
}
}
return r;
}
-static int setup_ptmx(const char *dest) {
- _cleanup_free_ char *p = NULL;
+static int setup_pts(const char *dest) {
+ _cleanup_free_ char *options = NULL;
+ const char *p;
+
+#ifdef HAVE_SELINUX
+ if (arg_selinux_apifs_context)
+ (void) asprintf(&options,
+ "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
+ arg_uid_shift,
+ arg_uid_shift + TTY_GID,
+ arg_selinux_apifs_context);
+ else
+#endif
+ (void) asprintf(&options,
+ "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
+ arg_uid_shift,
+ arg_uid_shift + TTY_GID);
- p = strappend(dest, "/dev/ptmx");
- if (!p)
+ if (!options)
return log_oom();
+ /* Mount /dev/pts itself */
+ p = prefix_root(dest, "/dev/pts");
+ if (mkdir(p, 0755) < 0)
+ return log_error_errno(errno, "Failed to create /dev/pts: %m");
+ if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
+ return log_error_errno(errno, "Failed to mount /dev/pts: %m");
+ if (userns_lchown(p, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to chown /dev/pts: %m");
+
+ /* Create /dev/ptmx symlink */
+ p = prefix_roota(dest, "/dev/ptmx");
if (symlink("pts/ptmx", p) < 0)
return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
+ if (userns_lchown(p, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
- if (arg_userns && arg_uid_shift != UID_INVALID)
- if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
- return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
+ /* And fix /dev/pts/ptmx ownership */
+ p = prefix_roota(dest, "/dev/pts/ptmx");
+ if (userns_lchown(p, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
return 0;
}
@@ -1738,7 +1815,7 @@ static int setup_dev_console(const char *dest, const char *console) {
u = umask(0000);
- r = chmod_and_chown(console, 0600, 0, 0);
+ r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
if (r < 0)
return log_error_errno(r, "Failed to correct access mode for TTY: %m");
@@ -1746,7 +1823,7 @@ static int setup_dev_console(const char *dest, const char *console) {
* ptys can only exist on pts file systems. To have something
* to bind mount things on we create a empty regular file. */
- to = strjoina(dest, "/dev/console");
+ to = prefix_roota(dest, "/dev/console");
r = touch(to);
if (r < 0)
return log_error_errno(r, "touch() for /dev/console failed: %m");
@@ -1758,9 +1835,9 @@ static int setup_dev_console(const char *dest, const char *console) {
}
static int setup_kmsg(const char *dest, int kmsg_socket) {
- _cleanup_free_ char *from = NULL, *to = NULL;
+ const char *from, *to;
_cleanup_umask_ mode_t u;
- int r, fd, k;
+ int fd, k;
union {
struct cmsghdr cmsghdr;
uint8_t buf[CMSG_SPACE(sizeof(int))];
@@ -1771,29 +1848,22 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
};
struct cmsghdr *cmsg;
- assert(dest);
assert(kmsg_socket >= 0);
u = umask(0000);
- /* We create the kmsg FIFO as /dev/kmsg, but immediately
+ /* We create the kmsg FIFO as /run/kmsg, but immediately
* delete it after bind mounting it to /proc/kmsg. While FIFOs
* on the reading side behave very similar to /proc/kmsg,
* their writing side behaves differently from /dev/kmsg in
* that writing blocks when nothing is reading. In order to
* avoid any problems with containers deadlocking due to this
* we simply make /dev/kmsg unavailable to the container. */
- if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
- asprintf(&to, "%s/proc/kmsg", dest) < 0)
- return log_oom();
+ from = prefix_roota(dest, "/run/kmsg");
+ to = prefix_roota(dest, "/proc/kmsg");
if (mkfifo(from, 0600) < 0)
- return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
-
- r = chmod_and_chown(from, 0600, 0, 0);
- if (r < 0)
- return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
-
+ return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
if (mount(from, to, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
@@ -1817,8 +1887,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
if (k < 0)
return log_error_errno(errno, "Failed to send FIFO fd: %m");
- /* And now make the FIFO unavailable as /dev/kmsg... */
- unlink(from);
+ /* And now make the FIFO unavailable as /run/kmsg... */
+ (void) unlink(from);
+
return 0;
}
@@ -1842,7 +1913,7 @@ static int send_rtnl(int send_fd) {
fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
if (fd < 0)
- return log_error_errno(errno, "failed to allocate container netlink: %m");
+ return log_error_errno(errno, "Failed to allocate container netlink: %m");
cmsg = CMSG_FIRSTHDR(&mh);
cmsg->cmsg_level = SOL_SOCKET;
@@ -2027,7 +2098,8 @@ static int setup_hostname(void) {
static int setup_journal(const char *directory) {
sd_id128_t machine_id, this_id;
- _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
+ _cleanup_free_ char *b = NULL, *d = NULL;
+ const char *etc_machine_id, *p, *q;
char *id;
int r;
@@ -2035,15 +2107,13 @@ static int setup_journal(const char *directory) {
if (arg_ephemeral)
return 0;
- p = strappend(directory, "/etc/machine-id");
- if (!p)
- return log_oom();
+ etc_machine_id = prefix_roota(directory, "/etc/machine-id");
- r = read_one_line_file(p, &b);
+ r = read_one_line_file(etc_machine_id, &b);
if (r == -ENOENT && arg_link_journal == LINK_AUTO)
return 0;
else if (r < 0)
- return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
+ return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
id = strstrip(b);
if (isempty(id) && arg_link_journal == LINK_AUTO)
@@ -2052,7 +2122,7 @@ static int setup_journal(const char *directory) {
/* Verify validity */
r = sd_id128_from_string(id, &machine_id);
if (r < 0)
- return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
+ return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
r = sd_id128_get_machine(&this_id);
if (r < 0)
@@ -2069,11 +2139,20 @@ static int setup_journal(const char *directory) {
if (arg_link_journal == LINK_NO)
return 0;
- free(p);
- p = strappend("/var/log/journal/", id);
- q = strjoin(directory, "/var/log/journal/", id, NULL);
- if (!p || !q)
- return log_oom();
+ r = userns_mkdir(directory, "/var", 0755, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /var: %m");
+
+ r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /var/log: %m");
+
+ r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /var/log/journal: %m");
+
+ p = strjoina("/var/log/journal/", id);
+ q = prefix_roota(directory, p);
if (path_is_mount_point(p, false) > 0) {
if (arg_link_journal != LINK_AUTO) {
@@ -2099,7 +2178,7 @@ static int setup_journal(const char *directory) {
arg_link_journal == LINK_AUTO) &&
path_equal(d, q)) {
- r = mkdir_p(q, 0755);
+ r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0)
log_warning_errno(errno, "Failed to create directory %s: %m", q);
return 0;
@@ -2137,7 +2216,7 @@ static int setup_journal(const char *directory) {
}
}
- r = mkdir_p(q, 0755);
+ r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0)
log_warning_errno(errno, "Failed to create directory %s: %m", q);
return 0;
@@ -2163,7 +2242,7 @@ static int setup_journal(const char *directory) {
if (dir_is_empty(q) == 0)
log_warning("%s is not empty, proceeding anyway.", q);
- r = mkdir_p(q, 0755);
+ r = userns_mkdir(directory, p, 0755, 0, 0);
if (r < 0) {
log_error_errno(errno, "Failed to create %s: %m", q);
return r;
@@ -2941,10 +3020,16 @@ static int setup_propagate(const char *root) {
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
(void) mkdir_p(p, 0600);
- q = strjoina(root, "/run/systemd/nspawn/incoming");
- mkdir_parents(q, 0755);
- mkdir_p(q, 0600);
+ if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to create /run/systemd: %m");
+ if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
+
+ if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
+ return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
+
+ q = prefix_roota(root, "/run/systemd/nspawn/incoming");
if (mount(p, q, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to install propagation bind mount.");
@@ -3603,14 +3688,9 @@ static int change_uid_gid(char **_home) {
if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
/* Reset everything fully to 0, just in case */
- if (setgroups(0, NULL) < 0)
- return log_error_errno(errno, "setgroups() failed: %m");
-
- if (setresgid(0, 0, 0) < 0)
- return log_error_errno(errno, "setregid() failed: %m");
-
- if (setresuid(0, 0, 0) < 0)
- return log_error_errno(errno, "setreuid() failed: %m");
+ r = reset_uid_gid();
+ if (r < 0)
+ return log_error_errno(r, "Failed to become root: %m");
*_home = NULL;
return 0;
@@ -3754,9 +3834,9 @@ static int change_uid_gid(char **_home) {
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Failed to make home directory: %m");
- fchown(STDIN_FILENO, uid, gid);
- fchown(STDOUT_FILENO, uid, gid);
- fchown(STDERR_FILENO, uid, gid);
+ (void) fchown(STDIN_FILENO, uid, gid);
+ (void) fchown(STDOUT_FILENO, uid, gid);
+ (void) fchown(STDERR_FILENO, uid, gid);
if (setgroups(n_uids, uids) < 0)
return log_error_errno(errno, "Failed to set auxiliary groups: %m");
@@ -3925,23 +4005,25 @@ static int determine_names(void) {
return 0;
}
-static int determine_uid_shift(void) {
+static int determine_uid_shift(const char *directory) {
int r;
- if (!arg_userns)
+ if (!arg_userns) {
+ arg_uid_shift = 0;
return 0;
+ }
if (arg_uid_shift == UID_INVALID) {
struct stat st;
- r = stat(arg_directory, &st);
+ r = stat(directory, &st);
if (r < 0)
- return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
+ return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
- log_error("UID and GID base of %s don't match.", arg_directory);
+ log_error("UID and GID base of %s don't match.", directory);
return -EINVAL;
}
@@ -3957,6 +4039,430 @@ static int determine_uid_shift(void) {
return 0;
}
+static int inner_child(
+ Barrier *barrier,
+ const char *directory,
+ bool secondary,
+ int kmsg_socket,
+ int rtnl_socket,
+ FDSet *fds,
+ int argc,
+ char *argv[]) {
+
+ _cleanup_free_ char *home = NULL;
+ unsigned n_env = 2;
+ const char *envp[] = {
+ "PATH=" DEFAULT_PATH_SPLIT_USR,
+ "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
+ NULL, /* TERM */
+ NULL, /* HOME */
+ NULL, /* USER */
+ NULL, /* LOGNAME */
+ NULL, /* container_uuid */
+ NULL, /* LISTEN_FDS */
+ NULL, /* LISTEN_PID */
+ NULL
+ };
+
+ char **env_use;
+ int r;
+
+ assert(barrier);
+ assert(directory);
+ assert(kmsg_socket >= 0);
+
+ if (arg_userns) {
+ /* Tell the parent, that it now can write the UID map. */
+ (void) barrier_place(barrier); /* #1 */
+
+ /* Wait until the parent wrote the UID map */
+ if (!barrier_place_and_sync(barrier)) { /* #2 */
+ log_error("Parent died too early");
+ return -ESRCH;
+ }
+ }
+
+ r = mount_all(NULL, true);
+ if (r < 0)
+ return r;
+
+ /* Wait until we are cgroup-ified, so that we
+ * can mount the right cgroup path writable */
+ if (!barrier_place_and_sync(barrier)) { /* #3 */
+ log_error("Parent died too early");
+ return -ESRCH;
+ }
+
+ r = mount_systemd_cgroup_writable("");
+ if (r < 0)
+ return r;
+
+ r = reset_uid_gid();
+ if (r < 0)
+ return log_error_errno(r, "Couldn't become new root: %m");
+
+ r = setup_boot_id(NULL);
+ if (r < 0)
+ return r;
+
+ r = setup_kmsg(NULL, kmsg_socket);
+ if (r < 0)
+ return r;
+ kmsg_socket = safe_close(kmsg_socket);
+
+ umask(0022);
+
+ if (setsid() < 0)
+ return log_error_errno(errno, "setsid() failed: %m");
+
+ if (arg_private_network)
+ loopback_setup();
+
+ r = send_rtnl(rtnl_socket);
+ if (r < 0)
+ return r;
+ rtnl_socket = safe_close(rtnl_socket);
+
+ if (drop_capabilities() < 0)
+ return log_error_errno(errno, "drop_capabilities() failed: %m");
+
+ setup_hostname();
+
+ if (arg_personality != 0xffffffffLU) {
+ if (personality(arg_personality) < 0)
+ return log_error_errno(errno, "personality() failed: %m");
+ } else if (secondary) {
+ if (personality(PER_LINUX32) < 0)
+ return log_error_errno(errno, "personality() failed: %m");
+ }
+
+#ifdef HAVE_SELINUX
+ if (arg_selinux_context)
+ if (setexeccon((security_context_t) arg_selinux_context) < 0)
+ return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
+#endif
+
+ r = change_uid_gid(&home);
+ if (r < 0)
+ return r;
+
+ envp[n_env] = strv_find_prefix(environ, "TERM=");
+ if (envp[n_env])
+ n_env ++;
+
+ if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
+ (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
+ (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
+ return log_oom();
+
+ if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
+ char as_uuid[37];
+
+ if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+ return log_oom();
+ }
+
+ if (fdset_size(fds) > 0) {
+ r = fdset_cloexec(fds, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
+
+ if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
+ (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
+ return log_oom();
+ }
+
+ if (!strv_isempty(arg_setenv)) {
+ char **n;
+
+ n = strv_env_merge(2, envp, arg_setenv);
+ if (!n)
+ return log_oom();
+
+ env_use = n;
+ } else
+ env_use = (char**) envp;
+
+ /* Let the parent know that we are ready and
+ * wait until the parent is ready with the
+ * setup, too... */
+ if (!barrier_place_and_sync(barrier)) { /* #4 */
+ log_error("Parent died too early");
+ return -ESRCH;
+ }
+
+ /* Now, explicitly close the log, so that we
+ * then can close all remaining fds. Closing
+ * the log explicitly first has the benefit
+ * that the logging subsystem knows about it,
+ * and is thus ready to be reopened should we
+ * need it again. Note that the other fds
+ * closed here are at least the locking and
+ * barrier fds. */
+ log_close();
+ (void) fdset_close_others(fds);
+
+ if (arg_boot) {
+ char **a;
+ size_t m;
+
+ /* Automatically search for the init system */
+
+ m = 1 + argc - optind;
+ a = newa(char*, m + 1);
+ memcpy(a + 1, argv + optind, m * sizeof(char*));
+
+ a[0] = (char*) "/usr/lib/systemd/systemd";
+ execve(a[0], a, env_use);
+
+ a[0] = (char*) "/lib/systemd/systemd";
+ execve(a[0], a, env_use);
+
+ a[0] = (char*) "/sbin/init";
+ execve(a[0], a, env_use);
+ } else if (argc > optind)
+ execvpe(argv[optind], argv + optind, env_use);
+ else {
+ chdir(home ? home : "/root");
+ execle("/bin/bash", "-bash", NULL, env_use);
+ execle("/bin/sh", "-sh", NULL, env_use);
+ }
+
+ (void) log_open();
+ return log_error_errno(errno, "execv() failed: %m");
+}
+
+static int outer_child(
+ Barrier *barrier,
+ const char *directory,
+ const char *console,
+ const char *root_device, bool root_device_rw,
+ const char *home_device, bool home_device_rw,
+ const char *srv_device, bool srv_device_rw,
+ bool interactive,
+ bool secondary,
+ int pid_socket,
+ int kmsg_socket,
+ int rtnl_socket,
+ FDSet *fds,
+ int argc,
+ char *argv[]) {
+
+ pid_t pid;
+ ssize_t l;
+ int r;
+
+ assert(barrier);
+ assert(directory);
+ assert(console);
+ assert(pid_socket >= 0);
+ assert(kmsg_socket >= 0);
+
+ if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
+ return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
+
+ if (interactive) {
+ close_nointr(STDIN_FILENO);
+ close_nointr(STDOUT_FILENO);
+ close_nointr(STDERR_FILENO);
+
+ r = open_terminal(console, O_RDWR);
+ if (r != STDIN_FILENO) {
+ if (r >= 0) {
+ safe_close(r);
+ r = -EINVAL;
+ }
+
+ return log_error_errno(r, "Failed to open console: %m");
+ }
+
+ if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
+ dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
+ return log_error_errno(errno, "Failed to duplicate console: %m");
+ }
+
+ r = reset_audit_loginuid();
+ if (r < 0)
+ return r;
+
+ /* Mark everything as slave, so that we still
+ * receive mounts from the real root, but don't
+ * propagate mounts to the real root. */
+ if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
+ return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
+
+ r = mount_devices(directory,
+ root_device, root_device_rw,
+ home_device, home_device_rw,
+ srv_device, srv_device_rw);
+ if (r < 0)
+ return r;
+
+ r = determine_uid_shift(directory);
+ if (r < 0)
+ return r;
+
+ /* Turn directory into bind mount */
+ if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
+ return log_error_errno(errno, "Failed to make bind mount: %m");
+
+ access("alive12", F_OK);
+
+ r = setup_volatile(directory);
+ if (r < 0)
+ return r;
+
+ access("alive3", F_OK);
+
+ r = setup_volatile_state(directory);
+ if (r < 0)
+ return r;
+
+ access("alive4", F_OK);
+
+ r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
+ if (r < 0)
+ return r;
+
+ access("alive5", F_OK);
+
+ if (arg_read_only) {
+ r = bind_remount_recursive(directory, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to make tree read-only: %m");
+ }
+
+ access("alive6", F_OK);
+
+ r = mount_all(directory, false);
+ if (r < 0)
+ return r;
+
+ if (copy_devnodes(directory) < 0)
+ return r;
+
+ dev_setup(directory, arg_uid_shift, arg_uid_shift);
+
+ if (setup_pts(directory) < 0)
+ return r;
+
+ r = setup_propagate(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_dev_console(directory, console);
+ if (r < 0)
+ return r;
+
+ r = setup_seccomp();
+ if (r < 0)
+ return r;
+
+ r = setup_timezone(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_resolv_conf(directory);
+ if (r < 0)
+ return r;
+
+ r = setup_journal(directory);
+ if (r < 0)
+ return r;
+
+ r = mount_custom(directory);
+ if (r < 0)
+ return r;
+
+ r = mount_cgroup(directory);
+ if (r < 0)
+ return r;
+
+ r = mount_move_root(directory);
+ if (r < 0)
+ return log_error_errno(r, "Failed to move root directory: %m");
+
+ pid = raw_clone(SIGCHLD|CLONE_NEWNS|
+ (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
+ (arg_private_network ? CLONE_NEWNET : 0) |
+ (arg_userns ? CLONE_NEWUSER : 0),
+ NULL);
+ if (pid < 0)
+ return log_error_errno(errno, "Failed to fork inner child: %m");
+
+ if (pid == 0) {
+ pid_socket = safe_close(pid_socket);
+
+ /* The inner child has all namespaces that are
+ * requested, so that we all are owned by the user if
+ * user namespaces are turned on. */
+
+ r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
+ if (r < 0)
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send PID: %m");
+ if (l != sizeof(pid)) {
+ log_error("Short write while sending PID.");
+ return -EIO;
+ }
+
+ pid_socket = safe_close(pid_socket);
+
+ return 0;
+}
+
+static int setup_uid_map(pid_t pid) {
+ char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+ int r;
+
+ assert(pid > 1);
+
+ xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
+ xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
+ r = write_string_file(uid_map, line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write UID map: %m");
+
+ /* We always assign the same UID and GID ranges */
+ xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
+ r = write_string_file(uid_map, line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write GID map: %m");
+
+ return 0;
+}
+
+static int chown_cgroup(pid_t pid) {
+ _cleanup_free_ char *path = NULL, *fs = NULL;
+ _cleanup_close_ int fd = -1;
+ const char *fn;
+ int r;
+
+ r = cg_pid_get_path(NULL, pid, &path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+ fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to open %s: %m", fs);
+
+ FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
+ if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
+ log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
+
+ return 0;
+}
+
int main(int argc, char *argv[]) {
_cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
@@ -4136,15 +4642,13 @@ int main(int argc, char *argv[]) {
goto finish;
}
- r = determine_uid_shift();
- if (r < 0)
- goto finish;
-
r = custom_mounts_prepare();
if (r < 0)
goto finish;
- interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
+ interactive =
+ isatty(STDIN_FILENO) > 0 &&
+ isatty(STDOUT_FILENO) > 0;
master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
if (master < 0) {
@@ -4174,14 +4678,25 @@ int main(int argc, char *argv[]) {
assert_se(sigemptyset(&mask_chld) == 0);
assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
+ if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
+ r = log_error_errno(errno, "Failed to become subreaper: %m");
+ goto finish;
+ }
+
for (;;) {
- _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
+ _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 };
ContainerStatus container_status;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
- struct sigaction sa = {
+ static const struct sigaction sa = {
.sa_handler = nop_handler,
.sa_flags = SA_NOCLDSTOP,
};
+ int ifi = 0;
+ ssize_t l;
+ _cleanup_event_unref_ sd_event *event = NULL;
+ _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+ _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
+ char last_char = 0;
r = barrier_create(&barrier);
if (r < 0) {
@@ -4199,6 +4714,11 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create pid socket pair: %m");
+ goto finish;
+ }
+
/* Child can be killed before execv(), so handle SIGCHLD
* in order to interrupt parent's blocking calls and
* give it a chance to call wait() and terminate. */
@@ -4214,9 +4734,7 @@ int main(int argc, char *argv[]) {
goto finish;
}
- pid = raw_clone(SIGCHLD|CLONE_NEWNS|
- (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
- (arg_private_network ? CLONE_NEWNET : 0), NULL);
+ pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
if (pid < 0) {
if (errno == EINVAL)
r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
@@ -4227,457 +4745,191 @@ int main(int argc, char *argv[]) {
}
if (pid == 0) {
- /* child */
- _cleanup_free_ char *home = NULL;
- unsigned n_env = 2;
- const char *envp[] = {
- "PATH=" DEFAULT_PATH_SPLIT_USR,
- "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
- NULL, /* TERM */
- NULL, /* HOME */
- NULL, /* USER */
- NULL, /* LOGNAME */
- NULL, /* container_uuid */
- NULL, /* LISTEN_FDS */
- NULL, /* LISTEN_PID */
- NULL
- };
- char **env_use;
-
+ /* The outer child only has a file system namespace. */
barrier_set_role(&barrier, BARRIER_CHILD);
- envp[n_env] = strv_find_prefix(environ, "TERM=");
- if (envp[n_env])
- n_env ++;
-
master = safe_close(master);
kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+ pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
reset_all_signal_handlers();
reset_signal_mask();
- if (interactive) {
- close_nointr(STDIN_FILENO);
- close_nointr(STDOUT_FILENO);
- close_nointr(STDERR_FILENO);
-
- r = open_terminal(console, O_RDWR);
- if (r != STDIN_FILENO) {
- if (r >= 0) {
- safe_close(r);
- r = -EINVAL;
- }
-
- log_error_errno(r, "Failed to open console: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
- dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
- log_error_errno(errno, "Failed to duplicate console: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
- if (setsid() < 0) {
- log_error_errno(errno, "setsid() failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (reset_audit_loginuid() < 0)
- _exit(EXIT_FAILURE);
-
- if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
- log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (arg_private_network)
- loopback_setup();
-
- /* Mark everything as slave, so that we still
- * receive mounts from the real root, but don't
- * propagate mounts to the real root. */
- if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
- log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (mount_devices(arg_directory,
- root_device, root_device_rw,
- home_device, home_device_rw,
- srv_device, srv_device_rw) < 0)
- _exit(EXIT_FAILURE);
-
- /* Turn directory into bind mount */
- if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
- log_error_errno(errno, "Failed to make bind mount: %m");
- _exit(EXIT_FAILURE);
- }
-
- r = setup_volatile(arg_directory);
- if (r < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_volatile_state(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- r = base_filesystem_create(arg_directory);
- if (r < 0)
- _exit(EXIT_FAILURE);
-
- if (arg_read_only) {
- r = bind_remount_recursive(arg_directory, true);
- if (r < 0) {
- log_error_errno(r, "Failed to make tree read-only: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
- if (mount_all(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (copy_devnodes(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_ptmx(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- dev_setup(arg_directory);
-
- if (setup_propagate(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_seccomp() < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_dev_console(arg_directory, console) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
- _exit(EXIT_FAILURE);
- kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
-
- if (send_rtnl(rtnl_socket_pair[1]) < 0)
- _exit(EXIT_FAILURE);
- rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
-
- /* Tell the parent that we are ready, and that
- * it can cgroupify us to that we lack access
- * to certain devices and resources. */
- (void) barrier_place(&barrier); /* #1 */
-
- if (setup_boot_id(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_timezone(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_resolv_conf(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (setup_journal(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- if (mount_custom(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- /* Wait until we are cgroup-ified, so that we
- * can mount the right cgroup path writable */
- (void) barrier_place_and_sync(&barrier); /* #2 */
-
- if (mount_cgroup(arg_directory) < 0)
- _exit(EXIT_FAILURE);
-
- r = mount_move_root(arg_directory);
- if (r < 0) {
- log_error_errno(r, "Failed to move root directory: %m");
- _exit(EXIT_FAILURE);
- }
-
- if (arg_userns) {
- if (unshare(CLONE_NEWUSER) < 0) {
- log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- /* Tell the parent, that it now can
- * write the UID map. */
- (void) barrier_place(&barrier); /* #3 */
-
- /* Wait until the parent wrote the UID
- * map */
- (void) barrier_place_and_sync(&barrier); /* #4 */
- }
-
- umask(0022);
-
- if (drop_capabilities() < 0) {
- log_error_errno(errno, "drop_capabilities() failed: %m");
- _exit(EXIT_FAILURE);
- }
-
- setup_hostname();
-
- if (arg_personality != 0xffffffffLU) {
- if (personality(arg_personality) < 0) {
- log_error_errno(errno, "personality() failed: %m");
- _exit(EXIT_FAILURE);
- }
- } else if (secondary) {
- if (personality(PER_LINUX32) < 0) {
- log_error_errno(errno, "personality() failed: %m");
- _exit(EXIT_FAILURE);
- }
- }
-
-#ifdef HAVE_SELINUX
- if (arg_selinux_context)
- if (setexeccon((security_context_t) arg_selinux_context) < 0) {
- log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
- _exit(EXIT_FAILURE);
- }
-#endif
-
- r = change_uid_gid(&home);
+ r = outer_child(&barrier,
+ arg_directory,
+ console,
+ root_device, root_device_rw,
+ home_device, home_device_rw,
+ srv_device, srv_device_rw,
+ interactive,
+ secondary,
+ pid_socket_pair[1],
+ kmsg_socket_pair[1],
+ rtnl_socket_pair[1],
+ fds,
+ argc, argv);
if (r < 0)
_exit(EXIT_FAILURE);
- if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
- (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
- (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
-
- if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
- char as_uuid[37];
-
- if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
- }
-
- if (fdset_size(fds) > 0) {
- r = fdset_cloexec(fds, false);
- if (r < 0) {
- log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
- _exit(EXIT_FAILURE);
- }
-
- if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
- (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
- }
-
- if (!strv_isempty(arg_setenv)) {
- char **n;
-
- n = strv_env_merge(2, envp, arg_setenv);
- if (!n) {
- log_oom();
- _exit(EXIT_FAILURE);
- }
-
- env_use = n;
- } else
- env_use = (char**) envp;
-
- /* Let the parent know that we are ready and
- * wait until the parent is ready with the
- * setup, too... */
- (void) barrier_place_and_sync(&barrier); /* #5 */
-
- /* Now, explicitly close the log, so that we
- * then can close all remaining fds. Closing
- * the log explicitly first has the benefit
- * that the logging subsystem knows about it,
- * and is thus ready to be reopened should we
- * need it again. Note that the other fds
- * closed here are at least the locking and
- * barrier fds. */
- log_close();
- (void) fdset_close_others(fds);
-
- if (arg_boot) {
- char **a;
- size_t l;
-
- /* Automatically search for the init system */
-
- l = 1 + argc - optind;
- a = newa(char*, l + 1);
- memcpy(a + 1, argv + optind, l * sizeof(char*));
-
- a[0] = (char*) "/usr/lib/systemd/systemd";
- execve(a[0], a, env_use);
-
- a[0] = (char*) "/lib/systemd/systemd";
- execve(a[0], a, env_use);
-
- a[0] = (char*) "/sbin/init";
- execve(a[0], a, env_use);
- } else if (argc > optind)
- execvpe(argv[optind], argv + optind, env_use);
- else {
- chdir(home ? home : "/root");
- execle("/bin/bash", "-bash", NULL, env_use);
- execle("/bin/sh", "-sh", NULL, env_use);
- }
-
- (void) log_open();
- log_error_errno(errno, "execv() failed: %m");
- _exit(EXIT_FAILURE);
+ _exit(EXIT_SUCCESS);
}
barrier_set_role(&barrier, BARRIER_PARENT);
+
fdset_free(fds);
fds = NULL;
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+ pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
- (void) barrier_place(&barrier); /* #1 */
+ /* Wait for the outer child. */
+ r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
+ if (r < 0)
+ goto finish;
+ if (r != 0) {
+ r = -EIO;
+ goto finish;
+ }
+ pid = 0;
- /* Wait for the most basic Child-setup to be done,
- * before we add hardware to it, and place it in a
- * cgroup. */
- if (barrier_sync(&barrier)) { /* #1 */
- int ifi = 0;
+ /* And now retrieve the PID of the inner child. */
+ l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
+ if (l < 0) {
+ r = log_error_errno(errno, "Failed to read inner child PID: %m");
+ goto finish;
+ }
+ if (l != sizeof(pid)) {
+ log_error("Short read while reading inner child PID: %m");
+ r = EIO;
+ goto finish;
+ }
- r = move_network_interfaces(pid);
- if (r < 0)
- goto finish;
+ log_debug("Init process invoked as PID " PID_FMT, pid);
- r = setup_veth(pid, veth_name, &ifi);
- if (r < 0)
+ if (arg_userns) {
+ if (!barrier_place_and_sync(&barrier)) { /* #1 */
+ log_error("Child died too early.");
+ r = -ESRCH;
goto finish;
+ }
- r = setup_bridge(veth_name, &ifi);
+ r = setup_uid_map(pid);
if (r < 0)
goto finish;
- r = setup_macvlan(pid);
- if (r < 0)
- goto finish;
+ (void) barrier_place(&barrier); /* #2 */
+ }
- r = setup_ipvlan(pid);
- if (r < 0)
- goto finish;
+ r = move_network_interfaces(pid);
+ if (r < 0)
+ goto finish;
- r = register_machine(pid, ifi);
- if (r < 0)
- goto finish;
+ r = setup_veth(pid, veth_name, &ifi);
+ if (r < 0)
+ goto finish;
- /* Notify the child that the parent is ready with all
- * its setup, and that the child can now hand over
- * control to the code to run inside the container. */
- (void) barrier_place(&barrier); /* #2 */
+ r = setup_bridge(veth_name, &ifi);
+ if (r < 0)
+ goto finish;
- if (arg_userns) {
- char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+ r = setup_macvlan(pid);
+ if (r < 0)
+ goto finish;
- (void) barrier_place_and_sync(&barrier); /* #3 */
+ r = setup_ipvlan(pid);
+ if (r < 0)
+ goto finish;
- xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
- xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
- r = write_string_file(uid_map, line);
- if (r < 0) {
- log_error_errno(r, "Failed to write UID map: %m");
- goto finish;
- }
+ r = register_machine(pid, ifi);
+ if (r < 0)
+ goto finish;
- /* We always assign the same UID and GID ranges */
- xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
- r = write_string_file(uid_map, line);
- if (r < 0) {
- log_error_errno(r, "Failed to write GID map: %m");
- goto finish;
- }
+ r = chown_cgroup(pid);
+ if (r < 0)
+ goto finish;
- (void) barrier_place(&barrier); /* #4 */
- }
+ /* Notify the child that the parent is ready with all
+ * its setup (including cgroup-ification), and that
+ * the child can now hand over control to the code to
+ * run inside the container. */
+ (void) barrier_place(&barrier); /* #3 */
- /* Block SIGCHLD here, before notifying child.
- * process_pty() will handle it with the other signals. */
- r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
- if (r < 0)
- goto finish;
+ /* Block SIGCHLD here, before notifying child.
+ * process_pty() will handle it with the other signals. */
+ assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
- /* Reset signal to default */
- r = default_signals(SIGCHLD, -1);
- if (r < 0)
- goto finish;
+ /* Reset signal to default */
+ r = default_signals(SIGCHLD, -1);
+ if (r < 0) {
+ log_error_errno(r, "Failed to reset SIGCHLD: %m");
+ goto finish;
+ }
- /* Let the child know that we are ready and wait that the child is completely ready now. */
- if (barrier_place_and_sync(&barrier)) { /* #5 */
- _cleanup_event_unref_ sd_event *event = NULL;
- _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
- _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
- char last_char = 0;
+ /* Let the child know that we are ready and wait that the child is completely ready now. */
+ if (!barrier_place_and_sync(&barrier)) { /* #5 */
+ log_error("Client died too early.");
+ r = -ESRCH;
+ goto finish;
+ }
- sd_notifyf(false,
- "READY=1\n"
- "STATUS=Container running.\n"
- "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
+ sd_notifyf(false,
+ "READY=1\n"
+ "STATUS=Container running.\n"
+ "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
- r = sd_event_new(&event);
- if (r < 0) {
- log_error_errno(r, "Failed to get default event source: %m");
- goto finish;
- }
+ r = sd_event_new(&event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get default event source: %m");
+ goto finish;
+ }
- if (arg_kill_signal > 0) {
- /* Try to kill the init system on SIGINT or SIGTERM */
- sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
- sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
- } else {
- /* Immediately exit */
- sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
- sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
- }
+ if (arg_kill_signal > 0) {
+ /* Try to kill the init system on SIGINT or SIGTERM */
+ sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
+ sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
+ } else {
+ /* Immediately exit */
+ sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+ sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+ }
- /* simply exit on sigchld */
- sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
+ /* simply exit on sigchld */
+ sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
- if (arg_expose_ports) {
- r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
- if (r < 0)
- goto finish;
+ if (arg_expose_ports) {
+ r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
+ if (r < 0)
+ goto finish;
- (void) expose_ports(rtnl, &exposed);
- }
+ (void) expose_ports(rtnl, &exposed);
+ }
- rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+ rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- r = pty_forward_new(event, master, true, !interactive, &forward);
- if (r < 0) {
- log_error_errno(r, "Failed to create PTY forwarder: %m");
- goto finish;
- }
+ r = pty_forward_new(event, master, true, !interactive, &forward);
+ if (r < 0) {
+ log_error_errno(r, "Failed to create PTY forwarder: %m");
+ goto finish;
+ }
- r = sd_event_loop(event);
- if (r < 0) {
- log_error_errno(r, "Failed to run event loop: %m");
- goto finish;
- }
+ r = sd_event_loop(event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to run event loop: %m");
+ goto finish;
+ }
- pty_forward_get_last_char(forward, &last_char);
+ pty_forward_get_last_char(forward, &last_char);
- forward = pty_forward_free(forward);
+ forward = pty_forward_free(forward);
- if (!arg_quiet && last_char != '\n')
- putc('\n', stdout);
+ if (!arg_quiet && last_char != '\n')
+ putc('\n', stdout);
- /* Kill if it is not dead yet anyway */
- terminate_machine(pid);
- }
- }
+ /* Kill if it is not dead yet anyway */
+ terminate_machine(pid);
/* Normally redundant, but better safe than sorry */
kill(pid, SIGKILL);
@@ -4723,11 +4975,11 @@ finish:
"STOPPING=1\n"
"STATUS=Terminating...");
- loop_remove(loop_nr, &image_fd);
-
if (pid > 0)
kill(pid, SIGKILL);
+ loop_remove(loop_nr, &image_fd);
+
if (remove_subvol && arg_directory) {
int k;
diff --git a/src/shared/base-filesystem.c b/src/shared/base-filesystem.c
index 11e0947407..ab6fc171b0 100644
--- a/src/shared/base-filesystem.c
+++ b/src/shared/base-filesystem.c
@@ -41,13 +41,16 @@ static const BaseFilesystem table[] = {
{ "lib", 0, "usr/lib\0", NULL },
{ "root", 0755, NULL, NULL },
{ "sbin", 0, "usr/sbin\0", NULL },
+ { "usr", 0755, NULL, NULL },
+ { "var", 0755, NULL, NULL },
+ { "etc", 0755, NULL, NULL },
#if defined(__i386__) || defined(__x86_64__)
{ "lib64", 0, "usr/lib/x86_64-linux-gnu\0"
"usr/lib64\0", "ld-linux-x86-64.so.2" },
#endif
};
-int base_filesystem_create(const char *root) {
+int base_filesystem_create(const char *root, uid_t uid, gid_t gid) {
_cleanup_close_ int fd = -1;
unsigned i;
int r = 0;
@@ -90,6 +93,12 @@ int base_filesystem_create(const char *root) {
r = symlinkat(target, fd, table[i].dir);
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create symlink at %s/%s: %m", root, table[i].dir);
+
+ if (uid != UID_INVALID || gid != UID_INVALID) {
+ if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
+ return log_error_errno(errno, "Failed to chown symlink at %s/%s: %m", root, table[i].dir);
+ }
+
continue;
}
@@ -97,6 +106,11 @@ int base_filesystem_create(const char *root) {
r = mkdirat(fd, table[i].dir, table[i].mode);
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create directory at %s/%s: %m", root, table[i].dir);
+
+ if (uid != UID_INVALID || gid != UID_INVALID) {
+ if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
+ return log_error_errno(errno, "Failed to chown directory at %s/%s: %m", root, table[i].dir);
+ }
}
return 0;
diff --git a/src/shared/base-filesystem.h b/src/shared/base-filesystem.h
index 03201f7083..39a496090f 100644
--- a/src/shared/base-filesystem.h
+++ b/src/shared/base-filesystem.h
@@ -21,4 +21,6 @@
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
-int base_filesystem_create(const char *root);
+#include <sys/types.h>
+
+int base_filesystem_create(const char *root, uid_t uid, gid_t gid);
diff --git a/src/shared/dev-setup.c b/src/shared/dev-setup.c
index cb15da8a5d..25ad918b85 100644
--- a/src/shared/dev-setup.c
+++ b/src/shared/dev-setup.c
@@ -23,13 +23,12 @@
#include <stdlib.h>
#include <unistd.h>
-#include "dev-setup.h"
#include "util.h"
#include "label.h"
+#include "path-util.h"
+#include "dev-setup.h"
-int dev_setup(const char *prefix) {
- const char *j, *k;
-
+int dev_setup(const char *prefix, uid_t uid, gid_t gid) {
static const char symlinks[] =
"-/proc/kcore\0" "/dev/core\0"
"/proc/self/fd\0" "/dev/fd\0"
@@ -37,7 +36,13 @@ int dev_setup(const char *prefix) {
"/proc/self/fd/1\0" "/dev/stdout\0"
"/proc/self/fd/2\0" "/dev/stderr\0";
+ const char *j, *k;
+ int r;
+
NULSTR_FOREACH_PAIR(j, k, symlinks) {
+ _cleanup_free_ char *link_name = NULL;
+ const char *n;
+
if (j[0] == '-') {
j++;
@@ -46,15 +51,21 @@ int dev_setup(const char *prefix) {
}
if (prefix) {
- _cleanup_free_ char *link_name = NULL;
-
- link_name = strjoin(prefix, "/", k, NULL);
+ link_name = prefix_root(prefix, k);
if (!link_name)
return -ENOMEM;
- symlink_label(j, link_name);
+ n = link_name;
} else
- symlink_label(j, k);
+ n = k;
+
+ r = symlink_label(j, n);
+ if (r < 0)
+ log_debug_errno(r, "Failed to symlink %s to %s: %m", j, n);
+
+ if (uid != UID_INVALID || gid != GID_INVALID)
+ if (lchown(n, uid, gid) < 0)
+ log_debug_errno(errno, "Failed to chown %s: %m", n);
}
return 0;
diff --git a/src/shared/dev-setup.h b/src/shared/dev-setup.h
index d41b6eefba..ab2748db7f 100644
--- a/src/shared/dev-setup.h
+++ b/src/shared/dev-setup.h
@@ -21,4 +21,6 @@
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
-int dev_setup(const char *pathprefix);
+#include <sys/types.h>
+
+int dev_setup(const char *prefix, uid_t uid, gid_t gid);
diff --git a/src/shared/switch-root.c b/src/shared/switch-root.c
index ae3839de16..b12189cd10 100644
--- a/src/shared/switch-root.c
+++ b/src/shared/switch-root.c
@@ -105,7 +105,7 @@ int switch_root(const char *new_root, const char *oldroot, bool detach_oldroot,
* to look like. They might even boot, if they are RO and
* don't have the FS layout. Just ignore the error and
* switch_root() nevertheless. */
- (void) base_filesystem_create(new_root);
+ (void) base_filesystem_create(new_root, UID_INVALID, GID_INVALID);
if (chdir(new_root) < 0)
return log_error_errno(errno, "Failed to change directory to %s: %m", new_root);
diff --git a/src/udev/udevd.c b/src/udev/udevd.c
index 19640cb6ea..afd4640ad1 100644
--- a/src/udev/udevd.c
+++ b/src/udev/udevd.c
@@ -1484,7 +1484,7 @@ int main(int argc, char *argv[]) {
goto exit;
}
- dev_setup(NULL);
+ dev_setup(NULL, UID_INVALID, GID_INVALID);
r = manager_new(&manager);
if (r < 0)