summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/CGROUP_DELEGATION.md4
-rw-r--r--man/org.freedesktop.systemd1.xml28
-rw-r--r--man/systemd.resource-control.xml32
-rw-r--r--src/basic/cgroup-util.c99
-rw-r--r--src/basic/cgroup-util.h3
-rw-r--r--src/core/cgroup.c47
-rw-r--r--src/core/cgroup.h3
-rw-r--r--src/core/dbus-cgroup.c28
-rw-r--r--src/core/execute.c53
-rw-r--r--src/core/load-fragment-gperf.gperf.in1
-rw-r--r--src/core/load-fragment.c36
-rw-r--r--src/core/load-fragment.h1
-rw-r--r--src/core/unit-printf.c20
-rw-r--r--src/nspawn/nspawn-cgroup.c21
-rw-r--r--src/shared/bus-unit-util.c3
-rw-r--r--src/shared/cgroup-setup.c76
-rw-r--r--src/shared/cgroup-setup.h1
-rw-r--r--src/test/test-cgroup-util.c15
-rw-r--r--src/udev/udevd.c59
-rwxr-xr-xtest/units/testsuite-19.sh18
-rw-r--r--units/systemd-nspawn@.service.in1
-rw-r--r--units/systemd-udevd.service.in1
-rw-r--r--units/user@.service.in1
23 files changed, 398 insertions, 153 deletions
diff --git a/docs/CGROUP_DELEGATION.md b/docs/CGROUP_DELEGATION.md
index f5509fb833..4210a75767 100644
--- a/docs/CGROUP_DELEGATION.md
+++ b/docs/CGROUP_DELEGATION.md
@@ -271,7 +271,9 @@ your service has any of these four settings set, you must be prepared that a
means that your service code should have moved itself further down the cgroup
tree by the time it notifies the service manager about start-up readiness, so
that the service's main cgroup is definitely an inner node by the time the
-service manager might start `ExecStartPost=`.)
+service manager might start `ExecStartPost=`. Starting with systemd 254 you may
+also use `DelegateSubgroup=` to let the service manager put your initial
+service process into a subgroup right away.)
(Also note, if you intend to use "threaded" cgroups — as added in Linux 4.14 —,
then you should do that *two* levels down from the main service cgroup your
diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index e462c60636..f2e892671a 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2713,6 +2713,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -3942,6 +3944,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -4544,6 +4548,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
memory controller is reached. It will take into consideration limits on all parent slices, other than the
limits set on the unit itself.</para>
+ <para><varname>DelegateSubgroup</varname> contains the cgroup subgroup to place invoked unit processes
+ in. As configured by the option of the same name in unit files. This is set to the empty string when it
+ does not apply or no subgroup has been configured.</para>
+
<para><varname>RuntimeDirectorySymlink</varname>, <varname>StateDirectorySymlink</varname>,
<varname>CacheDirectorySymlink</varname> and <varname>LogsDirectorySymlink</varname> respectively
implement the destination parameter of the unit files settings <varname>RuntimeDirectory</varname>,
@@ -4715,6 +4723,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -5936,6 +5946,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -6588,6 +6600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -7655,6 +7669,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -8434,6 +8450,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -9473,6 +9491,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -10111,6 +10131,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -10456,6 +10478,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
@@ -10656,6 +10680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DelegateControllers = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s DelegateSubgroup = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly b CPUAccounting = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t CPUWeight = ...;
@@ -11051,6 +11077,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/>
<variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/>
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
index f4e4a492a0..610c11feb3 100644
--- a/man/systemd.resource-control.xml
+++ b/man/systemd.resource-control.xml
@@ -1148,10 +1148,11 @@ DeviceAllow=/dev/loop-control
<term><varname>Delegate=</varname></term>
<listitem>
- <para>Turns on delegation of further resource control partitioning to processes of the unit. Units where this
- is enabled may create and manage their own private subhierarchy of control groups below the control group of
- the unit itself. For unprivileged services (i.e. those using the <varname>User=</varname> setting) the unit's
- control group will be made accessible to the relevant user.</para>
+ <para>Turns on delegation of further resource control partitioning to processes of the unit. Units
+ where this is enabled may create and manage their own private subhierarchy of control groups below
+ the control group of the unit itself. For unprivileged services (i.e. those using the
+ <varname>User=</varname> setting) the unit's control group will be made accessible to the relevant
+ user.</para>
<para>When enabled the service manager will refrain from manipulating control groups or moving
processes below the unit's control group, so that a clear concept of ownership is established: the
@@ -1189,6 +1190,29 @@ DeviceAllow=/dev/loop-control
</varlistentry>
<varlistentry>
+ <term><varname>DelegateSubgroup=</varname></term>
+
+ <listitem>
+ <para>Place unit processes in the specified subgroup of the unit's control group. Takes a valid
+ control group name (not a path!) as parameter, or an empty string to turn this feature
+ off. Defaults to off. The control group name must be usable as filename and avoid conflicts with
+ the kernel's control group attribute files (i.e. <filename>cgroup.procs</filename> is not an
+ acceptable name, since the kernel exposes a native control group attribute file by that name). This
+ option has no effect unless control group delegation is turned on via <varname>Delegate=</varname>,
+ see above. Note that this setting only applies to "main" processes of a unit, i.e. for services to
+ <varname>ExecStart=</varname>, but not for <varname>ExecReload=</varname> and similar. If
+ delegation is enabled, the latter are always placed inside a subgroup named
+ <filename>.control</filename>. The specified subgroup is automatically created (and potentially
+ ownership is passed to the unit's configured user/group) when a process is started in it.</para>
+
+ <para>This option is useful to avoid manually moving the invoked process into a subgroup after it
+ has been started. Since no processes should live in inner nodes of the control group tree it's
+ almost always necessary to run the main ("supervising") process of a unit that has delegation
+ turned on in a subgroup.</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><varname>DisableControllers=</varname></term>
<listitem>
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 90877c9fa1..21e2255dae 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -1554,52 +1554,64 @@ int cg_pid_get_user_slice(pid_t pid, char **slice) {
return cg_path_get_user_slice(cgroup, slice);
}
-char *cg_escape(const char *p) {
- bool need_prefix = false;
-
- /* This implements very minimal escaping for names to be used
- * as file names in the cgroup tree: any name which might
- * conflict with a kernel name or is prefixed with '_' is
- * prefixed with a '_'. That way, when reading cgroup names it
- * is sufficient to remove a single prefixing underscore if
- * there is one. */
-
- /* The return value of this function (unlike cg_unescape())
- * needs free()! */
-
- if (IN_SET(p[0], 0, '_', '.') ||
- STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
- startswith(p, "cgroup."))
- need_prefix = true;
- else {
- const char *dot;
+bool cg_needs_escape(const char *p) {
- dot = strrchr(p, '.');
- if (dot) {
- CGroupController c;
- size_t l = dot - p;
+ /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note
+ * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if
+ * they shall be used. Also note that various names cannot be made valid by escaping even if we
+ * return true here (because too long, or contain the forbidden character "/"). */
- for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
- const char *n;
+ if (!filename_is_valid(p))
+ return true;
- n = cgroup_controller_to_string(c);
+ if (IN_SET(p[0], '_', '.'))
+ return true;
- if (l != strlen(n))
- continue;
+ if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks"))
+ return true;
- if (memcmp(p, n, l) != 0)
- continue;
+ if (startswith(p, "cgroup."))
+ return true;
- need_prefix = true;
- break;
- }
- }
+ for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+ const char *q;
+
+ q = startswith(p, cgroup_controller_to_string(c));
+ if (!q)
+ continue;
+
+ if (q[0] == '.')
+ return true;
}
- if (need_prefix)
- return strjoin("_", p);
+ return false;
+}
+
+int cg_escape(const char *p, char **ret) {
+ _cleanup_free_ char *n = NULL;
+
+ /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any
+ * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That
+ * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there
+ * is one. */
+
+ /* The return value of this function (unlike cg_unescape()) needs free()! */
- return strdup(p);
+ if (cg_needs_escape(p)) {
+ n = strjoin("_", p);
+ if (!n)
+ return -ENOMEM;
+
+ if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */
+ return -EINVAL;
+ } else {
+ n = strdup(p);
+ if (!n)
+ return -ENOMEM;
+ }
+
+ *ret = TAKE_PTR(n);
+ return 0;
}
char *cg_unescape(const char *p) {
@@ -1698,9 +1710,9 @@ int cg_slice_to_path(const char *unit, char **ret) {
if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
return -EINVAL;
- escaped = cg_escape(n);
- if (!escaped)
- return -ENOMEM;
+ r = cg_escape(n, &escaped);
+ if (r < 0)
+ return r;
if (!strextend(&s, escaped, "/"))
return -ENOMEM;
@@ -1708,15 +1720,14 @@ int cg_slice_to_path(const char *unit, char **ret) {
dash = strchr(dash+1, '-');
}
- e = cg_escape(unit);
- if (!e)
- return -ENOMEM;
+ r = cg_escape(unit, &e);
+ if (r < 0)
+ return r;
if (!strextend(&s, e))
return -ENOMEM;
*ret = TAKE_PTR(s);
-
return 0;
}
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index 5de000c4ce..9b30ae0396 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -279,7 +279,8 @@ int cg_pid_get_user_slice(pid_t pid, char **slice);
int cg_path_decode_unit(const char *cgroup, char **unit);
-char *cg_escape(const char *p);
+bool cg_needs_escape(const char *p);
+int cg_escape(const char *p, char **ret);
char *cg_unescape(const char *p) _pure_;
bool cg_controller_is_valid(const char *p);
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 8b178ea29f..4ec5dfa587 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -293,6 +293,8 @@ void cgroup_context_done(CGroupContext *c) {
cpu_set_reset(&c->startup_cpuset_cpus);
cpu_set_reset(&c->cpuset_mems);
cpu_set_reset(&c->startup_cpuset_mems);
+
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
}
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
@@ -570,6 +572,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, managed_oom_preference_to_string(c->moom_preference),
prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch));
+ if (c->delegate_subgroup)
+ fprintf(f, "%sDelegateSubgroup: %s\n",
+ prefix, c->delegate_subgroup);
+
if (c->memory_pressure_threshold_usec != USEC_INFINITY)
fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
@@ -2087,28 +2093,37 @@ static const char *migrate_callback(CGroupMask mask, void *userdata) {
return strempty(unit_get_realized_cgroup_path(userdata, mask));
}
-char *unit_default_cgroup_path(const Unit *u) {
- _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
- Unit *slice;
+int unit_default_cgroup_path(const Unit *u, char **ret) {
+ _cleanup_free_ char *p = NULL;
int r;
assert(u);
+ assert(ret);
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
- return strdup(u->manager->cgroup_root);
+ p = strdup(u->manager->cgroup_root);
+ else {
+ _cleanup_free_ char *escaped = NULL, *slice_path = NULL;
+ Unit *slice;
- slice = UNIT_GET_SLICE(u);
- if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
- r = cg_slice_to_path(slice->id, &slice_path);
+ slice = UNIT_GET_SLICE(u);
+ if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) {
+ r = cg_slice_to_path(slice->id, &slice_path);
+ if (r < 0)
+ return r;
+ }
+
+ r = cg_escape(u->id, &escaped);
if (r < 0)
- return NULL;
- }
+ return r;
- escaped = cg_escape(u->id);
- if (!escaped)
- return NULL;
+ p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
+ }
+ if (!p)
+ return -ENOMEM;
- return path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped);
+ *ret = TAKE_PTR(p);
+ return 0;
}
int unit_set_cgroup_path(Unit *u, const char *path) {
@@ -2264,9 +2279,9 @@ int unit_pick_cgroup_path(Unit *u) {
if (!UNIT_HAS_CGROUP_CONTEXT(u))
return -EINVAL;
- path = unit_default_cgroup_path(u);
- if (!path)
- return log_oom();
+ r = unit_default_cgroup_path(u, &path);
+ if (r < 0)
+ return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m");
r = unit_set_cgroup_path(u, path);
if (r == -EEXIST)
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index 8e1f093901..bbbf9408cc 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -133,6 +133,7 @@ struct CGroupContext {
bool delegate;
CGroupMask delegate_controllers;
CGroupMask disable_controllers;
+ char *delegate_subgroup;
/* For unified hierarchy */
uint64_t cpu_weight;
@@ -290,7 +291,7 @@ void unit_invalidate_cgroup_members_masks(Unit *u);
void unit_add_family_to_cgroup_realize_queue(Unit *u);
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
-char *unit_default_cgroup_path(const Unit *u);
+int unit_default_cgroup_path(const Unit *u, char **ret);
int unit_set_cgroup_path(Unit *u, const char *path);
int unit_pick_cgroup_path(Unit *u);
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
index 3a02fcbdb1..682ad5edd5 100644
--- a/src/core/dbus-cgroup.c
+++ b/src/core/dbus-cgroup.c
@@ -435,6 +435,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0),
SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0),
+ SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0),
SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0),
SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0),
SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0),
@@ -536,6 +537,33 @@ static int bus_cgroup_set_transient_property(
return 1;
+ } else if (streq(name, "DelegateSubgroup")) {
+ const char *s;
+
+ if (!UNIT_VTABLE(u)->can_delegate)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type");
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ if (!isempty(s) && cg_needs_escape(s))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ if (isempty(s))
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+ else {
+ r = free_and_strdup_warn(&c->delegate_subgroup, s);
+ if (r < 0)
+ return r;
+ }
+
+ unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s);
+ }
+
+ return 1;
+
} else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) {
CGroupMask mask = 0;
diff --git a/src/core/execute.c b/src/core/execute.c
index 60f7a6439c..b32e341b61 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -4178,8 +4178,12 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p
return 0;
}
-static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
- bool using_subcgroup;
+static int exec_parameters_get_cgroup_path(
+ const ExecParameters *params,
+ const CGroupContext *c,
+ char **ret) {
+
+ const char *subgroup = NULL;
char *p;
assert(params);
@@ -4197,16 +4201,22 @@ static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **
* this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
* flag, which is only passed for the former statements, not for the latter. */
- using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
- if (using_subcgroup)
- p = path_join(params->cgroup_path, ".control");
+ if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
+ if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
+ subgroup = ".control";
+ else
+ subgroup = c->delegate_subgroup;
+ }
+
+ if (subgroup)
+ p = path_join(params->cgroup_path, subgroup);
else
p = strdup(params->cgroup_path);
if (!p)
return -ENOMEM;
*ret = p;
- return using_subcgroup;
+ return !!subgroup;
}
static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
@@ -4705,7 +4715,7 @@ static int exec_child(
if (params->cgroup_path) {
_cleanup_free_ char *p = NULL;
- r = exec_parameters_get_cgroup_path(params, &p);
+ r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
@@ -4880,11 +4890,26 @@ static int exec_child(
* touch a single hierarchy too. */
if (params->flags & EXEC_CGROUP_DELEGATE) {
+ _cleanup_free_ char *p = NULL;
+
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
}
+
+ r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
+ }
+ if (r > 0) {
+ r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
+ }
+ }
}
if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
@@ -5635,18 +5660,16 @@ int exec_spawn(Unit *unit,
log_command_line(unit, "About to execute", command->path, command->argv);
if (params->cgroup_path) {
- r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
+ r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
- if (r > 0) { /* We are using a child cgroup */
+ if (r > 0) {
+ /* If there's a subcgroup, then let's create it here now (the main cgroup was already
+ * realized by the unit logic) */
+
r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
if (r < 0)
- return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
-
- /* Normally we would not propagate the xattrs to children but since we created this
- * sub-cgroup internally we should do it. */
- cgroup_oomd_xattr_apply(unit, subcgroup_path);
- cgroup_log_xattr_apply(unit, subcgroup_path);
+ return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
}
}
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index 8a2823b075..110bccb7ad 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -237,6 +237,7 @@
{{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting)
{{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max)
{{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context)
+{{type}}.DelegateSubgroup, config_parse_delegate_subgroup , 0, offsetof({{type}}, cgroup_context)
{{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context)
{{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting)
{{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 581a051d46..ebfe98d7cc 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -4029,6 +4029,42 @@ int config_parse_delegate(
return 0;
}
+int config_parse_delegate_subgroup(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ CGroupContext *c = ASSERT_PTR(data);
+ UnitType t;
+
+ t = unit_name_to_type(unit);
+ assert(t >= 0);
+
+ if (!unit_vtable[t]->can_delegate) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring.");
+ return 0;
+ }
+
+ if (isempty(rvalue)) {
+ c->delegate_subgroup = mfree(c->delegate_subgroup);
+ return 0;
+ }
+
+ if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ return free_and_strdup_warn(&c->delegate_subgroup, rvalue);
+}
+
int config_parse_managed_oom_mode(
const char *unit,
const char *filename,
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index a38d697338..59f02a3207 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -83,6 +83,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c
index 1b267d4fdd..3977082cc1 100644
--- a/src/core/unit-printf.c
+++ b/src/core/unit-printf.c
@@ -86,19 +86,21 @@ static void bad_specifier(const Unit *u, char specifier) {
static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
const Unit *u = ASSERT_PTR(userdata);
- char *n;
bad_specifier(u, specifier);
- if (u->cgroup_path)
+ if (u->cgroup_path) {
+ char *n;
+
n = strdup(u->cgroup_path);
- else
- n = unit_default_cgroup_path(u);
- if (!n)
- return -ENOMEM;
+ if (!n)
+ return -ENOMEM;
- *ret = n;
- return 0;
+ *ret = n;
+ return 0;
+ }
+
+ return unit_default_cgroup_path(u, ret);
}
static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
@@ -126,7 +128,7 @@ static int specifier_cgroup_slice(char specifier, const void *data, const char *
if (slice->cgroup_path)
n = strdup(slice->cgroup_path);
else
- n = unit_default_cgroup_path(slice);
+ return unit_default_cgroup_path(slice, ret);
} else
n = strdup(u->manager->cgroup_root);
if (!n)
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 0deb4ebb30..a9d36627a8 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -141,9 +141,9 @@ finish:
}
int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
- _cleanup_free_ char *cgroup = NULL;
+ _cleanup_free_ char *cgroup = NULL, *payload = NULL;
CGroupMask supported;
- const char *payload;
+ char *e;
int r;
assert(pid > 1);
@@ -174,15 +174,26 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested)
if (r < 0)
return log_error_errno(r, "Failed to get our control group: %m");
- payload = strjoina(cgroup, "/payload");
+ /* If the service manager already placed us in the supervisor cgroup, let's handle that. */
+ e = endswith(cgroup, "/supervisor");
+ if (e)
+ *e = 0; /* chop off, we want the main path delegated to us */
+
+ payload = path_join(cgroup, "payload");
+ if (!payload)
+ return log_oom();
+
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
if (keep_unit) {
- const char *supervisor;
+ _cleanup_free_ char *supervisor = NULL;
+
+ supervisor = path_join(cgroup, "supervisor");
+ if (!supervisor)
+ return log_oom();
- supervisor = strjoina(cgroup, "/supervisor");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index ebbd1f7f28..a321179609 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -461,7 +461,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
"ManagedOOMSwap",
"ManagedOOMMemoryPressure",
"ManagedOOMPreference",
- "MemoryPressureWatch"))
+ "MemoryPressureWatch",
+ "DelegateSubgroup"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {
diff --git a/src/shared/cgroup-setup.c b/src/shared/cgroup-setup.c
index 65be851014..6f3f9dd939 100644
--- a/src/shared/cgroup-setup.c
+++ b/src/shared/cgroup-setup.c
@@ -483,6 +483,82 @@ int cg_set_access(
return 0;
}
+struct access_callback_data {
+ uid_t uid;
+ gid_t gid;
+ int error;
+};
+
+static int access_callback(
+ RecurseDirEvent event,
+ const char *path,
+ int dir_fd,
+ int inode_fd,
+ const struct dirent *de,
+ const struct statx *sx,
+ void *userdata) {
+
+ struct access_callback_data *d = ASSERT_PTR(userdata);
+
+ if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
+ return RECURSE_DIR_CONTINUE;
+
+ assert(inode_fd >= 0);
+
+ /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */
+ if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) {
+ log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path));
+
+ if (d->error == 0) /* Return last error to caller */
+ d->error = errno;
+ }
+
+ return RECURSE_DIR_CONTINUE;
+}
+
+int cg_set_access_recursive(
+ const char *controller,
+ const char *path,
+ uid_t uid,
+ gid_t gid) {
+
+ _cleanup_close_ int fd = -EBADF;
+ _cleanup_free_ char *fs = NULL;
+ int r;
+
+ /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files,
+ * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to
+ * delegate, and cg_set_access_recursive() for any subcrgoups you might want to create below it. */
+
+ if (!uid_is_valid(uid) && !gid_is_valid(gid))
+ return 0;
+
+ r = cg_get_path(controller, path, NULL, &fs);
+ if (r < 0)
+ return r;
+
+ fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+ if (fd < 0)
+ return -errno;
+
+ struct access_callback_data d = {
+ .uid = uid,
+ .gid = gid,
+ };
+
+ r = recurse_dir(fd,
+ fs,
+ /* statx_mask= */ 0,
+ /* n_depth_max= */ UINT_MAX,
+ RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL,
+ access_callback,
+ &d);
+ if (r < 0)
+ return r;
+
+ return -d.error;
+}
+
int cg_migrate(
const char *cfrom,
const char *pfrom,
diff --git a/src/shared/cgroup-setup.h b/src/shared/cgroup-setup.h
index 13f836bd90..1b6f0716c6 100644
--- a/src/shared/cgroup-setup.h
+++ b/src/shared/cgroup-setup.h
@@ -24,6 +24,7 @@ int cg_attach_fallback(const char *controller, const char *path, pid_t pid);
int cg_create_and_attach(const char *controller, const char *path, pid_t pid);
int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
+int cg_set_access_recursive(const char *controller, const char *path, uid_t uid, gid_t gid);
int cg_migrate(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags);
int cg_migrate_recursive(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags);
diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c
index cdf911926c..4f430a1df2 100644
--- a/src/test/test-cgroup-util.c
+++ b/src/test/test-cgroup-util.c
@@ -235,14 +235,19 @@ TEST(proc) {
}
}
-static void test_escape_one(const char *s, const char *r) {
- _cleanup_free_ char *b;
+static void test_escape_one(const char *s, const char *expected) {
+ _cleanup_free_ char *b = NULL;
- b = cg_escape(s);
- assert_se(b);
- assert_se(streq(b, r));
+ assert_se(s);
+ assert_se(expected);
+
+ assert_se(cg_escape(s, &b) >= 0);
+ assert_se(streq(b, expected));
assert_se(streq(cg_unescape(b), s));
+
+ assert_se(filename_is_valid(b));
+ assert_se(!cg_needs_escape(s) || b[0] == '_');
}
TEST(escape, .sd_booted = true) {
diff --git a/src/udev/udevd.c b/src/udev/udevd.c
index c56956c378..b3aabcaa1f 100644
--- a/src/udev/udevd.c
+++ b/src/udev/udevd.c
@@ -1799,54 +1799,6 @@ static int parse_argv(int argc, char *argv[]) {
return 1;
}
-static int create_subcgroup(char **ret) {
- _cleanup_free_ char *cgroup = NULL, *subcgroup = NULL;
- int r;
-
- if (getppid() != 1)
- return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Not invoked by PID1.");
-
- r = sd_booted();
- if (r < 0)
- return log_debug_errno(r, "Failed to check if systemd is running: %m");
- if (r == 0)
- return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "systemd is not running.");
-
- /* Get our own cgroup, we regularly kill everything udev has left behind.
- * We only do this on systemd systems, and only if we are directly spawned
- * by PID1. Otherwise we are not guaranteed to have a dedicated cgroup. */
-
- r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
- if (r < 0) {
- if (IN_SET(r, -ENOENT, -ENOMEDIUM))
- return log_debug_errno(r, "Dedicated cgroup not found: %m");
- return log_debug_errno(r, "Failed to get cgroup: %m");
- }
-
- r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup, "trusted.delegate");
- if (r == 0 || (r < 0 && ERRNO_IS_XATTR_ABSENT(r)))
- return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "The cgroup %s is not delegated to us.", cgroup);
- if (r < 0)
- return log_debug_errno(r, "Failed to read trusted.delegate attribute: %m");
-
- /* We are invoked with our own delegated cgroup tree, let's move us one level down, so that we
- * don't collide with the "no processes in inner nodes" rule of cgroups, when the service
- * manager invokes the ExecReload= job in the .control/ subcgroup. */
-
- subcgroup = path_join(cgroup, "/udev");
- if (!subcgroup)
- return log_oom_debug();
-
- r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup, 0);
- if (r < 0)
- return log_debug_errno(r, "Failed to create %s subcgroup: %m", subcgroup);
-
- log_debug("Created %s subcgroup.", subcgroup);
- if (ret)
- *ret = TAKE_PTR(subcgroup);
- return 0;
-}
-
static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) {
_cleanup_(manager_freep) Manager *manager = NULL;
_cleanup_free_ char *cgroup = NULL;
@@ -1854,8 +1806,6 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) {
assert(ret);
- (void) create_subcgroup(&cgroup);
-
manager = new(Manager, 1);
if (!manager)
return log_oom();
@@ -1863,7 +1813,6 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) {
*manager = (Manager) {
.inotify_fd = -EBADF,
.worker_watch = PIPE_EBADF,
- .cgroup = TAKE_PTR(cgroup),
};
r = udev_ctrl_new_from_fd(&manager->ctrl, fd_ctrl);
@@ -1895,6 +1844,14 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) {
manager->log_level = log_get_max_level();
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+ if (r < 0)
+ log_warning_errno(r, "Failed to get cgroup, ignoring: %m");
+ else if (endswith(cgroup, "/udev")) { /* If we are in a subcgroup /udev/ we assume it was delegated to us */
+ log_debug("Running in delegated subcgroup '%s'.", cgroup);
+ manager->cgroup = TAKE_PTR(cgroup);
+ }
+
*ret = TAKE_PTR(manager);
return 0;
diff --git a/test/units/testsuite-19.sh b/test/units/testsuite-19.sh
index 1e705ea72b..20759eaa1f 100755
--- a/test/units/testsuite-19.sh
+++ b/test/units/testsuite-19.sh
@@ -45,6 +45,24 @@ if grep -q cgroup2 /proc/filesystems ; then
# Check that unprivileged delegation works for scopes
test_scope_unpriv_delegation
+ # Verify that DelegateSubgroup= affects ownership correctly
+ U="testsubgroup-$RANDOM.service"
+ systemd-run --wait --unit="$U" -p "DynamicUser=1" -p "Delegate=pids" -p "DelegateSubgroup=foo" \
+ test -w "/sys/fs/cgroup/system.slice/$U" -a \
+ -w "/sys/fs/cgroup/system.slice/$U/foo"
+
+ # Check that for the subgroup also attributes that aren't covered by
+ # regular (i.e. main cgroup) delegation ownership rules are delegated properly
+ if test -f /sys/fs/cgroup/cgroup.max.depth ; then
+ U="testsubgroup-$RANDOM.service"
+ systemd-run --wait --unit="$U" -p "DynamicUser=1" -p "Delegate=pids" -p "DelegateSubgroup=zzz" \
+ test -w "/sys/fs/cgroup/system.slice/$U/zzz/cgroup.max.depth"
+ fi
+
+ # Check that the invoked process itsel is also in the subgroup
+ U="testsubgroup-$RANDOM.service"
+ systemd-run --wait --unit="$U" -p "DynamicUser=1" -p "Delegate=pids" -p "DelegateSubgroup=bar" \
+ grep -q -x -F "0::/system.slice/$U/bar" /proc/self/cgroup
else
echo "Skipping TEST-19-DELEGATE, as the kernel doesn't actually support cgroup v2" >&2
fi
diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in
index e1626b9c87..079d6064f6 100644
--- a/units/systemd-nspawn@.service.in
+++ b/units/systemd-nspawn@.service.in
@@ -25,6 +25,7 @@ RestartForceExitStatus=133
SuccessExitStatus=133
Slice=machine.slice
Delegate=yes
+DelegateSubgroup=supervisor
TasksMax=16384
{{SERVICE_WATCHDOG}}
diff --git a/units/systemd-udevd.service.in b/units/systemd-udevd.service.in
index dfc2a0e341..a0ee9e0a50 100644
--- a/units/systemd-udevd.service.in
+++ b/units/systemd-udevd.service.in
@@ -18,6 +18,7 @@ ConditionPathIsReadWrite=/sys
[Service]
CapabilityBoundingSet=~CAP_SYS_TIME CAP_WAKE_ALARM
Delegate=pids
+DelegateSubgroup=udev
Type=notify-reload
# Note that udev will reset the value internally for its workers
OOMScoreAdjust=-1000
diff --git a/units/user@.service.in b/units/user@.service.in
index 41ed55cb1a..86ab4ffcc6 100644
--- a/units/user@.service.in
+++ b/units/user@.service.in
@@ -22,6 +22,7 @@ ExecStart={{ROOTLIBEXECDIR}}/systemd --user
Slice=user-%i.slice
KillMode=mixed
Delegate=pids memory cpu
+DelegateSubgroup=init.scope
TasksMax=infinity
TimeoutStopSec={{ DEFAULT_USER_TIMEOUT_SEC*4//3 }}s
KeyringMode=inherit