diff options
-rw-r--r-- | docs/CGROUP_DELEGATION.md | 4 | ||||
-rw-r--r-- | man/org.freedesktop.systemd1.xml | 28 | ||||
-rw-r--r-- | man/systemd.resource-control.xml | 32 | ||||
-rw-r--r-- | src/basic/cgroup-util.c | 99 | ||||
-rw-r--r-- | src/basic/cgroup-util.h | 3 | ||||
-rw-r--r-- | src/core/cgroup.c | 47 | ||||
-rw-r--r-- | src/core/cgroup.h | 3 | ||||
-rw-r--r-- | src/core/dbus-cgroup.c | 28 | ||||
-rw-r--r-- | src/core/execute.c | 53 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.in | 1 | ||||
-rw-r--r-- | src/core/load-fragment.c | 36 | ||||
-rw-r--r-- | src/core/load-fragment.h | 1 | ||||
-rw-r--r-- | src/core/unit-printf.c | 20 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 21 | ||||
-rw-r--r-- | src/shared/bus-unit-util.c | 3 | ||||
-rw-r--r-- | src/shared/cgroup-setup.c | 76 | ||||
-rw-r--r-- | src/shared/cgroup-setup.h | 1 | ||||
-rw-r--r-- | src/test/test-cgroup-util.c | 15 | ||||
-rw-r--r-- | src/udev/udevd.c | 59 | ||||
-rwxr-xr-x | test/units/testsuite-19.sh | 18 | ||||
-rw-r--r-- | units/systemd-nspawn@.service.in | 1 | ||||
-rw-r--r-- | units/systemd-udevd.service.in | 1 | ||||
-rw-r--r-- | units/user@.service.in | 1 |
23 files changed, 398 insertions, 153 deletions
diff --git a/docs/CGROUP_DELEGATION.md b/docs/CGROUP_DELEGATION.md index f5509fb833..4210a75767 100644 --- a/docs/CGROUP_DELEGATION.md +++ b/docs/CGROUP_DELEGATION.md @@ -271,7 +271,9 @@ your service has any of these four settings set, you must be prepared that a means that your service code should have moved itself further down the cgroup tree by the time it notifies the service manager about start-up readiness, so that the service's main cgroup is definitely an inner node by the time the -service manager might start `ExecStartPost=`.) +service manager might start `ExecStartPost=`. Starting with systemd 254 you may +also use `DelegateSubgroup=` to let the service manager put your initial +service process into a subgroup right away.) (Also note, if you intend to use "threaded" cgroups — as added in Linux 4.14 —, then you should do that *two* levels down from the main service cgroup your diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index e462c60636..f2e892671a 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2713,6 +2713,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -3942,6 +3944,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/> + <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/> + <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/> <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/> @@ -4544,6 +4548,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { memory controller is reached. It will take into consideration limits on all parent slices, other than the limits set on the unit itself.</para> + <para><varname>DelegateSubgroup</varname> contains the cgroup subgroup to place invoked unit processes + in. As configured by the option of the same name in unit files. This is set to the empty string when it + does not apply or no subgroup has been configured.</para> + <para><varname>RuntimeDirectorySymlink</varname>, <varname>StateDirectorySymlink</varname>, <varname>CacheDirectorySymlink</varname> and <varname>LogsDirectorySymlink</varname> respectively implement the destination parameter of the unit files settings <varname>RuntimeDirectory</varname>, @@ -4715,6 +4723,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -5936,6 +5946,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/> + <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/> + <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/> <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/> @@ -6588,6 +6600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -7655,6 +7669,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/> + <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/> + <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/> <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/> @@ -8434,6 +8450,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -9473,6 +9491,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/> + <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/> + <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/> <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/> @@ -10111,6 +10131,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -10456,6 +10478,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/> + <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/> + <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/> <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/> @@ -10656,6 +10680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DelegateControllers = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DelegateSubgroup = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CPUAccounting = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUWeight = ...; @@ -11051,6 +11077,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { <variablelist class="dbus-property" generated="True" extra-ref="DelegateControllers"/> + <variablelist class="dbus-property" generated="True" extra-ref="DelegateSubgroup"/> + <variablelist class="dbus-property" generated="True" extra-ref="CPUAccounting"/> <variablelist class="dbus-property" generated="True" extra-ref="CPUWeight"/> diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index f4e4a492a0..610c11feb3 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1148,10 +1148,11 @@ DeviceAllow=/dev/loop-control <term><varname>Delegate=</varname></term> <listitem> - <para>Turns on delegation of further resource control partitioning to processes of the unit. Units where this - is enabled may create and manage their own private subhierarchy of control groups below the control group of - the unit itself. For unprivileged services (i.e. those using the <varname>User=</varname> setting) the unit's - control group will be made accessible to the relevant user.</para> + <para>Turns on delegation of further resource control partitioning to processes of the unit. Units + where this is enabled may create and manage their own private subhierarchy of control groups below + the control group of the unit itself. For unprivileged services (i.e. those using the + <varname>User=</varname> setting) the unit's control group will be made accessible to the relevant + user.</para> <para>When enabled the service manager will refrain from manipulating control groups or moving processes below the unit's control group, so that a clear concept of ownership is established: the @@ -1189,6 +1190,29 @@ DeviceAllow=/dev/loop-control </varlistentry> <varlistentry> + <term><varname>DelegateSubgroup=</varname></term> + + <listitem> + <para>Place unit processes in the specified subgroup of the unit's control group. Takes a valid + control group name (not a path!) as parameter, or an empty string to turn this feature + off. Defaults to off. The control group name must be usable as filename and avoid conflicts with + the kernel's control group attribute files (i.e. <filename>cgroup.procs</filename> is not an + acceptable name, since the kernel exposes a native control group attribute file by that name). This + option has no effect unless control group delegation is turned on via <varname>Delegate=</varname>, + see above. Note that this setting only applies to "main" processes of a unit, i.e. for services to + <varname>ExecStart=</varname>, but not for <varname>ExecReload=</varname> and similar. If + delegation is enabled, the latter are always placed inside a subgroup named + <filename>.control</filename>. The specified subgroup is automatically created (and potentially + ownership is passed to the unit's configured user/group) when a process is started in it.</para> + + <para>This option is useful to avoid manually moving the invoked process into a subgroup after it + has been started. Since no processes should live in inner nodes of the control group tree it's + almost always necessary to run the main ("supervising") process of a unit that has delegation + turned on in a subgroup.</para> + </listitem> + </varlistentry> + + <varlistentry> <term><varname>DisableControllers=</varname></term> <listitem> diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 90877c9fa1..21e2255dae 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -1554,52 +1554,64 @@ int cg_pid_get_user_slice(pid_t pid, char **slice) { return cg_path_get_user_slice(cgroup, slice); } -char *cg_escape(const char *p) { - bool need_prefix = false; - - /* This implements very minimal escaping for names to be used - * as file names in the cgroup tree: any name which might - * conflict with a kernel name or is prefixed with '_' is - * prefixed with a '_'. That way, when reading cgroup names it - * is sufficient to remove a single prefixing underscore if - * there is one. */ - - /* The return value of this function (unlike cg_unescape()) - * needs free()! */ - - if (IN_SET(p[0], 0, '_', '.') || - STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") || - startswith(p, "cgroup.")) - need_prefix = true; - else { - const char *dot; +bool cg_needs_escape(const char *p) { - dot = strrchr(p, '.'); - if (dot) { - CGroupController c; - size_t l = dot - p; + /* Checks if the specified path is a valid cgroup name by our rules, or if it must be escaped. Note + * that we consider escaped cgroup names invalid here, as they need to be escaped a second time if + * they shall be used. Also note that various names cannot be made valid by escaping even if we + * return true here (because too long, or contain the forbidden character "/"). */ - for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { - const char *n; + if (!filename_is_valid(p)) + return true; - n = cgroup_controller_to_string(c); + if (IN_SET(p[0], '_', '.')) + return true; - if (l != strlen(n)) - continue; + if (STR_IN_SET(p, "notify_on_release", "release_agent", "tasks")) + return true; - if (memcmp(p, n, l) != 0) - continue; + if (startswith(p, "cgroup.")) + return true; - need_prefix = true; - break; - } - } + for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + const char *q; + + q = startswith(p, cgroup_controller_to_string(c)); + if (!q) + continue; + + if (q[0] == '.') + return true; } - if (need_prefix) - return strjoin("_", p); + return false; +} + +int cg_escape(const char *p, char **ret) { + _cleanup_free_ char *n = NULL; + + /* This implements very minimal escaping for names to be used as file names in the cgroup tree: any + * name which might conflict with a kernel name or is prefixed with '_' is prefixed with a '_'. That + * way, when reading cgroup names it is sufficient to remove a single prefixing underscore if there + * is one. */ + + /* The return value of this function (unlike cg_unescape()) needs free()! */ - return strdup(p); + if (cg_needs_escape(p)) { + n = strjoin("_", p); + if (!n) + return -ENOMEM; + + if (!filename_is_valid(n)) /* became invalid due to the prefixing? Or contained things like a slash that cannot be fixed by prefixing? */ + return -EINVAL; + } else { + n = strdup(p); + if (!n) + return -ENOMEM; + } + + *ret = TAKE_PTR(n); + return 0; } char *cg_unescape(const char *p) { @@ -1698,9 +1710,9 @@ int cg_slice_to_path(const char *unit, char **ret) { if (!unit_name_is_valid(n, UNIT_NAME_PLAIN)) return -EINVAL; - escaped = cg_escape(n); - if (!escaped) - return -ENOMEM; + r = cg_escape(n, &escaped); + if (r < 0) + return r; if (!strextend(&s, escaped, "/")) return -ENOMEM; @@ -1708,15 +1720,14 @@ int cg_slice_to_path(const char *unit, char **ret) { dash = strchr(dash+1, '-'); } - e = cg_escape(unit); - if (!e) - return -ENOMEM; + r = cg_escape(unit, &e); + if (r < 0) + return r; if (!strextend(&s, e)) return -ENOMEM; *ret = TAKE_PTR(s); - return 0; } diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index 5de000c4ce..9b30ae0396 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -279,7 +279,8 @@ int cg_pid_get_user_slice(pid_t pid, char **slice); int cg_path_decode_unit(const char *cgroup, char **unit); -char *cg_escape(const char *p); +bool cg_needs_escape(const char *p); +int cg_escape(const char *p, char **ret); char *cg_unescape(const char *p) _pure_; bool cg_controller_is_valid(const char *p); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 8b178ea29f..4ec5dfa587 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -293,6 +293,8 @@ void cgroup_context_done(CGroupContext *c) { cpu_set_reset(&c->startup_cpuset_cpus); cpu_set_reset(&c->cpuset_mems); cpu_set_reset(&c->startup_cpuset_mems); + + c->delegate_subgroup = mfree(c->delegate_subgroup); } static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { @@ -570,6 +572,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_preference_to_string(c->moom_preference), prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch)); + if (c->delegate_subgroup) + fprintf(f, "%sDelegateSubgroup: %s\n", + prefix, c->delegate_subgroup); + if (c->memory_pressure_threshold_usec != USEC_INFINITY) fprintf(f, "%sMemoryPressureThresholdSec: %s\n", prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); @@ -2087,28 +2093,37 @@ static const char *migrate_callback(CGroupMask mask, void *userdata) { return strempty(unit_get_realized_cgroup_path(userdata, mask)); } -char *unit_default_cgroup_path(const Unit *u) { - _cleanup_free_ char *escaped = NULL, *slice_path = NULL; - Unit *slice; +int unit_default_cgroup_path(const Unit *u, char **ret) { + _cleanup_free_ char *p = NULL; int r; assert(u); + assert(ret); if (unit_has_name(u, SPECIAL_ROOT_SLICE)) - return strdup(u->manager->cgroup_root); + p = strdup(u->manager->cgroup_root); + else { + _cleanup_free_ char *escaped = NULL, *slice_path = NULL; + Unit *slice; - slice = UNIT_GET_SLICE(u); - if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) { - r = cg_slice_to_path(slice->id, &slice_path); + slice = UNIT_GET_SLICE(u); + if (slice && !unit_has_name(slice, SPECIAL_ROOT_SLICE)) { + r = cg_slice_to_path(slice->id, &slice_path); + if (r < 0) + return r; + } + + r = cg_escape(u->id, &escaped); if (r < 0) - return NULL; - } + return r; - escaped = cg_escape(u->id); - if (!escaped) - return NULL; + p = path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped); + } + if (!p) + return -ENOMEM; - return path_join(empty_to_root(u->manager->cgroup_root), slice_path, escaped); + *ret = TAKE_PTR(p); + return 0; } int unit_set_cgroup_path(Unit *u, const char *path) { @@ -2264,9 +2279,9 @@ int unit_pick_cgroup_path(Unit *u) { if (!UNIT_HAS_CGROUP_CONTEXT(u)) return -EINVAL; - path = unit_default_cgroup_path(u); - if (!path) - return log_oom(); + r = unit_default_cgroup_path(u, &path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate default cgroup path: %m"); r = unit_set_cgroup_path(u, path); if (r == -EEXIST) diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 8e1f093901..bbbf9408cc 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -133,6 +133,7 @@ struct CGroupContext { bool delegate; CGroupMask delegate_controllers; CGroupMask disable_controllers; + char *delegate_subgroup; /* For unified hierarchy */ uint64_t cpu_weight; @@ -290,7 +291,7 @@ void unit_invalidate_cgroup_members_masks(Unit *u); void unit_add_family_to_cgroup_realize_queue(Unit *u); const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); -char *unit_default_cgroup_path(const Unit *u); +int unit_default_cgroup_path(const Unit *u, char **ret); int unit_set_cgroup_path(Unit *u, const char *path); int unit_pick_cgroup_path(Unit *u); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 3a02fcbdb1..682ad5edd5 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -435,6 +435,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), SD_BUS_PROPERTY("DelegateControllers", "as", property_get_delegate_controllers, 0, 0), + SD_BUS_PROPERTY("DelegateSubgroup", "s", NULL, offsetof(CGroupContext, delegate_subgroup), 0), SD_BUS_PROPERTY("CPUAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, cpu_accounting), 0), SD_BUS_PROPERTY("CPUWeight", "t", NULL, offsetof(CGroupContext, cpu_weight), 0), SD_BUS_PROPERTY("StartupCPUWeight", "t", NULL, offsetof(CGroupContext, startup_cpu_weight), 0), @@ -536,6 +537,33 @@ static int bus_cgroup_set_transient_property( return 1; + } else if (streq(name, "DelegateSubgroup")) { + const char *s; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!isempty(s) && cg_needs_escape(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid control group name: %s", s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + if (isempty(s)) + c->delegate_subgroup = mfree(c->delegate_subgroup); + else { + r = free_and_strdup_warn(&c->delegate_subgroup, s); + if (r < 0) + return r; + } + + unit_write_settingf(u, flags, name, "DelegateSubgroup=%s", s); + } + + return 1; + } else if (STR_IN_SET(name, "DelegateControllers", "DisableControllers")) { CGroupMask mask = 0; diff --git a/src/core/execute.c b/src/core/execute.c index 60f7a6439c..b32e341b61 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4178,8 +4178,12 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p return 0; } -static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) { - bool using_subcgroup; +static int exec_parameters_get_cgroup_path( + const ExecParameters *params, + const CGroupContext *c, + char **ret) { + + const char *subgroup = NULL; char *p; assert(params); @@ -4197,16 +4201,22 @@ static int exec_parameters_get_cgroup_path(const ExecParameters *params, char ** * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP * flag, which is only passed for the former statements, not for the latter. */ - using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL); - if (using_subcgroup) - p = path_join(params->cgroup_path, ".control"); + if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) { + if (FLAGS_SET(params->flags, EXEC_IS_CONTROL)) + subgroup = ".control"; + else + subgroup = c->delegate_subgroup; + } + + if (subgroup) + p = path_join(params->cgroup_path, subgroup); else p = strdup(params->cgroup_path); if (!p) return -ENOMEM; *ret = p; - return using_subcgroup; + return !!subgroup; } static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) { @@ -4705,7 +4715,7 @@ static int exec_child( if (params->cgroup_path) { _cleanup_free_ char *p = NULL; - r = exec_parameters_get_cgroup_path(params, &p); + r = exec_parameters_get_cgroup_path(params, cgroup_context, &p); if (r < 0) { *exit_status = EXIT_CGROUP; return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); @@ -4880,11 +4890,26 @@ static int exec_child( * touch a single hierarchy too. */ if (params->flags & EXEC_CGROUP_DELEGATE) { + _cleanup_free_ char *p = NULL; + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); if (r < 0) { *exit_status = EXIT_CGROUP; return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m"); } + + r = exec_parameters_get_cgroup_path(params, cgroup_context, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); + } + if (r > 0) { + r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m"); + } + } } if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) { @@ -5635,18 +5660,16 @@ int exec_spawn(Unit *unit, log_command_line(unit, "About to execute", command->path, command->argv); if (params->cgroup_path) { - r = exec_parameters_get_cgroup_path(params, &subcgroup_path); + r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path); if (r < 0) return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); - if (r > 0) { /* We are using a child cgroup */ + if (r > 0) { + /* If there's a subcgroup, then let's create it here now (the main cgroup was already + * realized by the unit logic) */ + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); if (r < 0) - return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); - - /* Normally we would not propagate the xattrs to children but since we created this - * sub-cgroup internally we should do it. */ - cgroup_oomd_xattr_apply(unit, subcgroup_path); - cgroup_log_xattr_apply(unit, subcgroup_path); + return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path); } } diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 8a2823b075..110bccb7ad 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -237,6 +237,7 @@ {{type}}.TasksAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.tasks_accounting) {{type}}.TasksMax, config_parse_tasks_max, 0, offsetof({{type}}, cgroup_context.tasks_max) {{type}}.Delegate, config_parse_delegate, 0, offsetof({{type}}, cgroup_context) +{{type}}.DelegateSubgroup, config_parse_delegate_subgroup , 0, offsetof({{type}}, cgroup_context) {{type}}.DisableControllers, config_parse_disable_controllers, 0, offsetof({{type}}, cgroup_context) {{type}}.IPAccounting, config_parse_bool, 0, offsetof({{type}}, cgroup_context.ip_accounting) {{type}}.IPAddressAllow, config_parse_in_addr_prefixes, AF_UNSPEC, offsetof({{type}}, cgroup_context.ip_address_allow) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 581a051d46..ebfe98d7cc 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -4029,6 +4029,42 @@ int config_parse_delegate( return 0; } +int config_parse_delegate_subgroup( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = ASSERT_PTR(data); + UnitType t; + + t = unit_name_to_type(unit); + assert(t >= 0); + + if (!unit_vtable[t]->can_delegate) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "DelegateSubgroup= setting not supported for this unit type, ignoring."); + return 0; + } + + if (isempty(rvalue)) { + c->delegate_subgroup = mfree(c->delegate_subgroup); + return 0; + } + + if (cg_needs_escape(rvalue)) { /* Insist that specified names don't need escaping */ + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid control group name, ignoring: %s", rvalue); + return 0; + } + + return free_and_strdup_warn(&c->delegate_subgroup, rvalue); +} + int config_parse_managed_oom_mode( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index a38d697338..59f02a3207 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -83,6 +83,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares); CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit); CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); CONFIG_PARSER_PROTOTYPE(config_parse_delegate); +CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference); diff --git a/src/core/unit-printf.c b/src/core/unit-printf.c index 1b267d4fdd..3977082cc1 100644 --- a/src/core/unit-printf.c +++ b/src/core/unit-printf.c @@ -86,19 +86,21 @@ static void bad_specifier(const Unit *u, char specifier) { static int specifier_cgroup(char specifier, const void *data, const char *root, const void *userdata, char **ret) { const Unit *u = ASSERT_PTR(userdata); - char *n; bad_specifier(u, specifier); - if (u->cgroup_path) + if (u->cgroup_path) { + char *n; + n = strdup(u->cgroup_path); - else - n = unit_default_cgroup_path(u); - if (!n) - return -ENOMEM; + if (!n) + return -ENOMEM; - *ret = n; - return 0; + *ret = n; + return 0; + } + + return unit_default_cgroup_path(u, ret); } static int specifier_cgroup_root(char specifier, const void *data, const char *root, const void *userdata, char **ret) { @@ -126,7 +128,7 @@ static int specifier_cgroup_slice(char specifier, const void *data, const char * if (slice->cgroup_path) n = strdup(slice->cgroup_path); else - n = unit_default_cgroup_path(slice); + return unit_default_cgroup_path(slice, ret); } else n = strdup(u->manager->cgroup_root); if (!n) diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 0deb4ebb30..a9d36627a8 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -141,9 +141,9 @@ finish: } int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) { - _cleanup_free_ char *cgroup = NULL; + _cleanup_free_ char *cgroup = NULL, *payload = NULL; CGroupMask supported; - const char *payload; + char *e; int r; assert(pid > 1); @@ -174,15 +174,26 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) if (r < 0) return log_error_errno(r, "Failed to get our control group: %m"); - payload = strjoina(cgroup, "/payload"); + /* If the service manager already placed us in the supervisor cgroup, let's handle that. */ + e = endswith(cgroup, "/supervisor"); + if (e) + *e = 0; /* chop off, we want the main path delegated to us */ + + payload = path_join(cgroup, "payload"); + if (!payload) + return log_oom(); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); if (r < 0) return log_error_errno(r, "Failed to create %s subcgroup: %m", payload); if (keep_unit) { - const char *supervisor; + _cleanup_free_ char *supervisor = NULL; + + supervisor = path_join(cgroup, "supervisor"); + if (!supervisor) + return log_oom(); - supervisor = strjoina(cgroup, "/supervisor"); r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0); if (r < 0) return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index ebbd1f7f28..a321179609 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -461,7 +461,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons "ManagedOOMSwap", "ManagedOOMMemoryPressure", "ManagedOOMPreference", - "MemoryPressureWatch")) + "MemoryPressureWatch", + "DelegateSubgroup")) return bus_append_string(m, field, eq); if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) { diff --git a/src/shared/cgroup-setup.c b/src/shared/cgroup-setup.c index 65be851014..6f3f9dd939 100644 --- a/src/shared/cgroup-setup.c +++ b/src/shared/cgroup-setup.c @@ -483,6 +483,82 @@ int cg_set_access( return 0; } +struct access_callback_data { + uid_t uid; + gid_t gid; + int error; +}; + +static int access_callback( + RecurseDirEvent event, + const char *path, + int dir_fd, + int inode_fd, + const struct dirent *de, + const struct statx *sx, + void *userdata) { + + struct access_callback_data *d = ASSERT_PTR(userdata); + + if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY)) + return RECURSE_DIR_CONTINUE; + + assert(inode_fd >= 0); + + /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */ + if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) { + log_debug_errno(errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path)); + + if (d->error == 0) /* Return last error to caller */ + d->error = errno; + } + + return RECURSE_DIR_CONTINUE; +} + +int cg_set_access_recursive( + const char *controller, + const char *path, + uid_t uid, + gid_t gid) { + + _cleanup_close_ int fd = -EBADF; + _cleanup_free_ char *fs = NULL; + int r; + + /* A recursive version of cg_set_access(). But note that this one changes ownership of *all* files, + * not just the allowlist that cg_set_access() uses. Use cg_set_access() on the cgroup you want to + * delegate, and cg_set_access_recursive() for any subcrgoups you might want to create below it. */ + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY); + if (fd < 0) + return -errno; + + struct access_callback_data d = { + .uid = uid, + .gid = gid, + }; + + r = recurse_dir(fd, + fs, + /* statx_mask= */ 0, + /* n_depth_max= */ UINT_MAX, + RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL, + access_callback, + &d); + if (r < 0) + return r; + + return -d.error; +} + int cg_migrate( const char *cfrom, const char *pfrom, diff --git a/src/shared/cgroup-setup.h b/src/shared/cgroup-setup.h index 13f836bd90..1b6f0716c6 100644 --- a/src/shared/cgroup-setup.h +++ b/src/shared/cgroup-setup.h @@ -24,6 +24,7 @@ int cg_attach_fallback(const char *controller, const char *path, pid_t pid); int cg_create_and_attach(const char *controller, const char *path, pid_t pid); int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid); +int cg_set_access_recursive(const char *controller, const char *path, uid_t uid, gid_t gid); int cg_migrate(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags); int cg_migrate_recursive(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags); diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c index cdf911926c..4f430a1df2 100644 --- a/src/test/test-cgroup-util.c +++ b/src/test/test-cgroup-util.c @@ -235,14 +235,19 @@ TEST(proc) { } } -static void test_escape_one(const char *s, const char *r) { - _cleanup_free_ char *b; +static void test_escape_one(const char *s, const char *expected) { + _cleanup_free_ char *b = NULL; - b = cg_escape(s); - assert_se(b); - assert_se(streq(b, r)); + assert_se(s); + assert_se(expected); + + assert_se(cg_escape(s, &b) >= 0); + assert_se(streq(b, expected)); assert_se(streq(cg_unescape(b), s)); + + assert_se(filename_is_valid(b)); + assert_se(!cg_needs_escape(s) || b[0] == '_'); } TEST(escape, .sd_booted = true) { diff --git a/src/udev/udevd.c b/src/udev/udevd.c index c56956c378..b3aabcaa1f 100644 --- a/src/udev/udevd.c +++ b/src/udev/udevd.c @@ -1799,54 +1799,6 @@ static int parse_argv(int argc, char *argv[]) { return 1; } -static int create_subcgroup(char **ret) { - _cleanup_free_ char *cgroup = NULL, *subcgroup = NULL; - int r; - - if (getppid() != 1) - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Not invoked by PID1."); - - r = sd_booted(); - if (r < 0) - return log_debug_errno(r, "Failed to check if systemd is running: %m"); - if (r == 0) - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "systemd is not running."); - - /* Get our own cgroup, we regularly kill everything udev has left behind. - * We only do this on systemd systems, and only if we are directly spawned - * by PID1. Otherwise we are not guaranteed to have a dedicated cgroup. */ - - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); - if (r < 0) { - if (IN_SET(r, -ENOENT, -ENOMEDIUM)) - return log_debug_errno(r, "Dedicated cgroup not found: %m"); - return log_debug_errno(r, "Failed to get cgroup: %m"); - } - - r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup, "trusted.delegate"); - if (r == 0 || (r < 0 && ERRNO_IS_XATTR_ABSENT(r))) - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "The cgroup %s is not delegated to us.", cgroup); - if (r < 0) - return log_debug_errno(r, "Failed to read trusted.delegate attribute: %m"); - - /* We are invoked with our own delegated cgroup tree, let's move us one level down, so that we - * don't collide with the "no processes in inner nodes" rule of cgroups, when the service - * manager invokes the ExecReload= job in the .control/ subcgroup. */ - - subcgroup = path_join(cgroup, "/udev"); - if (!subcgroup) - return log_oom_debug(); - - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup, 0); - if (r < 0) - return log_debug_errno(r, "Failed to create %s subcgroup: %m", subcgroup); - - log_debug("Created %s subcgroup.", subcgroup); - if (ret) - *ret = TAKE_PTR(subcgroup); - return 0; -} - static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) { _cleanup_(manager_freep) Manager *manager = NULL; _cleanup_free_ char *cgroup = NULL; @@ -1854,8 +1806,6 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) { assert(ret); - (void) create_subcgroup(&cgroup); - manager = new(Manager, 1); if (!manager) return log_oom(); @@ -1863,7 +1813,6 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) { *manager = (Manager) { .inotify_fd = -EBADF, .worker_watch = PIPE_EBADF, - .cgroup = TAKE_PTR(cgroup), }; r = udev_ctrl_new_from_fd(&manager->ctrl, fd_ctrl); @@ -1895,6 +1844,14 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) { manager->log_level = log_get_max_level(); + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (r < 0) + log_warning_errno(r, "Failed to get cgroup, ignoring: %m"); + else if (endswith(cgroup, "/udev")) { /* If we are in a subcgroup /udev/ we assume it was delegated to us */ + log_debug("Running in delegated subcgroup '%s'.", cgroup); + manager->cgroup = TAKE_PTR(cgroup); + } + *ret = TAKE_PTR(manager); return 0; diff --git a/test/units/testsuite-19.sh b/test/units/testsuite-19.sh index 1e705ea72b..20759eaa1f 100755 --- a/test/units/testsuite-19.sh +++ b/test/units/testsuite-19.sh @@ -45,6 +45,24 @@ if grep -q cgroup2 /proc/filesystems ; then # Check that unprivileged delegation works for scopes test_scope_unpriv_delegation + # Verify that DelegateSubgroup= affects ownership correctly + U="testsubgroup-$RANDOM.service" + systemd-run --wait --unit="$U" -p "DynamicUser=1" -p "Delegate=pids" -p "DelegateSubgroup=foo" \ + test -w "/sys/fs/cgroup/system.slice/$U" -a \ + -w "/sys/fs/cgroup/system.slice/$U/foo" + + # Check that for the subgroup also attributes that aren't covered by + # regular (i.e. main cgroup) delegation ownership rules are delegated properly + if test -f /sys/fs/cgroup/cgroup.max.depth ; then + U="testsubgroup-$RANDOM.service" + systemd-run --wait --unit="$U" -p "DynamicUser=1" -p "Delegate=pids" -p "DelegateSubgroup=zzz" \ + test -w "/sys/fs/cgroup/system.slice/$U/zzz/cgroup.max.depth" + fi + + # Check that the invoked process itsel is also in the subgroup + U="testsubgroup-$RANDOM.service" + systemd-run --wait --unit="$U" -p "DynamicUser=1" -p "Delegate=pids" -p "DelegateSubgroup=bar" \ + grep -q -x -F "0::/system.slice/$U/bar" /proc/self/cgroup else echo "Skipping TEST-19-DELEGATE, as the kernel doesn't actually support cgroup v2" >&2 fi diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index e1626b9c87..079d6064f6 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -25,6 +25,7 @@ RestartForceExitStatus=133 SuccessExitStatus=133 Slice=machine.slice Delegate=yes +DelegateSubgroup=supervisor TasksMax=16384 {{SERVICE_WATCHDOG}} diff --git a/units/systemd-udevd.service.in b/units/systemd-udevd.service.in index dfc2a0e341..a0ee9e0a50 100644 --- a/units/systemd-udevd.service.in +++ b/units/systemd-udevd.service.in @@ -18,6 +18,7 @@ ConditionPathIsReadWrite=/sys [Service] CapabilityBoundingSet=~CAP_SYS_TIME CAP_WAKE_ALARM Delegate=pids +DelegateSubgroup=udev Type=notify-reload # Note that udev will reset the value internally for its workers OOMScoreAdjust=-1000 diff --git a/units/user@.service.in b/units/user@.service.in index 41ed55cb1a..86ab4ffcc6 100644 --- a/units/user@.service.in +++ b/units/user@.service.in @@ -22,6 +22,7 @@ ExecStart={{ROOTLIBEXECDIR}}/systemd --user Slice=user-%i.slice KillMode=mixed Delegate=pids memory cpu +DelegateSubgroup=init.scope TasksMax=infinity TimeoutStopSec={{ DEFAULT_USER_TIMEOUT_SEC*4//3 }}s KeyringMode=inherit |