summaryrefslogtreecommitdiff
path: root/src/oom
diff options
context:
space:
mode:
author Anita Zhang <the.anitazha@gmail.com> 2021-02-09 01:47:34 -0800
committer Anita Zhang <the.anitazha@gmail.com> 2021-02-09 02:27:40 -0800
commit 59331b8e292a93bc7a03a51fe54cb65a4257e894 (patch)
tree 3b420e7a81168b54d8a6578fd3ea52b593c2a4af /src/oom
parent 242d75bdaaa6f222a47148f8a83cc425d6ceefb3 (diff)
download systemd-59331b8e292a93bc7a03a51fe54cb65a4257e894.tar.gz
oom: implement avoid/omit xattr support
There may be situations where a cgroup should be protected from killing or deprioritized as a candidate. In FB oomd, xattrs are used to bias oomd away from supervisor cgroups and towards worker cgroups in container tasks. On desktops this can be used to protect important units with unpredictable resource consumption. The patch allows systemd-oomd to understand two xattrs: "user.oomd_avoid" and "user.oomd_omit". If systemd-oomd sees one of these xattrs set to 1 on a candidate cgroup (i.e. while attempting to kill something) AND the cgroup is owned by root, it will either deprioritize the cgroup as a candidate (avoid) or remove it completely as a candidate (omit). Usage is restricted to root-owned cgroups to prevent situations where an unprivileged user can set their own cgroups lower in the kill priority than another user's (and to prevent unprivileged users from omitting their units from systemd-oomd killing).
Diffstat (limited to 'src/oom')
-rw-r--r-- src/oom/oomd-util.c 32
-rw-r--r-- src/oom/oomd-util.h 15
-rw-r--r-- src/oom/test-oomd-util.c 60
3 files changed, 98 insertions, 9 deletions
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
index fa8b8b70b1..9dd9b17c6d 100644
--- a/src/oom/oomd-util.c
+++ b/src/oom/oomd-util.c
@@ -3,7 +3,6 @@
#include <sys/xattr.h>
#include <unistd.h>
-#include "cgroup-util.h"
#include "fd-util.h"
#include "format-util.h"
#include "oomd-util.h"
@@ -159,7 +158,8 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
return -ENOMEM;
HASHMAP_FOREACH(item, h) {
- if (item->path && prefix && !path_startswith(item->path, prefix))
+ /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
+ if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT)
continue;
sorted[k++] = item;
@@ -219,9 +219,10 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
return r;
for (int i = 0; i < r; i++) {
- /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */
+ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */
+ /* Don't break since there might be "avoid" cgroups at the end. */
if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
- break;
+ continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
if (r > 0 || r == -ENOMEM)
@@ -244,8 +245,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
/* Try to kill cgroups with non-zero swap usage until we either succeed in
* killing or we get to a cgroup with no swap usage. */
for (int i = 0; i < r; i++) {
+ /* Skip over cgroups with no resource usage. Don't break since there might be "avoid"
+ * cgroups at the end. */
if (sorted[i]->swap_usage == 0)
- break;
+ continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
if (r > 0 || r == -ENOMEM)
@@ -259,6 +262,7 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *p = NULL, *val = NULL;
bool is_root;
+ uid_t uid;
int r;
assert(path);
@@ -269,6 +273,7 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
return -ENOMEM;
is_root = empty_or_root(path);
+ ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
if (r < 0)
@@ -278,6 +283,23 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
if (r < 0)
return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
+ r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid);
+ if (r < 0)
+ log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
+ else if (uid == 0) {
+ /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
+ * as an optional feature of systemd-oomd (and the system might not even support them). */
+ r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid");
+ if (r == -ENOMEM)
+ return r;
+ ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
+
+ r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit");
+ if (r == -ENOMEM)
+ return r;
+ ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
+ }
+
if (is_root) {
r = procfs_memory_get_used(&ctx->current_memory_usage);
if (r < 0)
diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
index 650b4efb9c..bffccf75da 100644
--- a/src/oom/oomd-util.h
+++ b/src/oom/oomd-util.h
@@ -3,6 +3,7 @@
#include <stdbool.h>
+#include "cgroup-util.h"
#include "hashmap.h"
#include "psi-util.h"
@@ -29,6 +30,8 @@ struct OomdCGroupContext {
uint64_t last_pgscan;
uint64_t pgscan;
+ ManagedOOMPreference preference;
+
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
loadavg_t mem_pressure_limit;
usec_t mem_pressure_duration_usec;
@@ -61,12 +64,18 @@ bool oomd_memory_reclaim(Hashmap *h);
/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
+/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
+ * (after the smallest values). */
static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
int r;
assert(c1);
assert(c2);
+ r = CMP((*c1)->preference, (*c2)->preference);
+ if (r != 0)
+ return r;
+
r = CMP((*c2)->pgscan, (*c1)->pgscan);
if (r != 0)
return r;
@@ -75,9 +84,15 @@ static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1,
}
static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+ int r;
+
assert(c1);
assert(c2);
+ r = CMP((*c1)->preference, (*c2)->preference);
+ if (r != 0)
+ return r;
+
return CMP((*c2)->swap_usage, (*c1)->swap_usage);
}
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
index a1fe78806a..49a02f9424 100644
--- a/src/oom/test-oomd-util.c
+++ b/src/oom/test-oomd-util.c
@@ -89,6 +89,8 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *cgroup = NULL;
OomdCGroupContext *c1, *c2;
+ bool test_xattrs;
+ int r;
if (geteuid() != 0)
return (void) log_tests_skipped("not root");
@@ -101,6 +103,16 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
+ /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities
+ * so skip the xattr portions of the test. */
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0);
+ test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r);
+
+ if (test_xattrs) {
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0);
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+ }
+
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
assert_se(streq(ctx->path, cgroup));
@@ -110,12 +122,28 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(ctx->swap_usage == 0);
assert_se(ctx->last_pgscan == 0);
assert_se(ctx->pgscan == 0);
+ /* omit takes precedence over avoid when both are set to true */
+ if (test_xattrs)
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT);
+ else
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
+ ctx = oomd_cgroup_context_free(ctx);
+
+ /* also check when only avoid is set to true */
+ if (test_xattrs) {
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "0", 1, 0) >= 0);
+ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
+ }
+ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+ if (test_xattrs)
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID);
ctx = oomd_cgroup_context_free(ctx);
/* Test the root cgroup */
assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
assert_se(streq(ctx->path, "/"));
assert_se(ctx->current_memory_usage > 0);
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
/* Test hashmap inserts */
assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
@@ -137,6 +165,14 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(c2->last_pgscan == 5555);
assert_se(c2->mem_pressure_limit == 6789);
assert_se(c2->last_hit_mem_pressure_limit == 42);
+
+ /* Assert that avoid/omit are not set if the cgroup is not owned by root */
+ if (test_xattrs) {
+ ctx = oomd_cgroup_context_free(ctx);
+ assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 0) >= 0);
+ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+ assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
+ }
}
static void test_oomd_system_context_acquire(void) {
@@ -287,9 +323,11 @@ static void test_oomd_sort_cgroups(void) {
char **paths = STRV_MAKE("/herp.slice",
"/herp.slice/derp.scope",
"/herp.slice/derp.scope/sheep.service",
- "/zupa.slice");
+ "/zupa.slice",
+ "/omitted.slice",
+ "/avoid.slice");
- OomdCGroupContext ctx[4] = {
+ OomdCGroupContext ctx[6] = {
{ .path = paths[0],
.swap_usage = 20,
.pgscan = 60,
@@ -306,6 +344,14 @@ static void test_oomd_sort_cgroups(void) {
.swap_usage = 10,
.pgscan = 80,
.current_memory_usage = 10 },
+ { .path = paths[4],
+ .swap_usage = 90,
+ .pgscan = 100,
+ .preference = MANAGED_OOM_PREFERENCE_OMIT },
+ { .path = paths[5],
+ .swap_usage = 99,
+ .pgscan = 200,
+ .preference = MANAGED_OOM_PREFERENCE_AVOID },
};
assert_se(h = hashmap_new(&string_hash_ops));
@@ -314,19 +360,23 @@ static void test_oomd_sort_cgroups(void) {
assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
+ assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0);
+ assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0);
- assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
+ assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[1]);
assert_se(sorted_cgroups[1] == &ctx[2]);
assert_se(sorted_cgroups[2] == &ctx[0]);
assert_se(sorted_cgroups[3] == &ctx[3]);
+ assert_se(sorted_cgroups[4] == &ctx[5]);
sorted_cgroups = mfree(sorted_cgroups);
- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4);
+ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[3]);
assert_se(sorted_cgroups[1] == &ctx[0]);
assert_se(sorted_cgroups[2] == &ctx[2]);
assert_se(sorted_cgroups[3] == &ctx[1]);
+ assert_se(sorted_cgroups[4] == &ctx[5]);
sorted_cgroups = mfree(sorted_cgroups);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
@@ -334,6 +384,8 @@ static void test_oomd_sort_cgroups(void) {
assert_se(sorted_cgroups[1] == &ctx[1]);
assert_se(sorted_cgroups[2] == 0);
assert_se(sorted_cgroups[3] == 0);
+ assert_se(sorted_cgroups[4] == 0);
+ assert_se(sorted_cgroups[5] == 0);
sorted_cgroups = mfree(sorted_cgroups);
}