diff options
author | Benjamin Berg <bberg@redhat.com> | 2022-02-03 15:50:31 +0100 |
---|---|---|
committer | Benjamin Berg <bberg@redhat.com> | 2022-02-04 20:00:35 +0100 |
commit | 29f4185a9cdc101e78efd92c1cd42d9a7c5de72e (patch) | |
tree | b81c4e7a8eeff18446052e97492f17cd0e97009d /src/oom | |
parent | f83da717d9cdc9d464fcc4abae5193b4b5db1f79 (diff) | |
download | systemd-29f4185a9cdc101e78efd92c1cd42d9a7c5de72e.tar.gz |
oomd: Dump top offenders after a kill action
This hopefully makes it more transparent why a specific cgroup was
killed by systemd-oomd.
Diffstat (limited to 'src/oom')
-rw-r--r-- | src/oom/oomd-util.c | 42 | ||||
-rw-r--r-- | src/oom/oomd-util.h | 1 |
2 files changed, 41 insertions, 2 deletions
```diff
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
index cef7519a74..4d05f57a69 100644
--- a/src/oom/oomd-util.c
+++ b/src/oom/oomd-util.c
@@ -216,9 +216,36 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
         return set_size(pids_killed) != 0;
 }

+typedef void (*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+
+static int dump_kill_candidates(OomdCGroupContext **sorted, int n, int dump_until, dump_candidate_func dump_func) {
+        /* Try dumping top offendors, ignoring any errors that might happen. */
+        _cleanup_free_ char *dump = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+        size_t size;
+
+        f = open_memstream_unlocked(&dump, &size);
+        if (!f)
+                return -errno;;
+
+        fprintf(f, "Considered %d cgroups for killing, top candidates were:\n", n);
+        for (int i = 0; i < dump_until; i++)
+                dump_func(sorted[i], f, "\t");
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return r;
+
+        f = safe_fclose(f);
+
+        return log_dump(LOG_INFO, dump);
+}
+
 int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
         int n, r, ret = 0;
+        int dump_until;

         assert(h);
         assert(ret_selected);
@@ -227,6 +254,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
         if (n < 0)
                 return n;

+        dump_until = MIN(n, DUMP_ON_KILL_COUNT);
         for (int i = 0; i < n; i++) {
                 /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
                  * Continue since there might be "avoid" cgroups at the end. */
@@ -242,19 +270,24 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
                         continue; /* Try to find something else to kill */
                 }

+                dump_until = MAX(dump_until, i);
                 char *selected = strdup(sorted[i]->path);
                 if (!selected)
                         return -ENOMEM;
                 *ret_selected = selected;
-                return r;
+                ret = r;
+                break;
         }

+        dump_kill_candidates(sorted, n, dump_until, oomd_dump_memory_pressure_cgroup_context);
+
         return ret;
 }

 int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
         int n, r, ret = 0;
+        int dump_until;

         assert(h);
         assert(ret_selected);
@@ -263,6 +296,7 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run,
         if (n < 0)
                 return n;

+        dump_until = MIN(n, DUMP_ON_KILL_COUNT);
         /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
          * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
         for (int i = 0; i < n; i++) {
@@ -280,13 +314,17 @@ int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run,
                         continue; /* Try to find something else to kill */
                 }

+                dump_until = MAX(dump_until, i);
                 char *selected = strdup(sorted[i]->path);
                 if (!selected)
                         return -ENOMEM;
                 *ret_selected = selected;
-                return r;
+                ret = r;
+                break;
         }

+        dump_kill_candidates(sorted, n, dump_until, oomd_dump_swap_cgroup_context);
+
         return ret;
 }

diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
index 3a91a31352..6aada90344 100644
--- a/src/oom/oomd-util.h
+++ b/src/oom/oomd-util.h
@@ -7,6 +7,7 @@
 #include "hashmap.h"
 #include "psi-util.h"

+#define DUMP_ON_KILL_COUNT 10
 #define GROWING_SIZE_PERCENTILE 80

 extern const struct hash_ops oomd_cgroup_ctx_hash_ops;
```