oomd: split swap and mem pressure event timers

One thing that came out of the test week is that systoomd needs to poll more frequently so as not to race with the kernel oom killer in situations where memory is eaten quickly. Memory pressure counters are lagging so it isn't worthwhile to change the current read rate; however swap is not lagging and can be checked more frequently. So let's split these into 2 different timer events. As a result, swap now also doesn't have to be subject to the post-action (post-kill) delay that we need for memory pressure events. Addresses some of slowness to kill discussed in https://bugzilla.redhat.com/show_bug.cgi?id=1941340
author: Anita Zhang <the.anitazha@gmail.com> 2021-03-24 02:17:04 -0700
committer: Anita Zhang <the.anitazha@gmail.com> 2021-04-01 19:44:14 -0700
commit: 81d66fab342e112aefe8b5f43bc853ef66c92add (patch)
tree: 38a79355fc512b0bc2f80ebd8445255c32f0d816 /src/oom
parent: 8ab34a49dbf75fd731973359a6f24c212682f479 (diff)
download: systemd-81d66fab342e112aefe8b5f43bc853ef66c92add.tar.gz
2 files changed, 127 insertions, 56 deletions
diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
index c3e84aadde..2a683d5469 100644
--- a/src/oom/oomd-manager.c
+++ b/src/oom/oomd-manager.c
@@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) {
         return 0;
 }
 
-static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
-        _cleanup_set_free_ Set *targets = NULL;
+static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
         Manager *m = userdata;
         usec_t usec_now;
         int r;
@@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
         if (r < 0)
                 return log_error_errno(r, "Failed to reset event timer: %m");
 
-        r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
+        r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
         if (r < 0)
                 return log_error_errno(r, "Failed to set relative time for timer: %m");
 
@@ -324,13 +323,89 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                         return log_error_errno(r, "Failed to acquire varlink connection: %m");
         }
 
-        /* Update the cgroups used for detection/action */
-        r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
-        if (r == -ENOMEM)
-                return log_oom();
+        /* We still try to acquire swap information for oomctl even if no units want swap monitoring */
+        r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
+        /* If there are no units depending on swap actions, the only error we exit on is ENOMEM.
+         * Allow ENOENT in the event that swap is disabled on the system. */
+        if (r == -ENOENT) {
+                zero(m->system_context);
+                return 0;
+        } else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
+                return log_error_errno(r, "Failed to acquire system context: %m");
+
+        /* Return early if nothing is requesting swap monitoring */
+        if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
+                return 0;
+
+        /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
+         * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
+         * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
+         * nodes are the ones that matter). */
+
+        if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
+                _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+                _cleanup_free_ char *selected = NULL;
+
+                log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
+                          m->system_context.swap_used, m->system_context.swap_total,
+                          PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+
+                r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
+
+                r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
+                else {
+                        if (selected)
+                                log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
+                                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
+                                           selected, m->system_context.swap_used, m->system_context.swap_total,
+                                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+                        return 0;
+                }
+        }
+
+        return 0;
+}
+
+static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+        _cleanup_set_free_ Set *targets = NULL;
+        Manager *m = userdata;
+        usec_t usec_now;
+        int r;
+
+        assert(s);
+        assert(userdata);
+
+        /* Reset timer */
+        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
         if (r < 0)
-                log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
+                return log_error_errno(r, "Failed to reset event timer: %m");
+
+        r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set relative time for timer: %m");
+
+        /* Reconnect if our connection dropped */
+        if (!m->varlink) {
+                r = acquire_managed_oom_connect(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
+        }
 
+        /* Return early if nothing is requesting memory pressure monitoring */
+        if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) {
+                hashmap_clear(m->monitored_mem_pressure_cgroup_contexts_candidates);
+                return 0;
+        }
+
+        /* Update the cgroups used for detection/action */
         r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
         if (r == -ENOMEM)
                 return log_oom();
@@ -344,23 +419,16 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
         if (r < 0)
                 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
 
-        r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
-        /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
-         * Allow ENOENT in the event that swap is disabled on the system. */
-        if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
-                return log_error_errno(r, "Failed to acquire system context: %m");
-        else if (r == -ENOENT)
-                zero(m->system_context);
-
         if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
                 m->last_reclaim_at = usec_now;
 
-        /* If we're still recovering from a kill, don't try to kill again yet */
-        if (m->post_action_delay_start > 0) {
-                if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
+        /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
+         * values and go on a kill storm. */
+        if (m->mem_pressure_post_action_delay_start > 0) {
+                if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
                         return 0;
                 else
-                        m->post_action_delay_start = 0;
+                        m->mem_pressure_post_action_delay_start = 0;
         }
 
         r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
@@ -396,7 +464,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                                         log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
                                 else {
                                         /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
-                                        m->post_action_delay_start = usec_now;
+                                        m->mem_pressure_post_action_delay_start = usec_now;
                                         if (selected)
                                                 log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
                                                            " for > %s with reclaim activity",
@@ -412,47 +480,42 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                 }
         }
 
-        if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
-                _cleanup_hashmap_free_ Hashmap *candidates = NULL;
-                _cleanup_free_ char *selected = NULL;
+        return 0;
+}
 
-                log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
-                          m->system_context.swap_used, m->system_context.swap_total,
-                          PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+static int monitor_swap_contexts(Manager *m) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+        int r;
 
-                r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
-                if (r == -ENOMEM)
-                        return log_oom();
-                if (r < 0)
-                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
+        assert(m);
+        assert(m->event);
 
-                r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
-                if (r == -ENOMEM)
-                        return log_oom();
-                if (r < 0)
-                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
-                else {
-                        m->post_action_delay_start = usec_now;
-                        if (selected)
-                                log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
-                                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
-                                           selected, m->system_context.swap_used, m->system_context.swap_total,
-                                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
-                        return 0;
-                }
-        }
+        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_exit_on_failure(s, true);
+        if (r < 0)
+                return r;
 
+        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(s, "oomd-swap-timer");
+
+        m->swap_context_event_source = TAKE_PTR(s);
         return 0;
 }
 
-static int monitor_cgroup_contexts(Manager *m) {
+static int monitor_memory_pressure_contexts(Manager *m) {
         _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
         int r;
 
         assert(m);
         assert(m->event);
 
-        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
+        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
         if (r < 0)
                 return r;
 
@@ -464,9 +527,9 @@ static int monitor_cgroup_contexts(Manager *m) {
         if (r < 0)
                 return r;
 
-        (void) sd_event_source_set_description(s, "oomd-timer");
+        (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
 
-        m->cgroup_context_event_source = TAKE_PTR(s);
+        m->mem_pressure_context_event_source = TAKE_PTR(s);
         return 0;
 }
 
@@ -474,7 +537,8 @@ Manager* manager_free(Manager *m) {
         assert(m);
 
         varlink_close_unref(m->varlink);
-        sd_event_source_unref(m->cgroup_context_event_source);
+        sd_event_source_unref(m->swap_context_event_source);
+        sd_event_source_unref(m->mem_pressure_context_event_source);
         sd_event_unref(m->event);
 
         bus_verify_polkit_async_registry_free(m->polkit_registry);
@@ -596,7 +660,11 @@ int manager_start(
         if (r < 0)
                 return r;
 
-        r = monitor_cgroup_contexts(m);
+        r = monitor_memory_pressure_contexts(m);
+        if (r < 0)
+                return r;
+
+        r = monitor_swap_contexts(m);
         if (r < 0)
                 return r;
 
diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h
index 9c580c8a24..3df7ce94d7 100644
--- a/src/oom/oomd-manager.h
+++ b/src/oom/oomd-manager.h
@@ -7,7 +7,9 @@
 #include "varlink.h"
 
 /* Polling interval for monitoring stats */
-#define INTERVAL_USEC (1 * USEC_PER_SEC)
+#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
+/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
+#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
 
 /* Used to weight the averages */
 #define AVERAGE_SIZE_DECAY 4
@@ -45,9 +47,10 @@ struct Manager {
         OomdSystemContext system_context;
 
         usec_t last_reclaim_at;
-        usec_t post_action_delay_start;
+        usec_t mem_pressure_post_action_delay_start;
 
-        sd_event_source *cgroup_context_event_source;
+        sd_event_source *swap_context_event_source;
+        sd_event_source *mem_pressure_context_event_source;
 
         Varlink *varlink;
 };
author	Anita Zhang <the.anitazha@gmail.com>	2021-03-24 02:17:04 -0700
committer	Anita Zhang <the.anitazha@gmail.com>	2021-04-01 19:44:14 -0700
commit	81d66fab342e112aefe8b5f43bc853ef66c92add (patch)
tree	38a79355fc512b0bc2f80ebd8445255c32f0d816 /src/oom
parent	8ab34a49dbf75fd731973359a6f24c212682f479 (diff)
download	systemd-81d66fab342e112aefe8b5f43bc853ef66c92add.tar.gz