summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2023-02-10 16:44:24 +0100
committerLennart Poettering <lennart@poettering.net>2023-02-22 13:09:42 +0100
commit158fe190afe37b222c9dc2c53bd7be426b92ef89 (patch)
treee9f2a08d7fd364ede9213da53af8a5684c7e13a8
parent81cfea95e51c72da6765b517e7038b3e7e3dec9f (diff)
downloadsystemd-158fe190afe37b222c9dc2c53bd7be426b92ef89.tar.gz
sd-event: add high-level sd_event_add_memory_pressure() event source
Typically, in reasonably complex programs we want to realease various caches (such as glibc allocation caches) in case of memory pressure. Let's add explicit infrastructure for that to sd-event, that can hook Linux' Pressure Stall Information (PSI) logic with our event loop. This adds sd_event_add_memory_pressure() as easy, one-step API to install an even source that is called under memory pressure. The parameters which file to watch (the per-cgroup PSI file, or the system-wide file /proc/pressure/memory) can be configured via env vars. The idea is that the service manager sooner or later gains controls for setting this up correctly. Alternatively to the PSI a similar logic is supported but instead of waiting for POLLPRI on a procfs/cgroupfs fd we'll wait for POLLIN on FIFO or AF_UNIX sockets. This is useful for testing, and possibly in other environments, for example to hook up this protocol directly with GNOME's low memory monitor. By default this watches on the cgroup-local PSI so that we aren't affected by pressure on cgroups we are not related to.
-rw-r--r--catalog/systemd.catalog.in15
-rw-r--r--src/basic/psi-util.h5
-rw-r--r--src/libsystemd/sd-event/event-source.h12
-rw-r--r--src/libsystemd/sd-event/sd-event.c582
-rw-r--r--src/systemd/sd-event.h5
-rw-r--r--src/systemd/sd-messages.h3
6 files changed, 621 insertions, 1 deletions
diff --git a/catalog/systemd.catalog.in b/catalog/systemd.catalog.in
index 975e77fcec..82d4820b80 100644
--- a/catalog/systemd.catalog.in
+++ b/catalog/systemd.catalog.in
@@ -549,3 +549,18 @@ Whenever the system transitions to a new runtime phase, the specified PCR is
extended with a different string, to ensure that security policies for
TPM-bound secrets and other resources are limited to specific phases of the
runtime.
+
+-- f9b0be465ad540d0850ad32172d57c21
+Subject: Memory Trimmed
+Defined-By: systemd
+Support: %SUPPORT_URL%
+
+Memory of process @_PID@ (@_COMM@) has been trimmed.
+
+Either on user request or as result of a memory pressure event, memory of the
+process has been trimmed, returning unneded allocation caches and other
+resources back to the OS kernel, making them available for other components of
+the OS.
+
+@TRIMMED_BYTES@ of memory were returned to the OS, which took @TRIMMED_USEC@
+micro-seconds (µs).
diff --git a/src/basic/psi-util.h b/src/basic/psi-util.h
index 415fbbdd47..558a130996 100644
--- a/src/basic/psi-util.h
+++ b/src/basic/psi-util.h
@@ -28,3 +28,8 @@ int read_resource_pressure(const char *path, PressureType type, ResourcePressure
/* Was the kernel compiled with CONFIG_PSI=y? 1 if yes, 0 if not, negative on error. */
int is_pressure_supported(void);
+
+/* Default parameters for memory pressure watch logic in sd-event and PID 1 */
+#define MEMORY_PRESSURE_DEFAULT_TYPE "some"
+#define MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC (100 * USEC_PER_MSEC)
+#define MEMORY_PRESSURE_DEFAULT_WINDOW_USEC USEC_PER_SEC
diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h
index 6092652d0f..f4e38d78d0 100644
--- a/src/libsystemd/sd-event/event-source.h
+++ b/src/libsystemd/sd-event/event-source.h
@@ -27,6 +27,7 @@ typedef enum EventSourceType {
SOURCE_EXIT,
SOURCE_WATCHDOG,
SOURCE_INOTIFY,
+ SOURCE_MEMORY_PRESSURE,
_SOURCE_EVENT_SOURCE_TYPE_MAX,
_SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL,
} EventSourceType;
@@ -129,6 +130,17 @@ struct sd_event_source {
struct inode_data *inode_data;
LIST_FIELDS(sd_event_source, by_inode_data);
} inotify;
+ struct {
+ int fd;
+ sd_event_handler_t callback;
+ void *write_buffer;
+ size_t write_buffer_size;
+ uint32_t events, revents;
+ LIST_FIELDS(sd_event_source, write_list);
+ bool registered:1;
+ bool locked:1;
+ bool in_write_list:1;
+ } memory_pressure;
};
};
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index 307fdde311..2f9b0ecda0 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -7,6 +7,7 @@
#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"
+#include "sd-messages.h"
#include "alloc-util.h"
#include "env-util.h"
@@ -15,15 +16,22 @@
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
+#include "hexdecoct.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
+#include "mallinfo-util.h"
#include "memory-util.h"
+#include "missing_magic.h"
#include "missing_syscall.h"
+#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
+#include "psi-util.h"
#include "set.h"
#include "signal-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
@@ -63,6 +71,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX]
[SOURCE_EXIT] = "exit",
[SOURCE_WATCHDOG] = "watchdog",
[SOURCE_INOTIFY] = "inotify",
+ [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
};
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
@@ -85,7 +94,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
SOURCE_TIME_BOOTTIME_ALARM, \
SOURCE_SIGNAL, \
SOURCE_DEFER, \
- SOURCE_INOTIFY)
+ SOURCE_INOTIFY, \
+ SOURCE_MEMORY_PRESSURE)
/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
* Time sources and ratelimited sources can be passed, so effectively this is the same as the
@@ -130,6 +140,9 @@ struct sd_event {
/* A list of inotify objects that already have events buffered which aren't processed yet */
LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
+ /* A list of memory pressure event sources that still need their subscription string written */
+ LIST_HEAD(sd_event_source, memory_pressure_write_list);
+
pid_t original_pid;
uint64_t iteration;
@@ -524,6 +537,65 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) {
return 0;
}
+static void source_memory_pressure_unregister(sd_event_source *s) {
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+
+ if (event_pid_changed(s->event))
+ return;
+
+ if (!s->memory_pressure.registered)
+ return;
+
+ if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
+ log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
+ strna(s->description), event_source_type_to_string(s->type));
+
+ s->memory_pressure.registered = false;
+}
+
+static int source_memory_pressure_register(sd_event_source *s, int enabled) {
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+ assert(enabled != SD_EVENT_OFF);
+
+ struct epoll_event ev = {
+ .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
+ (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
+ .data.ptr = s,
+ };
+
+ if (epoll_ctl(s->event->epoll_fd,
+ s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
+ s->memory_pressure.fd, &ev) < 0)
+ return -errno;
+
+ s->memory_pressure.registered = true;
+ return 0;
+}
+
+static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+
+ if (s->memory_pressure.in_write_list)
+ return;
+
+ LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
+ s->memory_pressure.in_write_list = true;
+}
+
+static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+
+ if (!s->memory_pressure.in_write_list)
+ return;
+
+ LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
+ s->memory_pressure.in_write_list = false;
+}
+
static clockid_t event_source_type_to_clock(EventSourceType t) {
switch (t) {
@@ -947,6 +1019,11 @@ static void source_disconnect(sd_event_source *s) {
break;
}
+ case SOURCE_MEMORY_PRESSURE:
+ source_memory_pressure_remove_from_write_list(s);
+ source_memory_pressure_unregister(s);
+ break;
+
default:
assert_not_reached();
}
@@ -1017,6 +1094,11 @@ static sd_event_source* source_free(sd_event_source *s) {
s->child.pidfd = safe_close(s->child.pidfd);
}
+ if (s->type == SOURCE_MEMORY_PRESSURE) {
+ s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
+ s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
+ }
+
if (s->destroy_callback)
s->destroy_callback(s->userdata);
@@ -1092,6 +1174,7 @@ static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType t
[SOURCE_POST] = endoffsetof_field(sd_event_source, post),
[SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
[SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
+ [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
};
sd_event_source *s;
@@ -1771,6 +1854,257 @@ _public_ int sd_event_add_exit(
return 0;
}
+int sd_event_trim_memory(void) {
+ int r;
+
+ /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
+ * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
+ * NULL callback parameter. */
+
+ log_debug("Memory pressure event, trimming malloc() memory.");
+
+#if HAVE_GENERIC_MALLINFO
+ generic_mallinfo before_mallinfo = generic_mallinfo_get();
+#endif
+
+ usec_t before_timestamp = now(CLOCK_MONOTONIC);
+ hashmap_trim_pools();
+ r = malloc_trim(0);
+ usec_t after_timestamp = now(CLOCK_MONOTONIC);
+
+ if (r > 0)
+ log_debug("Successfully trimmed some memory.");
+ else
+ log_debug("Couldn't trim any memory.");
+
+ usec_t period = after_timestamp - before_timestamp;
+
+#if HAVE_GENERIC_MALLINFO
+ generic_mallinfo after_mallinfo = generic_mallinfo_get();
+ size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
+ LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
+ log_struct(LOG_DEBUG,
+ LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
+ FORMAT_TIMESPAN(period, 0),
+ FORMAT_BYTES(l)),
+ "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
+ "TRIMMED_BYTES=%zu", l,
+ "TRIMMED_USEC=" USEC_FMT, period);
+#else
+ log_struct(LOG_DEBUG,
+ LOG_MESSAGE("Memory trimming took %s.",
+ FORMAT_TIMESPAN(period, 0)),
+ "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
+ "TRIMMED_USEC=" USEC_FMT, period);
+#endif
+
+ return 0;
+}
+
+static int memory_pressure_callback(sd_event_source *s, void *userdata) {
+ assert(s);
+
+ sd_event_trim_memory();
+ return 0;
+}
+
+_public_ int sd_event_add_memory_pressure(
+ sd_event *e,
+ sd_event_source **ret,
+ sd_event_handler_t callback,
+ void *userdata) {
+
+ _cleanup_free_ char *w = NULL;
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ _cleanup_close_ int path_fd = -1, fd = -1;
+ _cleanup_free_ void *write_buffer = NULL;
+ const char *watch, *watch_fallback, *env;
+ size_t write_buffer_size = 0;
+ struct stat st;
+ uint32_t events;
+ bool locked;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = memory_pressure_callback;
+
+ s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
+ if (!s)
+ return -ENOMEM;
+
+ s->wakeup = WAKEUP_EVENT_SOURCE;
+ s->memory_pressure.callback = callback;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ON;
+ s->memory_pressure.fd = -EBADF;
+
+ env = secure_getenv("MEMORY_PRESSURE_WATCH");
+ if (env) {
+ if (isempty(env) || path_equal(env, "/dev/null"))
+ return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+ "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
+
+ if (!path_is_absolute(env) || !path_is_normalized(env))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
+ "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
+
+ watch = env;
+
+ env = secure_getenv("MEMORY_PRESSURE_WRITE");
+ if (env) {
+ r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
+ if (r < 0)
+ return r;
+ }
+
+ locked = true;
+ } else {
+
+ r = is_pressure_supported();
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EOPNOTSUPP;
+
+ /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
+ * the system wide pressure if for some reason we cannot (which could be: memory controller
+ * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
+ * only use the system-wide logic. */
+ r = cg_all_unified();
+ if (r < 0)
+ return r;
+ if (r == 0)
+ watch = "/proc/pressure/memory";
+ else {
+ _cleanup_free_ char *cg = NULL;
+
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
+ if (r < 0)
+ return r;
+
+ w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
+ if (!w)
+ return -ENOMEM;
+
+ watch = w;
+ watch_fallback = "/proc/pressure/memory";
+ }
+
+ /* Android uses three levels in its userspace low memory killer logic:
+ * some 70000 1000000
+ * some 100000 1000000
+ * full 70000 1000000
+ *
+ * GNOME's low memory monitor uses:
+ * some 70000 1000000
+ * some 100000 1000000
+ * full 100000 1000000
+ *
+ * We'll default to the middle level that both agree on */
+ if (asprintf((char**) &write_buffer,
+ "%s " USEC_FMT " " USEC_FMT,
+ MEMORY_PRESSURE_DEFAULT_TYPE,
+ MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
+ MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
+ return -ENOMEM;
+
+ write_buffer_size = strlen(write_buffer) + 1;
+ locked = false;
+ }
+
+ path_fd = open(watch, O_PATH|O_CLOEXEC);
+ if (path_fd < 0) {
+ if (errno != ENOENT)
+ return -errno;
+
+ /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
+ * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
+ * the PSI service apparently is not supported) */
+ if (!watch_fallback)
+ return locked ? -ENOENT : -EOPNOTSUPP;
+
+ path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
+ if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
+ return -EOPNOTSUPP;
+ if (errno < 0)
+ return -errno;
+ }
+
+ if (fstat(path_fd, &st) < 0)
+ return -errno;
+
+ if (S_ISSOCK(st.st_mode)) {
+ fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return -errno;
+
+ r = connect_unix_path(fd, path_fd, NULL);
+ if (r < 0)
+ return r;
+
+ events = EPOLLIN;
+
+ } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
+ fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+ if (fd < 0)
+ return fd;
+
+ if (S_ISREG(st.st_mode)) {
+ struct statfs sfs;
+
+ /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
+
+ if (fstatfs(fd, &sfs) < 0)
+ return -errno;
+
+ if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
+ !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
+ return -ENOTTY;
+
+ events = EPOLLPRI;
+ } else
+ /* For fifos and char devices just watch for EPOLLIN */
+ events = EPOLLIN;
+
+ } else if (S_ISDIR(st.st_mode))
+ return -EISDIR;
+ else
+ return -EBADF;
+
+ s->memory_pressure.fd = TAKE_FD(fd);
+ s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
+ s->memory_pressure.write_buffer_size = write_buffer_size;
+ s->memory_pressure.events = events;
+ s->memory_pressure.locked = locked;
+
+ /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
+ * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
+ * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
+ * event sources on which writes must be executed before the first event loop iteration is
+ * executed. (We could also write the data here, right away, but we want to give the caller the
+ * freedom to call sd_event_source_set_memory_pressure_type() and
+ * sd_event_source_set_memory_pressure_rate() before we write it. */
+
+ if (s->memory_pressure.write_buffer_size > 0)
+ source_memory_pressure_add_to_write_list(s);
+ else {
+ r = source_memory_pressure_register(s, s->enabled);
+ if (r < 0)
+ return r;
+ }
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
+
static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
assert(e);
@@ -2562,6 +2896,10 @@ static int event_source_offline(
prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
break;
+ case SOURCE_MEMORY_PRESSURE:
+ source_memory_pressure_unregister(s);
+ break;
+
case SOURCE_TIME_REALTIME:
case SOURCE_TIME_BOOTTIME:
case SOURCE_TIME_MONOTONIC:
@@ -2649,6 +2987,13 @@ static int event_source_online(
s->event->n_online_child_sources++;
break;
+ case SOURCE_MEMORY_PRESSURE:
+ r = source_memory_pressure_register(s, enabled);
+ if (r < 0)
+ return r;
+
+ break;
+
case SOURCE_TIME_REALTIME:
case SOURCE_TIME_BOOTTIME:
case SOURCE_TIME_MONOTONIC:
@@ -3630,6 +3975,106 @@ static int process_inotify(sd_event *e) {
return done;
}
+static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+
+ if (s->pending)
+ s->memory_pressure.revents |= revents;
+ else
+ s->memory_pressure.revents = revents;
+
+ return source_set_pending(s, true);
+}
+
+static int source_memory_pressure_write(sd_event_source *s) {
+ ssize_t n;
+ int r;
+
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+
+ /* once we start writing, the buffer is locked, we allow no further changes. */
+ s->memory_pressure.locked = true;
+
+ if (s->memory_pressure.write_buffer_size > 0) {
+ n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
+ if (n < 0) {
+ if (!ERRNO_IS_TRANSIENT(errno))
+ return -errno;
+
+ n = 0;
+ }
+ } else
+ n = 0;
+
+ assert(n >= 0);
+
+ if ((size_t) n == s->memory_pressure.write_buffer_size) {
+ s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
+
+ if (n > 0) {
+ s->memory_pressure.write_buffer_size = 0;
+
+ /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
+ r = source_memory_pressure_register(s, s->enabled);
+ if (r < 0)
+ return r;
+ }
+ } else if (n > 0) {
+ _cleanup_free_ void *c = NULL;
+
+ assert((size_t) n < s->memory_pressure.write_buffer_size);
+
+ c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
+ if (!c)
+ return -ENOMEM;
+
+ free_and_replace(s->memory_pressure.write_buffer, c);
+ s->memory_pressure.write_buffer_size -= n;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
+ int r;
+
+ assert(s);
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+
+ r = source_memory_pressure_write(s);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return 1; /* if we wrote something, then don't continue with dispatching user dispatch
+ * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */
+
+ /* No pending incoming IO? Then let's not continue further */
+ if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
+
+ /* Treat IO errors on the notifier the same ways errors returned from a callback */
+ if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
+ return -EIO;
+
+ return 1; /* leave dispatch, we already processed everything */
+ }
+
+ if (s->memory_pressure.revents & EPOLLIN) {
+ uint8_t pipe_buf[PIPE_BUF];
+ ssize_t n;
+
+ /* If the fd is readable, then flush out anything that might be queued */
+
+ n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
+ if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
+ return -errno;
+ }
+
+ return 0; /* go on, dispatch to user callback */
+}
+
static int source_dispatch(sd_event_source *s) {
EventSourceType saved_type;
sd_event *saved_event;
@@ -3678,6 +4123,16 @@ static int source_dispatch(sd_event_source *s) {
}
}
+ if (s->type == SOURCE_MEMORY_PRESSURE) {
+ r = source_memory_pressure_initiate_dispatch(s);
+ if (r == -EIO) /* handle EIO errors similar to callback errors */
+ goto finish;
+ if (r < 0)
+ return r;
+ if (r > 0) /* already handled */
+ return 1;
+ }
+
if (s->enabled == SD_EVENT_ONESHOT) {
r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
if (r < 0)
@@ -3764,6 +4219,10 @@ static int source_dispatch(sd_event_source *s) {
break;
}
+ case SOURCE_MEMORY_PRESSURE:
+ r = s->memory_pressure.callback(s, s->userdata);
+ break;
+
case SOURCE_WATCHDOG:
case _SOURCE_EVENT_SOURCE_TYPE_MAX:
case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
@@ -3772,6 +4231,7 @@ static int source_dispatch(sd_event_source *s) {
s->dispatching = false;
+finish:
if (r < 0) {
log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
strna(s->description),
@@ -3922,6 +4382,30 @@ static void event_close_inode_data_fds(sd_event *e) {
}
}
+static int event_memory_pressure_write_list(sd_event *e) {
+ int r;
+
+ assert(e);
+
+ for (;;) {
+ sd_event_source *s;
+
+ s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
+ if (!s)
+ break;
+
+ assert(s->type == SOURCE_MEMORY_PRESSURE);
+ assert(s->memory_pressure.write_buffer_size > 0);
+ s->memory_pressure.in_write_list = false;
+
+ r = source_memory_pressure_write(s);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
_public_ int sd_event_prepare(sd_event *e) {
int r;
@@ -3950,6 +4434,10 @@ _public_ int sd_event_prepare(sd_event *e) {
if (r < 0)
return r;
+ r = event_memory_pressure_write_list(e);
+ if (r < 0)
+ return r;
+
r = event_arm_timer(e, &e->realtime);
if (r < 0)
return r;
@@ -4115,6 +4603,10 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t
r = process_pidfd(e, s, e->event_queue[i].events);
break;
+ case SOURCE_MEMORY_PRESSURE:
+ r = process_memory_pressure(s, e->event_queue[i].events);
+ break;
+
default:
assert_not_reached();
}
@@ -4700,3 +5192,91 @@ _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
return change;
}
+
+_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
+ _cleanup_free_ char *b = NULL;
+ _cleanup_free_ void *w = NULL;
+
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
+ assert_return(ty, -EINVAL);
+
+ if (!STR_IN_SET(ty, "some", "full"))
+ return -EINVAL;
+
+ if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
+ return -EBUSY;
+
+ char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
+ if (!space)
+ return -EINVAL;
+
+ size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
+ b = memdup_suffix0(s->memory_pressure.write_buffer, l);
+ if (!b)
+ return -ENOMEM;
+ if (!STR_IN_SET(b, "some", "full"))
+ return -EINVAL;
+
+ if (streq(b, ty))
+ return 0;
+
+ size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
+ w = new(char, nl);
+ if (!w)
+ return -ENOMEM;
+
+ memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
+
+ free_and_replace(s->memory_pressure.write_buffer, w);
+ s->memory_pressure.write_buffer_size = nl;
+ s->memory_pressure.locked = false;
+
+ return 1;
+}
+
+_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
+ _cleanup_free_ char *b = NULL;
+ _cleanup_free_ void *w = NULL;
+
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
+
+ if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
+ return -ERANGE;
+ if (window_usec <= 0 || window_usec >= UINT64_MAX)
+ return -ERANGE;
+ if (threshold_usec > window_usec)
+ return -EINVAL;
+
+ if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
+ return -EBUSY;
+
+ char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
+ if (!space)
+ return -EINVAL;
+
+ size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
+ b = memdup_suffix0(s->memory_pressure.write_buffer, l);
+ if (!b)
+ return -ENOMEM;
+ if (!STR_IN_SET(b, "some", "full"))
+ return -EINVAL;
+
+ if (asprintf((char**) &w,
+ "%s " USEC_FMT " " USEC_FMT "",
+ b,
+ threshold_usec,
+ window_usec) < 0)
+ return -EINVAL;
+
+ l = strlen(w) + 1;
+ if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
+ return 0;
+
+ free_and_replace(s->memory_pressure.write_buffer, w);
+ s->memory_pressure.write_buffer_size = l;
+ s->memory_pressure.locked = false;
+
+ return 1;
+}
diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h
index cae4c8672a..5ca8528895 100644
--- a/src/systemd/sd-event.h
+++ b/src/systemd/sd-event.h
@@ -99,6 +99,7 @@ int sd_event_add_inotify_fd(sd_event *e, sd_event_source **s, int fd, uint32_t m
int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
int sd_event_add_exit(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
+int sd_event_add_memory_pressure(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
int sd_event_prepare(sd_event *e);
int sd_event_wait(sd_event *e, uint64_t usec);
@@ -160,6 +161,8 @@ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo
int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags);
#endif
int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret);
+int sd_event_source_set_memory_pressure_type(sd_event_source *e, const char *ty);
+int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec);
int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback);
int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret);
int sd_event_source_get_floating(sd_event_source *s);
@@ -171,6 +174,8 @@ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval_use
int sd_event_source_is_ratelimited(sd_event_source *s);
int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback);
+int sd_event_trim_memory(void);
+
/* Define helpers so that __attribute__((cleanup(sd_event_unrefp))) and similar may be used. */
_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event, sd_event_unref);
_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event_source, sd_event_source_unref);
diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h
index 00fdbad2c5..39f56240fe 100644
--- a/src/systemd/sd-messages.h
+++ b/src/systemd/sd-messages.h
@@ -195,6 +195,9 @@ _SD_BEGIN_DECLARATIONS;
#define SD_MESSAGE_TPM_PCR_EXTEND SD_ID128_MAKE(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab)
#define SD_MESSAGE_TPM_PCR_EXTEND_STR SD_ID128_MAKE_STR(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab)
+#define SD_MESSAGE_MEMORY_TRIM SD_ID128_MAKE(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21)
+#define SD_MESSAGE_MEMORY_TRIM_STR SD_ID128_MAKE_STR(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21)
+
_SD_END_DECLARATIONS;
#endif