Merge pull request #11530 from keszybz/journal-cache-trimming

Journal cache trimming
author: Lennart Poettering <lennart@poettering.net> 2019-01-26 13:55:55 +0100
committer: GitHub <noreply@github.com> 2019-01-26 13:55:55 +0100
commit: 67216ef8f2cb63482dc7ba184b8b7460cf4916f1 (patch)
tree: 51daa2f3600cbf37d1423e8deff2b390d898840f
parent: b3f259d0566bdac1af2987dd316e312c0557edf9 (diff)
parent: 91714a7f427a6c9c5c3be8b3819fee45050028f3 (diff)
download: systemd-67216ef8f2cb63482dc7ba184b8b7460cf4916f1.tar.gz
10 files changed, 99 insertions, 17 deletions
diff --git a/src/basic/prioq.c b/src/basic/prioq.c
index cfd08d5d23..76b27fa0a8 100644
--- a/src/basic/prioq.c
+++ b/src/basic/prioq.c
@@ -259,15 +259,14 @@ int prioq_reshuffle(Prioq *q, void *data, unsigned *idx) {
         return 1;
 }
 
-void *prioq_peek(Prioq *q) {
-
+void *prioq_peek_by_index(Prioq *q, unsigned idx) {
         if (!q)
                 return NULL;
 
-        if (q->n_items <= 0)
+        if (idx >= q->n_items)
                 return NULL;
 
-        return q->items[0].data;
+        return q->items[idx].data;
 }
 
 void *prioq_pop(Prioq *q) {
diff --git a/src/basic/prioq.h b/src/basic/prioq.h
index bba5c7caa4..1fb57bfa4c 100644
--- a/src/basic/prioq.h
+++ b/src/basic/prioq.h
@@ -19,8 +19,14 @@ int prioq_put(Prioq *q, void *data, unsigned *idx);
 int prioq_remove(Prioq *q, void *data, unsigned *idx);
 int prioq_reshuffle(Prioq *q, void *data, unsigned *idx);
 
-void *prioq_peek(Prioq *q) _pure_;
+void *prioq_peek_by_index(Prioq *q, unsigned idx) _pure_;
+static inline void *prioq_peek(Prioq *q) {
+        return prioq_peek_by_index(q, 0);
+}
 void *prioq_pop(Prioq *q);
 
+#define PRIOQ_FOREACH_ITEM(q, p)                                \
+        for (unsigned _i = 0; (p = prioq_peek_by_index(q, _i)); _i++)
+
 unsigned prioq_size(Prioq *q) _pure_;
 bool prioq_isempty(Prioq *q) _pure_;
diff --git a/src/basic/procfs-util.c b/src/basic/procfs-util.c
index a159e344b3..7aaf95bfce 100644
--- a/src/basic/procfs-util.c
+++ b/src/basic/procfs-util.c
@@ -201,13 +201,11 @@ int procfs_cpu_get_usage(nsec_t *ret) {
         return 0;
 }
 
-int procfs_memory_get_current(uint64_t *ret) {
+int procfs_memory_get(uint64_t *ret_total, uint64_t *ret_used) {
         uint64_t mem_total = UINT64_MAX, mem_free = UINT64_MAX;
         _cleanup_fclose_ FILE *f = NULL;
         int r;
 
-        assert(ret);
-
         f = fopen("/proc/meminfo", "re");
         if (!f)
                 return -errno;
@@ -262,6 +260,9 @@ int procfs_memory_get_current(uint64_t *ret) {
         if (mem_free > mem_total)
                 return -EINVAL;
 
-        *ret = (mem_total - mem_free) * 1024U;
+        if (ret_total)
+                *ret_total = mem_total * 1024U;
+        if (ret_used)
+                *ret_used = (mem_total - mem_free) * 1024U;
         return 0;
 }
diff --git a/src/basic/procfs-util.h b/src/basic/procfs-util.h
index f697ed92bc..5a44e9eff7 100644
--- a/src/basic/procfs-util.h
+++ b/src/basic/procfs-util.h
@@ -11,4 +11,7 @@ int procfs_tasks_get_current(uint64_t *ret);
 
 int procfs_cpu_get_usage(nsec_t *ret);
 
-int procfs_memory_get_current(uint64_t *ret);
+int procfs_memory_get(uint64_t *ret_total, uint64_t *ret_used);
+static inline int procfs_memory_get_used(uint64_t *ret) {
+        return procfs_memory_get(NULL, ret);
+}
diff --git a/src/cgtop/cgtop.c b/src/cgtop/cgtop.c
index 11cc5fa2e9..b3bda30cec 100644
--- a/src/cgtop/cgtop.c
+++ b/src/cgtop/cgtop.c
@@ -291,7 +291,7 @@ static int process(
         } else if (streq(controller, "memory")) {
 
                 if (is_root_cgroup(path)) {
-                        r = procfs_memory_get_current(&g->memory);
+                        r = procfs_memory_get_used(&g->memory);
                         if (r < 0)
                                 return r;
                 } else {
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index ed2f331b33..18d470b6d6 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -2780,7 +2780,7 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) {
 
         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
         if (unit_has_host_root_cgroup(u))
-                return procfs_memory_get_current(ret);
+                return procfs_memory_get_used(ret);
 
         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
                 return -ENODATA;
diff --git a/src/journal/journald-context.c b/src/journal/journald-context.c
index 80bbc34e37..7c51f2f633 100644
--- a/src/journal/journald-context.c
+++ b/src/journal/journald-context.c
@@ -16,6 +16,7 @@
 #include "parse-util.h"
 #include "path-util.h"
 #include "process-util.h"
+#include "procfs-util.h"
 #include "string-util.h"
 #include "syslog-util.h"
 #include "unaligned.h"
@@ -60,7 +61,37 @@
 /* Keep at most 16K entries in the cache. (Note though that this limit may be violated if enough streams pin entries in
  * the cache, in which case we *do* permit this limit to be breached. That's safe however, as the number of stream
  * clients itself is limited.) */
-#define CACHE_MAX (16*1024)
+#define CACHE_MAX_FALLBACK 128U
+#define CACHE_MAX_MAX (16*1024U)
+#define CACHE_MAX_MIN 64U
+
+static size_t cache_max(void) {
+        static size_t cached = -1;
+
+        if (cached == (size_t) -1) {
+                uint64_t mem_total;
+                int r;
+
+                r = procfs_memory_get(&mem_total, NULL);
+                if (r < 0) {
+                        log_warning_errno(r, "Cannot query /proc/meminfo for MemTotal: %m");
+                        cached = CACHE_MAX_FALLBACK;
+                } else {
+                        /* Cache entries are usually a few kB, but the process cmdline is controlled by the
+                         * user and can be up to _SC_ARG_MAX, usually 2MB. Let's say that approximately up to
+                         * 1/8th of memory may be used by the cache.
+                         *
+                         * In the common case, this formula gives 64 cache entries for each GB of RAM.
+                         */
+                        long l = sysconf(_SC_ARG_MAX);
+                        assert(l > 0);
+
+                        cached = CLAMP(mem_total / 8 / (uint64_t) l, CACHE_MAX_MIN, CACHE_MAX_MAX);
+                }
+        }
+
+        return cached;
+}
 
 static int client_context_compare(const void *a, const void *b) {
         const ClientContext *x = a, *y = b;
@@ -550,15 +581,39 @@ refresh:
 }
 
 static void client_context_try_shrink_to(Server *s, size_t limit) {
+        ClientContext *c;
+        usec_t t;
+
         assert(s);
 
+        /* Flush any cache entries for PIDs that have already moved on. Don't do this
+         * too often, since it's a slow process. */
+        t = now(CLOCK_MONOTONIC);
+        if (s->last_cache_pid_flush + MAX_USEC < t) {
+                unsigned n = prioq_size(s->client_contexts_lru), idx = 0;
+
+                /* We do a number of iterations based on the initial size of the prioq.  When we remove an
+                 * item, a new item is moved into its places, and items to the right might be reshuffled.
+                 */
+                for (unsigned i = 0; i < n; i++) {
+                        c = prioq_peek_by_index(s->client_contexts_lru, idx);
+
+                        assert(c->n_ref == 0);
+
+                        if (!pid_is_unwaited(c->pid))
+                                client_context_free(s, c);
+                        else
+                                idx ++;
+                }
+
+                s->last_cache_pid_flush = t;
+        }
+
         /* Bring the number of cache entries below the indicated limit, so that we can create a new entry without
          * breaching the limit. Note that we only flush out entries that aren't pinned here. This means the number of
          * cache entries may very well grow beyond the limit, if all entries stored remain pinned. */
 
         while (hashmap_size(s->client_contexts) > limit) {
-                ClientContext *c;
-
                 c = prioq_pop(s->client_contexts_lru);
                 if (!c)
                         break; /* All remaining entries are pinned, give up */
@@ -627,7 +682,7 @@ static int client_context_get_internal(
                 return 0;
         }
 
-        client_context_try_shrink_to(s, CACHE_MAX-1);
+        client_context_try_shrink_to(s, cache_max()-1);
 
         r = client_context_new(s, pid, &c);
         if (r < 0)
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
index 6d4847b0cd..3f6b42ddd5 100644
--- a/src/journal/journald-server.h
+++ b/src/journal/journald-server.h
@@ -161,6 +161,8 @@ struct Server {
         Hashmap *client_contexts;
         Prioq *client_contexts_lru;
 
+        usec_t last_cache_pid_flush;
+
         ClientContext *my_context; /* the context of journald itself */
         ClientContext *pid1_context; /* the context of PID 1 */
 };
diff --git a/src/test/test-prioq.c b/src/test/test-prioq.c
index bc5fdd15b2..53c9e090a7 100644
--- a/src/test/test-prioq.c
+++ b/src/test/test-prioq.c
@@ -69,6 +69,11 @@ static void test_struct(void) {
         assert_se(q = prioq_new((compare_func_t) test_compare));
         assert_se(s = set_new(&test_hash_ops));
 
+        assert_se(prioq_peek(q) == NULL);
+        assert_se(prioq_peek_by_index(q, 0) == NULL);
+        assert_se(prioq_peek_by_index(q, 1) == NULL);
+        assert_se(prioq_peek_by_index(q, (unsigned) -1) == NULL);
+
         for (i = 0; i < SET_SIZE; i++) {
                 assert_se(t = new0(struct test, 1));
                 t->value = (unsigned) rand();
@@ -79,6 +84,17 @@ static void test_struct(void) {
                         assert_se(set_consume(s, t) >= 0);
         }
 
+        for (i = 0; i < SET_SIZE; i++)
+                assert_se(prioq_peek_by_index(q, i));
+        assert_se(prioq_peek_by_index(q, SET_SIZE) == NULL);
+
+        unsigned count = 0;
+        PRIOQ_FOREACH_ITEM(q, t) {
+                assert_se(t);
+                count++;
+        }
+        assert_se(count == SET_SIZE);
+
         while ((t = set_steal_first(s))) {
                 assert_se(prioq_remove(q, t, &t->idx) == 1);
                 assert_se(prioq_remove(q, t, &t->idx) == 0);
diff --git a/src/test/test-procfs-util.c b/src/test/test-procfs-util.c
index 08af380cc7..1d0612985b 100644
--- a/src/test/test-procfs-util.c
+++ b/src/test/test-procfs-util.c
@@ -18,7 +18,7 @@ int main(int argc, char *argv[]) {
         assert_se(procfs_cpu_get_usage(&nsec) >= 0);
         log_info("Current system CPU time: %s", format_timespan(buf, sizeof(buf), nsec/NSEC_PER_USEC, 1));
 
-        assert_se(procfs_memory_get_current(&v) >= 0);
+        assert_se(procfs_memory_get_used(&v) >= 0);
         log_info("Current memory usage: %s", format_bytes(buf, sizeof(buf), v));
 
         assert_se(procfs_tasks_get_current(&v) >= 0);
author	Lennart Poettering <lennart@poettering.net>	2019-01-26 13:55:55 +0100
committer	GitHub <noreply@github.com>	2019-01-26 13:55:55 +0100
commit	67216ef8f2cb63482dc7ba184b8b7460cf4916f1 (patch)
tree	51daa2f3600cbf37d1423e8deff2b390d898840f
parent	b3f259d0566bdac1af2987dd316e312c0557edf9 (diff)
parent	91714a7f427a6c9c5c3be8b3819fee45050028f3 (diff)
download	systemd-67216ef8f2cb63482dc7ba184b8b7460cf4916f1.tar.gz