path: root/src/coredump/coredump.c
author     Luca Boccassi <luca.boccassi@microsoft.com>  2021-03-12 20:17:09 +0000
committer  Luca Boccassi <luca.boccassi@microsoft.com>  2021-06-08 14:05:56 +0100
commit     587f2a5e564cf434c2e0a653f52b8f73e86092d8 (patch)
tree       95d74d99a9e10e95ca19cdcb7f4e124ede36b57d /src/coredump/coredump.c
parent     5b6f8e13ad26673e29d7aaad010d14261f247c89 (diff)
download   systemd-587f2a5e564cf434c2e0a653f52b8f73e86092d8.tar.gz
coredump: check cgroups memory limit if storing on tmpfs
When /var/lib/systemd/coredump/ is backed by a tmpfs, all disk usage will be accounted under the systemd-coredump process cgroup memory limit. If MemoryMax is set, this might cause systemd-coredump to be terminated by the kernel oom handler when writing large uncompressed core files, even if the compressed core would fit within the limits.

Detect if a tmpfs is used, and if so check MemoryMax from the process and slice cgroups, and do not write uncompressed core files that are greater than half the available memory. If the limit is breached, stop writing and compress the written chunk immediately, then delete the uncompressed chunk to free more memory, and resume compressing directly from STDIN.

Example debug log when this situation happens:

systemd-coredump[737455]: Setting max_size to limit writes to 51344896 bytes.
systemd-coredump[737455]: ZSTD compression finished (51344896 -> 3260 bytes, 0.0%)
systemd-coredump[737455]: ZSTD compression finished (1022786048 -> 47245 bytes, 0.0%)
systemd-coredump[737455]: Process 737445 (a.out) of user 1000 dumped core.
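To make the new sizing logic easier to follow, below is a minimal, self-contained sketch of the limit calculation this patch adds to save_external_coredump() for the tmpfs-plus-compression case. The function name clamp_core_write_limit() is a hypothetical stand-in, the open-coded arithmetic replaces the MIN/MAX/LESS_BY macro calls from the actual diff, and memory_available corresponds to the MemoryAvailable D-Bus property queried from PID 1 (UINT64_MAX when the query fails):

#include <stdint.h>
#include <sys/statvfs.h>

/* Lower bound from the patch: never limit core writes to less than 4MB. */
#define PROCESS_SIZE_MIN (4U*1024U*1024U)

/* Illustrative helper (not part of the commit): clamp the uncompressed write
 * budget when /var/lib/systemd/coredump/ is on tmpfs and compression is on. */
static uint64_t clamp_core_write_limit(uint64_t max_size, uint64_t memory_available) {
        struct statvfs sv;

        /* Never write more, uncompressed, than the memory left to the unit. */
        if (memory_available < max_size)
                max_size = memory_available;

        /* Reserve ~1KB for compression metadata, then halve the budget so the
         * compressed copy can coexist with the uncompressed data in memory. */
        max_size = (max_size > 1024U ? max_size - 1024U : 0) / 2;

        /* Impose the 4MB floor so small cores can always be written. */
        if (max_size < PROCESS_SIZE_MIN)
                max_size = PROCESS_SIZE_MIN;

        /* tmpfs might simply be full; cap by the free space if we can stat it. */
        if (statvfs("/var/lib/systemd/coredump/", &sv) >= 0) {
                uint64_t free_bytes = (uint64_t) sv.f_frsize * (uint64_t) sv.f_bfree;
                if (free_bytes < max_size)
                        max_size = free_bytes;
        }

        return max_size;
}

If copy_bytes() still hits this limit, the patch compresses what was written so far, unlinks the uncompressed temporary file, and continues compress_stream() directly from the incoming STDIN fd, so the compressed core file can still be stored in full.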
Diffstat (limited to 'src/coredump/coredump.c')
-rw-r--r--  src/coredump/coredump.c  194
1 file changed, 130 insertions, 64 deletions
diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c
index b75a7c39ce..c6c232c7e7 100644
--- a/src/coredump/coredump.c
+++ b/src/coredump/coredump.c
@@ -3,6 +3,7 @@
#include <errno.h>
#include <stdio.h>
#include <sys/prctl.h>
+#include <sys/statvfs.h>
#include <sys/xattr.h>
#include <unistd.h>
@@ -18,6 +19,7 @@
#include "acl-util.h"
#include "alloc-util.h"
+#include "bus-error.h"
#include "capability-util.h"
#include "cgroup-util.h"
#include "compress.h"
@@ -42,6 +44,7 @@
#include "socket-util.h"
#include "special.h"
#include "stacktrace.h"
+#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
@@ -63,6 +66,10 @@
#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
#endif
+/* When checking for available memory and setting lower limits, don't
+ * go below 4MB for writing core files to storage. */
+#define PROCESS_SIZE_MIN (4U*1024U*1024U)
+
/* Make sure to not make this larger than the maximum journal entry
* size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
@@ -329,11 +336,14 @@ static int save_external_coredump(
int *ret_node_fd,
int *ret_data_fd,
uint64_t *ret_size,
+ uint64_t *ret_compressed_size,
bool *ret_truncated) {
- _cleanup_free_ char *fn = NULL, *tmp = NULL;
+ _cleanup_(unlink_and_freep) char *tmp = NULL;
+ _cleanup_free_ char *fn = NULL;
_cleanup_close_ int fd = -1;
uint64_t rlimit, process_limit, max_size;
+ bool truncated, storage_on_tmpfs;
struct stat st;
uid_t uid;
int r;
@@ -343,6 +353,8 @@ static int save_external_coredump(
assert(ret_node_fd);
assert(ret_data_fd);
assert(ret_size);
+ assert(ret_compressed_size);
+ assert(ret_truncated);
r = parse_uid(context->meta[META_ARGV_UID], &uid);
if (r < 0)
@@ -379,92 +391,145 @@ static int save_external_coredump(
if (fd < 0)
return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
- r = copy_bytes(input_fd, fd, max_size, 0);
- if (r < 0) {
- log_error_errno(r, "Cannot store coredump of %s (%s): %m",
- context->meta[META_ARGV_PID], context->meta[META_COMM]);
- goto fail;
- }
- *ret_truncated = r == 1;
- if (*ret_truncated)
- log_struct(LOG_INFO,
- LOG_MESSAGE("Core file was truncated to %zu bytes.", max_size),
- "SIZE_LIMIT=%zu", max_size,
- "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
+ /* If storage is on tmpfs, the kernel oom killer might kill us if there's MemoryMax set on
+ * the service or the slice it belongs to. This is common on low-resource systems,
+ * to keep crashing processes from taking away too many system resources.
+ * Check the cgroup settings, and set max_size to a bit less than half of the
+ * available memory left to the process.
+ * Then, attempt to write the core file uncompressed first - if the write gets
+ * interrupted, we know we won't be able to write it all, so instead compress what
+ * was written so far, delete the uncompressed truncated core, and then continue
+ * compressing from STDIN. Given the compressed core cannot be larger than the
+ * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
+ * should be able to at least store the full compressed core file. */
+
+ storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
+ if (storage_on_tmpfs && arg_compress) {
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ uint64_t cgroup_limit = UINT64_MAX;
+ struct statvfs sv;
+
+ /* If we can't get the cgroup limit, don't fail; just ignore it and
+ * fall back to the configured settings. */
+ r = sd_bus_default_system(&bus);
+ if (r < 0)
+ log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
+ else {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+
+ r = sd_bus_get_property_trivial(
+ bus,
+ "org.freedesktop.systemd1",
+ "/org/freedesktop/systemd1/unit/self",
+ "org.freedesktop.systemd1.Service",
+ "MemoryAvailable",
+ &error,
+ 't', &cgroup_limit);
+ if (r < 0)
+ log_warning_errno(r,
+ "Failed to query MemoryAvailable for current unit, "
+ "falling back to static config settings: %s",
+ bus_error_message(&error, r));
+ }
- if (fstat(fd, &st) < 0) {
- log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
- goto fail;
- }
+ max_size = MIN(cgroup_limit, max_size);
+ max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
+ max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */
+
+ /* tmpfs might get full quickly, so check the available space too.
+ * But don't worry about errors here; a failure to access the storage
+ * location will be logged more usefully when we actually write to it. */
+ if (statvfs("/var/lib/systemd/coredump/", &sv) >= 0)
+ max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
- if (lseek(fd, 0, SEEK_SET) == (off_t) -1) {
- log_error_errno(errno, "Failed to seek on %s: %m", coredump_tmpfile_name(tmp));
- goto fail;
+ log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
}
-#if HAVE_COMPRESSION
- /* If we will remove the coredump anyway, do not compress. */
- if (arg_compress && !maybe_remove_external_coredump(NULL, st.st_size)) {
+ r = copy_bytes(input_fd, fd, max_size, 0);
+ if (r < 0)
+ return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
+ context->meta[META_ARGV_PID], context->meta[META_COMM]);
+ truncated = r == 1;
- _cleanup_free_ char *fn_compressed = NULL, *tmp_compressed = NULL;
+#if HAVE_COMPRESSION
+ if (arg_compress) {
+ _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
+ _cleanup_free_ char *fn_compressed = NULL;
_cleanup_close_ int fd_compressed = -1;
+ uint64_t uncompressed_size = 0;
+
+ if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
+ return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
fn_compressed = strjoin(fn, COMPRESSED_EXT);
- if (!fn_compressed) {
- log_oom();
- goto uncompressed;
- }
+ if (!fn_compressed)
+ return log_oom();
fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
- if (fd_compressed < 0) {
- log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
- goto uncompressed;
- }
+ if (fd_compressed < 0)
+ return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
- r = compress_stream(fd, fd_compressed, -1);
- if (r < 0) {
- log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
- goto fail_compressed;
+ r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
+ if (r < 0)
+ return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
+
+ if (truncated && storage_on_tmpfs) {
+ uint64_t partial_uncompressed_size = 0;
+
+ /* Uncompressed write was truncated and we are writing to tmpfs: delete
+ * the uncompressed core, and compress the remaining part from STDIN. */
+
+ tmp = unlink_and_free(tmp);
+ fd = safe_close(fd);
+
+ r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
+ if (r < 0)
+ return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
+ uncompressed_size += partial_uncompressed_size;
}
r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, uid);
if (r < 0)
- goto fail_compressed;
+ return r;
- /* OK, this worked, we can get rid of the uncompressed version now */
- if (tmp)
- unlink_noerrno(tmp);
+ if (fstat(fd_compressed, &st) < 0)
+ return log_error_errno(errno,
+ "Failed to fstat core file %s: %m",
+ coredump_tmpfile_name(tmp_compressed));
- *ret_filename = TAKE_PTR(fn_compressed); /* compressed */
- *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
- *ret_data_fd = TAKE_FD(fd); /* uncompressed */
- *ret_size = (uint64_t) st.st_size; /* uncompressed */
+ *ret_filename = TAKE_PTR(fn_compressed); /* compressed */
+ *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
+ *ret_compressed_size = (uint64_t) st.st_size; /* compressed */
+ *ret_data_fd = TAKE_FD(fd);
+ *ret_size = uncompressed_size;
+ *ret_truncated = truncated;
+ tmp_compressed = mfree(tmp_compressed);
return 0;
-
- fail_compressed:
- if (tmp_compressed)
- (void) unlink(tmp_compressed);
}
-
-uncompressed:
#endif
+ if (truncated)
+ log_struct(LOG_INFO,
+ LOG_MESSAGE("Core file was truncated to %zu bytes.", max_size),
+ "SIZE_LIMIT=%zu", max_size,
+ "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
+
r = fix_permissions(fd, tmp, fn, context, uid);
if (r < 0)
- goto fail;
+ return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
+
+ if (fstat(fd, &st) < 0)
+ return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
+
+ if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
+ return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
- *ret_filename = TAKE_PTR(fn);
*ret_data_fd = TAKE_FD(fd);
- *ret_node_fd = -1;
*ret_size = (uint64_t) st.st_size;
+ *ret_truncated = truncated;
return 0;
-
-fail:
- if (tmp)
- (void) unlink(tmp);
- return r;
}
static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
@@ -709,7 +774,7 @@ static int submit_coredump(
_cleanup_free_ char *stacktrace = NULL;
char *core_message;
const char *module_name;
- uint64_t coredump_size = UINT64_MAX;
+ uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
bool truncated = false;
JsonVariant *module_json;
int r;
@@ -722,7 +787,8 @@ static int submit_coredump(
/* Always stream the coredump to disk, if that's possible */
r = save_external_coredump(context, input_fd,
- &filename, &coredump_node_fd, &coredump_fd, &coredump_size, &truncated);
+ &filename, &coredump_node_fd, &coredump_fd,
+ &coredump_size, &coredump_compressed_size, &truncated);
if (r < 0)
/* Skip whole core dumping part */
goto log;
@@ -730,7 +796,7 @@ static int submit_coredump(
/* If we don't want to keep the coredump on disk, remove it now, as later on we
* will lack the privileges for it. However, we keep the fd to it, so that we can
* still process it and log it. */
- r = maybe_remove_external_coredump(filename, coredump_size);
+ r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
if (r < 0)
return r;
if (r == 0) {
@@ -738,7 +804,7 @@ static int submit_coredump(
} else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
- coredump_size, arg_external_size_max);
+ coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
/* Vacuum again, but exclude the coredump we just created */
(void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
@@ -758,7 +824,7 @@ static int submit_coredump(
log_debug("Not generating stack trace: core size %"PRIu64" is greater "
"than %"PRIu64" (the configured maximum)",
coredump_size, arg_process_size_max);
- } else
+ } else if (coredump_fd >= 0)
coredump_parse_core(coredump_fd, context->meta[META_EXE], &stacktrace, &json_metadata);
#endif
@@ -812,7 +878,7 @@ log:
}
/* Optionally store the entire coredump in the journal */
- if (arg_storage == COREDUMP_STORAGE_JOURNAL) {
+ if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
if (coredump_size <= arg_journal_size_max) {
size_t sz = 0;