summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorge Prekas <prekgeo@yahoo.com>2021-01-07 09:06:05 -0600
committerOran Agra <oran@redislabs.com>2021-02-22 23:22:53 +0200
commitddf81e2f15394a1b1014a9d2580758ab09054b82 (patch)
tree1a4fd323d81535c88367da0369ec701b46780f0e
parentb1242ce92b1e494182c1b562545acd168084dca1 (diff)
downloadredis-ddf81e2f15394a1b1014a9d2580758ab09054b82.tar.gz
Add check for the MADV_FREE/fork arm64 Linux kernel bug (#8224)
Older arm64 Linux kernels have a bug that could lead to data corruption during background save under the following scenario: 1) jemalloc uses MADV_FREE on a page, 2) jemalloc reuses and writes the page, 3) Redis forks the background save process, and 4) Linux performs page reclamation. Under these conditions, Linux will reclaim the page wrongfully and the background save process will read zeros when it tries to read the page. The bug has been fixed in Linux with commit: ff1712f953e27f0b0718762ec17d0adb15c9fd0b ("arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()"). This commit adds a startup check for the bug and an ignore-warnings config: when the bug is detected and ARM64-COW-BUG is not listed in ignore-warnings, Redis will print a warning and exit on startup (default behavior). Co-authored-by: Oran Agra <oran@redislabs.com> (cherry picked from commit b02780c41dbc5b28d265b5cf141c03c1a7383ef9)
-rw-r--r--redis.conf31
-rw-r--r--src/config.c4
-rw-r--r--src/server.c138
-rw-r--r--src/server.h2
4 files changed, 174 insertions, 1 deletions
diff --git a/redis.conf b/redis.conf
index d74ff98a5..197ba0d2f 100644
--- a/redis.conf
+++ b/redis.conf
@@ -1370,3 +1370,34 @@ rdb-save-incremental-fsync yes
# the main dictionary scan
# active-defrag-max-scan-fields 1000
+# It is possible to pin different threads and processes of Redis to specific
+# CPUs in your system, in order to maximize the performance of the server.
+# This is useful both in order to pin different Redis threads in different
+# CPUs, but also in order to make sure that multiple Redis instances running
+# in the same host will be pinned to different CPUs.
+#
+# Normally you can do this using the "taskset" command, however it is also
+# possible to do this via Redis configuration directly, both in Linux and FreeBSD.
+#
+# You can pin the server/IO threads, bio threads, aof rewrite child process, and
+# the bgsave child process. The syntax to specify the cpu list is the same as
+# the taskset command:
+#
+# Set redis server/io threads to cpu affinity 0,2,4,6:
+# server_cpulist 0-7:2
+#
+# Set bio threads to cpu affinity 1,3:
+# bio_cpulist 1,3
+#
+# Set aof rewrite child process to cpu affinity 8,9,10,11:
+# aof_rewrite_cpulist 8-11
+#
+# Set bgsave child process to cpu affinity 1,10,11
+# bgsave_cpulist 1,10-11
+
+# In some cases redis will emit warnings and even refuse to start if it detects
+# that the system is in a bad state. It is possible to suppress these warnings
+# by setting the following config, which takes a space-delimited list of
+# warnings to suppress.
+#
+# ignore-warnings ARM64-COW-BUG
diff --git a/src/config.c b/src/config.c
index ba8e17e51..5f22442ec 100644
--- a/src/config.c
+++ b/src/config.c
@@ -294,6 +294,9 @@ void loadServerConfigFromString(char *config) {
} else if (!strcasecmp(argv[0],"syslog-ident") && argc == 2) {
if (server.syslog_ident) zfree(server.syslog_ident);
server.syslog_ident = zstrdup(argv[1]);
+ } else if (!strcasecmp(argv[0],"ignore-warnings") && argc == 2) {
+ if (server.ignore_warnings) zfree(server.ignore_warnings);
+ server.ignore_warnings = zstrdup(argv[1]);
} else if (!strcasecmp(argv[0],"syslog-facility") && argc == 2) {
server.syslog_facility =
configEnumGetValue(syslog_facility_enum,argv[1]);
@@ -2135,6 +2138,7 @@ int rewriteConfig(char *path) {
rewriteConfigStringOption(state,"logfile",server.logfile,CONFIG_DEFAULT_LOGFILE);
rewriteConfigYesNoOption(state,"syslog-enabled",server.syslog_enabled,CONFIG_DEFAULT_SYSLOG_ENABLED);
rewriteConfigStringOption(state,"syslog-ident",server.syslog_ident,CONFIG_DEFAULT_SYSLOG_IDENT);
+ rewriteConfigStringOption(state,"ignore-warnings",server.ignore_warnings,CONFIG_DEFAULT_IGNORE_WARNINGS);
rewriteConfigSyslogfacilityOption(state);
rewriteConfigSaveOption(state);
rewriteConfigNumericalOption(state,"databases",server.dbnum,CONFIG_DEFAULT_DBNUM);
diff --git a/src/server.c b/src/server.c
index 03b4f03aa..5c1400510 100644
--- a/src/server.c
+++ b/src/server.c
@@ -56,6 +56,10 @@
#include <locale.h>
#include <sys/socket.h>
+#ifdef __linux__
+#include <sys/mman.h>
+#endif
+
/* Our shared "common" objects */
struct sharedObjectsStruct shared;
@@ -1577,6 +1581,7 @@ void initServerConfig(void) {
server.logfile = zstrdup(CONFIG_DEFAULT_LOGFILE);
server.syslog_enabled = CONFIG_DEFAULT_SYSLOG_ENABLED;
server.syslog_ident = zstrdup(CONFIG_DEFAULT_SYSLOG_IDENT);
+ server.ignore_warnings = zstrdup(CONFIG_DEFAULT_IGNORE_WARNINGS);
server.syslog_facility = LOG_LOCAL0;
server.daemonize = CONFIG_DEFAULT_DAEMONIZE;
server.supervised = 0;
@@ -3707,6 +3712,21 @@ void monitorCommand(client *c) {
/* =================================== Main! ================================ */
+/* Tell whether 'warning' has been listed in the "ignore-warnings" config.
+ * The configured string is tokenized with sdssplitargs() and every token is
+ * compared against 'warning' ignoring case.
+ * Return 1 on a match; 0 when nothing matches or when the split yields NULL. */
+int checkIgnoreWarning(const char *warning) {
+    int count, idx, found = 0;
+    sds *tokens = sdssplitargs(server.ignore_warnings, &count);
+
+    if (!tokens) return 0;
+
+    /* Stop at the first token that equals the requested warning name. */
+    for (idx = 0; idx < count && !found; idx++)
+        found = (strcasecmp(tokens[idx], warning) == 0);
+
+    sdsfreesplitres(tokens, count);
+    return found;
+}
+
#ifdef __linux__
int linuxOvercommitMemoryValue(void) {
FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
@@ -3730,6 +3750,113 @@ void linuxMemoryWarnings(void) {
serverLog(LL_WARNING,"WARNING you have Transparent Huge Pages (THP) support enabled in your kernel. This will create latency and memory usage issues with Redis. To fix this issue run the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' as root, and add it to your /etc/rc.local in order to retain the setting after a reboot. Redis must be restarted after THP is disabled.");
}
}
+
+#ifdef __arm64__
+
+/* Get size in kilobytes of the Shared_Dirty pages of the calling process for the
+ * memory map corresponding to the provided address.
+ *
+ * Return -1 on error, that is when /proc/self/smaps cannot be opened, when no
+ * mapping containing 'addr' reports a "Shared_Dirty:" field, or when that
+ * field cannot be parsed. Errors are reported to the caller instead of
+ * aborting, so that a missing or restricted procfs does not crash startup. */
+static int smapsGetSharedDirty(unsigned long addr) {
+    static const char field[] = "Shared_Dirty:";
+    unsigned long from, to;
+    int in_mapping = 0, val = -1;
+    char buf[64];
+    FILE *f;
+
+    f = fopen("/proc/self/smaps", "r");
+    if (!f) return -1;
+
+    while (fgets(buf, sizeof(buf), f)) {
+        /* A line starting with "from-to" opens the description of a new
+         * mapping: remember whether it is the one containing 'addr'. */
+        if (sscanf(buf, "%lx-%lx", &from, &to) == 2)
+            in_mapping = from <= addr && addr < to;
+
+        if (in_mapping && !strncmp(buf, field, sizeof(field) - 1)) {
+            /* The value follows the field name; if it cannot be parsed
+             * report an error rather than a bogus size. */
+            if (sscanf(buf, "%*s %d", &val) != 1) val = -1;
+            break;
+        }
+    }
+
+    fclose(f);
+    return val;
+}
+
+/* Older arm64 Linux kernels have a bug that could lead to data corruption
+ * during background save in certain scenarios. This function checks if the
+ * kernel is affected.
+ * The bug was fixed in commit ff1712f953e27f0b0718762ec17d0adb15c9fd0b
+ * titled: "arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()"
+ *
+ * The probe maps three pages, dirties the middle one, marks it MADV_FREE,
+ * dirties it again and then forks; the child reports through a pipe whether
+ * the page is still accounted as dirty.
+ *
+ * Return 1 if the kernel seems to be affected, and 0 otherwise (including the
+ * case where madvise(MADV_FREE) is rejected and the probe cannot be run). */
+int linuxMadvFreeForkBugCheck(void) {
+    int ret, pipefd[2];
+    pid_t pid;
+    char *p, *q, bug_found = 0;
+    /* Do not assume 4K pages: mprotect() and madvise() need page-aligned
+     * addresses, and arm64 kernels may be configured with larger pages. */
+    const long page_size = sysconf(_SC_PAGESIZE);
+    long map_size;
+
+    serverAssert(page_size > 0);
+    map_size = 3 * page_size;
+
+    /* Create a memory map that's in our full control (not one used by the allocator). */
+    p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    serverAssert(p != MAP_FAILED);
+
+    q = p + page_size;
+
+    /* Split the memory map in 3 pages by setting their protection as RO|RW|RO to prevent
+     * Linux from merging this memory map with adjacent VMAs. */
+    ret = mprotect(q, page_size, PROT_READ | PROT_WRITE);
+    serverAssert(!ret);
+
+    /* Write to the page once to make it resident */
+    *(volatile char*)q = 0;
+
+    /* Tell the kernel that this page is free to be reclaimed. */
+#ifndef MADV_FREE
+#define MADV_FREE 8
+#endif
+    ret = madvise(q, page_size, MADV_FREE);
+    if (ret != 0) {
+        /* The kernel refused MADV_FREE (kernels older than 4.5 do not
+         * implement it), so the scenario cannot be reproduced here: release
+         * the map and report "not affected" instead of aborting. */
+        ret = munmap(p, map_size);
+        serverAssert(!ret);
+        return 0;
+    }
+
+    /* Write to the page after being marked for freeing, this is supposed to take
+     * ownership of that page again. */
+    *(volatile char*)q = 0;
+
+    /* Create a pipe for the child to return the info to the parent. */
+    ret = pipe(pipefd);
+    serverAssert(!ret);
+
+    /* Fork the process. */
+    pid = fork();
+    serverAssert(pid >= 0);
+    if (!pid) {
+        /* Child: check if the page is marked as dirty, expecting one page
+         * worth of kB. A value of 0 means the kernel is affected by the bug;
+         * a negative value is not taken as evidence of the bug. */
+        if (!smapsGetSharedDirty((unsigned long)q))
+            bug_found = 1;
+
+        ret = write(pipefd[1], &bug_found, 1);
+        serverAssert(ret == 1);
+
+        /* Leave without running the atexit handlers or flushing the stdio
+         * buffers inherited from the parent. */
+        _exit(0);
+    } else {
+        /* Read the result from the child. */
+        ret = read(pipefd[0], &bug_found, 1);
+        serverAssert(ret == 1);
+
+        /* Reap the child pid. */
+        serverAssert(waitpid(pid, NULL, 0) == pid);
+    }
+
+    /* Cleanup */
+    ret = close(pipefd[0]);
+    serverAssert(!ret);
+    ret = close(pipefd[1]);
+    serverAssert(!ret);
+    ret = munmap(p, map_size);
+    serverAssert(!ret);
+
+    return bug_found;
+}
+#endif /* __arm64__ */
#endif /* __linux__ */
void createPidFile(void) {
@@ -4215,7 +4342,16 @@ int main(int argc, char **argv) {
serverLog(LL_WARNING,"Server initialized");
#ifdef __linux__
linuxMemoryWarnings();
- #endif
+ #if defined (__arm64__)
+ if (linuxMadvFreeForkBugCheck()) {
+ serverLog(LL_WARNING,"WARNING Your kernel has a bug that could lead to data corruption during background save. Please upgrade to the latest stable kernel.");
+ if (!checkIgnoreWarning("ARM64-COW-BUG")) {
+ serverLog(LL_WARNING,"Redis will now exit to prevent data corruption. Note that it is possible to suppress this warning by setting the following config: ignore-warnings ARM64-COW-BUG");
+ exit(1);
+ }
+ }
+ #endif /* __arm64__ */
+ #endif /* __linux__ */
moduleLoadFromQueue();
InitServerLast();
loadDataFromDisk();
diff --git a/src/server.h b/src/server.h
index a905c0e28..e6379cd54 100644
--- a/src/server.h
+++ b/src/server.h
@@ -114,6 +114,7 @@ typedef long long ustime_t; /* microsecond time type. */
#define CONFIG_BGSAVE_RETRY_DELAY 5 /* Wait a few secs before trying again. */
#define CONFIG_DEFAULT_PID_FILE "/var/run/redis.pid"
#define CONFIG_DEFAULT_SYSLOG_IDENT "redis"
+#define CONFIG_DEFAULT_IGNORE_WARNINGS ""
#define CONFIG_DEFAULT_CLUSTER_CONFIG_FILE "nodes.conf"
#define CONFIG_DEFAULT_CLUSTER_ANNOUNCE_IP NULL /* Auto detect. */
#define CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT 0 /* Use server.port */
@@ -965,6 +966,7 @@ struct redisServer {
int sentinel_mode; /* True if this instance is a Sentinel. */
size_t initial_memory_usage; /* Bytes used after initialization. */
int always_show_logo; /* Show logo even for non-stdout logging. */
+ char *ignore_warnings; /* Config: warnings that should be ignored. */
/* Modules */
dict *moduleapi; /* Exported core APIs dictionary for modules. */
dict *sharedapi; /* Like moduleapi but containing the APIs that