summaryrefslogtreecommitdiff
path: root/src/zmalloc.c
diff options
context:
space:
mode:
authorWang Yuan <wangyuan21@baidu.com>2021-08-05 04:01:46 +0800
committerGitHub <noreply@github.com>2021-08-04 23:01:46 +0300
commitd4bca53cd9879e0296bfa0a7c17df79dd52496ae (patch)
treea95ebc8a6afe01c030b259144919e9a6898597db /src/zmalloc.c
parent56eb7f7de407c66b479005d3179fb36002099f4d (diff)
downloadredis-d4bca53cd9879e0296bfa0a7c17df79dd52496ae.tar.gz
Use madvise(MADV_DONTNEED) to release memory to reduce COW (#8974)
## Background As we know, after `fork`, one process will copy pages when writing data to them (CoW) while the other process still keeps the old pages, so together they cost more memory. For redis, we found that it consumed a lot of memory while the fork child was serializing keys/values, which could even cause OOM. But actually, we find that the redis fork child process doesn't need to keep some of the memory that the parent process may write or update; for example, the child process will never again access a key-value that has already been serialized, but users may update it in the parent process. So we think CoW may be reduced if the child process releases the memory it no longer needs. ## Implementation To release a key-value in the child process, we might think of calling `decrRefCount` to free the memory, but I found that the fork child process still uses a lot of memory even when we don't write any data to redis, and it takes much more time, which slows down bgsave. This may be because the memory allocator doesn't really release memory to the OS, and it may modify some internal data for the free operation, especially when we free small objects. Moreover, CoW is page-based, so the easy way is to only free memory chunks that are not smaller than the kernel page size. madvise(MADV_DONTNEED) can quickly release the pages of a specified region to the OS, bypassing the memory allocator; the allocator still considers this memory to be in use and doesn't change its internal data. There are some buffers we can release in the fork child process: - **Serialized key-values** The fork child process never accesses serialized key-values again, so we try to free them. Because we can only release large chunks of memory, and it is time-consuming to iterate over all items/members/fields/entries of a complex data type, we decide to iterate over them and try to release them only when the average size of an item/member/field/entry is larger than the OS page size.
- **Replication backlog** Because the replication backlog is a circular buffer, it changes quickly if redis has heavy write traffic, but the fork child process doesn't need to access it. - **Client buffers** If clients send requests while the fork child process exists, the clients' buffers are also changed frequently. This memory includes the client query buffer, the output buffer, and the memory used by the client struct. To get the child process's peak private dirty memory, we need to track the peak memory instead of the last used memory, because the child process may continue to release memory (since COW used to only grow until now, the last value was equivalent to the peak). Also we're adding a new `current_cow_peak` info variable (to complement the existing `current_cow_size`) Co-authored-by: Oran Agra <oran@redislabs.com>
Diffstat (limited to 'src/zmalloc.c')
-rw-r--r--src/zmalloc.c34
1 files changed, 33 insertions, 1 deletions
diff --git a/src/zmalloc.c b/src/zmalloc.c
index 3645efcf1..012dadd2f 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -28,12 +28,20 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
+#include "fmacros.h"
+#include "config.h"
+#include "solarisfixes.h"
+
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>
+#ifdef __linux__
+#include <sys/mman.h>
+#endif
+
/* This function provide us access to the original libc free(). This is useful
* for instance to free results obtained by backtrace_symbols(). We need
* to define this function before including zmalloc.h that may shadow the
@@ -44,7 +52,6 @@ void zlibc_free(void *ptr) {
#include <string.h>
#include <pthread.h>
-#include "config.h"
#include "zmalloc.h"
#include "atomicvar.h"
@@ -335,6 +342,31 @@ void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) {
zmalloc_oom_handler = oom_handler;
}
+/* Use 'MADV_DONTNEED' to release the pages backing the allocation at 'ptr'
+ * to the operating system quickly, bypassing the memory allocator.
+ * We do that in a fork child process to avoid CoW when the parent modifies
+ * these shared pages.
+ *
+ * Only whole pages that lie entirely inside the allocation are released: the
+ * start is rounded up to a page boundary and the length is rounded down to a
+ * multiple of the page size. Allocations that do not contain at least one
+ * such page are left untouched. The call is best-effort, so the return value
+ * of madvise() is intentionally ignored.
+ *
+ * This is a no-op unless built with jemalloc on Linux: <sys/mman.h>, which
+ * declares madvise() and MADV_DONTNEED, is only included under __linux__, so
+ * the implementation must be guarded by the same condition. */
+void zmadvise_dontneed(void *ptr) {
+#if defined(USE_JEMALLOC) && defined(__linux__)
+    /* The page size is queried once and cached for subsequent calls. */
+    static size_t page_size = 0;
+    if (page_size == 0) page_size = sysconf(_SC_PAGESIZE);
+    size_t page_size_mask = page_size - 1;
+
+    size_t real_size = zmalloc_size(ptr);
+    if (real_size < page_size) return;
+
+    /* We need to align the pointer upwards according to page size, because
+     * the memory address is increased upwards and we only can free memory
+     * based on page. */
+    char *aligned_ptr = (char *)(((uintptr_t)ptr+page_size_mask) & ~(uintptr_t)page_size_mask);
+    /* Discount the bytes skipped at the head of the allocation by the
+     * alignment; what remains may no longer hold a full page. */
+    real_size -= (aligned_ptr-(char*)ptr);
+    if (real_size >= page_size) {
+        madvise((void *)aligned_ptr, real_size&~page_size_mask, MADV_DONTNEED);
+    }
+#else
+    (void)(ptr);
+#endif
+}
+
/* Get the RSS information in an OS-specific way.
*
* WARNING: the function zmalloc_get_rss() is not designed to be fast