summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChen-Yu Tsai <wens@csie.org>2017-12-13 13:12:44 +0800
committerdormando <dormando@rydia.net>2018-07-05 22:18:29 -0700
commit94ca8d4c4cb076c82e46d005437dd5295f38e7c6 (patch)
treefc1f27441e6fe115174a90881363534366946807
parent06b1e7b796f2ceef60176809ff1b64f6b3fd222b (diff)
downloadmemcached-94ca8d4c4cb076c82e46d005437dd5295f38e7c6.tar.gz
support transparent hugepages on Linux
Linux has supported transparent huge pages for quite some time. Memory regions can be marked for conversion to huge pages with madvise. Alternatively, users can have the system default to using huge pages for all memory regions when applicable, i.e. when the mapped region is large enough, the properly aligned pages will be converted. Using either method, we would preallocate memory for the cache with proper alignment, and call madvise on it. Whether the memory region actually gets converted to hugepages ultimately depends on the setting of /sys/kernel/mm/transparent_hugepage/enabled. The existence of this file is also checked to see if transparent huge pages support is compiled into the kernel. If any step of the preallocation fails, we simply fall back to standard allocation, without even preallocating slabs, as they would not have the proper alignment or settings anyway.
-rw-r--r--memcached.c10
-rw-r--r--slabs.c67
2 files changed, 76 insertions, 1 deletion
diff --git a/memcached.c b/memcached.c
index 42c5854..2f8f89c 100644
--- a/memcached.c
+++ b/memcached.c
@@ -6432,6 +6432,16 @@ static int enable_large_pages(void) {
}
return ret;
+#elif defined(__linux__) && defined(MADV_HUGEPAGE)
+ /* check if transparent hugepages is compiled into the kernel */
+ struct stat st;
+ int ret = stat("/sys/kernel/mm/transparent_hugepage/enabled", &st);
+ if (ret || !(st.st_mode & S_IFREG)) {
+ fprintf(stderr, "Transparent huge pages support not detected.\n");
+ fprintf(stderr, "Will use default page size.\n");
+ return -1;
+ }
+ return 0;
#else
return -1;
#endif
diff --git a/slabs.c b/slabs.c
index 200d575..ad76d81 100644
--- a/slabs.c
+++ b/slabs.c
@@ -8,6 +8,7 @@
* memcached protocol.
*/
#include "memcached.h"
+#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/resource.h>
@@ -98,6 +99,57 @@ unsigned int slabs_clsid(const size_t size) {
return res;
}
#if defined(__linux__) && defined(MADV_HUGEPAGE)
/* Function split out for better error path handling.
 *
 * Tries to allocate `limit` bytes aligned to the kernel's huge page
 * size and marks the region with MADV_HUGEPAGE so the kernel may back
 * it with transparent huge pages.  Returns the aligned chunk on
 * success, or NULL on any failure so the caller can fall back to a
 * plain allocation. */
static void * alloc_large_chunk_linux(const size_t limit)
{
    size_t pagesize = 0;
    void *ptr = NULL;
    FILE *fp;
    int ret;

    /* Discover the huge page size from /proc/meminfo; the
     * "Hugepagesize:" line reports it in KiB. */
    fp = fopen("/proc/meminfo", "r");
    if (fp != NULL) {
        char buf[64];

        while ((fgets(buf, sizeof(buf), fp))) {
            if (!strncmp(buf, "Hugepagesize:", 13)) {
                /* Only trust pagesize if the parse succeeded; on a
                 * malformed line it stays 0 and we bail out below. */
                if (sscanf(buf + 13, "%zu", &pagesize) == 1) {
                    /* meminfo huge page size is in KiBs */
                    pagesize <<= 10;
                }
                /* There is only one Hugepagesize line; stop scanning. */
                break;
            }
        }
        fclose(fp);
    }

    if (!pagesize) {
        fprintf(stderr, "Failed to get supported huge page size\n");
        return NULL;
    }

    if (settings.verbose > 1)
        fprintf(stderr, "huge page size: %zu\n", pagesize);

    /* This works because glibc simply uses mmap when the alignment is
     * above a certain limit. */
    ret = posix_memalign(&ptr, pagesize, limit);
    if (ret != 0) {
        fprintf(stderr, "Failed to get aligned memory chunk: %d\n", ret);
        return NULL;
    }

    /* Hint the kernel to convert this region to huge pages; whether it
     * actually happens depends on the system THP setting. */
    ret = madvise(ptr, limit, MADV_HUGEPAGE);
    if (ret < 0) {
        fprintf(stderr, "Failed to set transparent hugepage hint: %d\n", ret);
        free(ptr);
        ptr = NULL;
    }

    return ptr;
}
#endif
+
/**
* Determines the chunk sizes and initializes the slab class descriptors
* accordingly.
@@ -106,11 +158,24 @@ void slabs_init(const size_t limit, const double factor, const bool prealloc, co
int i = POWER_SMALLEST - 1;
unsigned int size = sizeof(item) + settings.chunk_size;
+ /* Some platforms use runtime transparent hugepages. If for any reason
+ * the initial allocation fails, the required settings do not persist
+ * for remaining allocations. As such it makes little sense to do slab
+ * preallocation. */
+ bool __attribute__ ((unused)) do_slab_prealloc = false;
+
mem_limit = limit;
if (prealloc) {
+#if defined(__linux__) && defined(MADV_HUGEPAGE)
+ mem_base = alloc_large_chunk_linux(mem_limit);
+ if (mem_base)
+ do_slab_prealloc = true;
+#else
/* Allocate everything in a big chunk with malloc */
mem_base = malloc(mem_limit);
+ do_slab_prealloc = true;
+#endif
if (mem_base != NULL) {
mem_current = mem_base;
mem_avail = mem_limit;
@@ -161,7 +226,7 @@ void slabs_init(const size_t limit, const double factor, const bool prealloc, co
}
- if (prealloc) {
+ if (prealloc && do_slab_prealloc) {
slabs_preallocate(power_largest);
}
}