Use sync_file_range to optimize fsync if possible (#9409)

We implement incremental data sync in rio.c by calling fsync, which may take a long time on a slow disk. sync_file_range can start the writeout asynchronously, so we can serialize keys/values and sync file data at the same time. (A tip on sync_file_range usage: http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01845.html) Additionally, this change avoids issuing a single large write, which can leave a mass of dirty pages in the kernel (increasing the risk that someone else's write will block). On HDD, this solution roughly halves the RDB dump time: this PR takes 50s to dump a 7.7G RDB, while the unstable branch takes 93s. On NVMe SSD, this PR does not save much time: it takes 40s, while the unstable branch takes 48s. Moreover, I find that syncing data every 4MB works better than every 32MB.
author: Wang Yuan <wangyuan21@baidu.com> 2021-08-30 15:24:53 +0800
committer: GitHub <noreply@github.com> 2021-08-30 10:24:53 +0300
commit: 9a0c0617f10ebb6cd5bf82f5f4f7049e7ff0a3ce (patch)
tree: fbbf767d9291f4e0ef1e49064320246e9390cba9
parent: aefbc23451ae8d0c7503f2c3cb014bfd5b07b4ed (diff)
download: redis-9a0c0617f10ebb6cd5bf82f5f4f7049e7ff0a3ce.tar.gz
4 files changed, 47 insertions, 13 deletions
diff --git a/redis.conf b/redis.conf
index 84f45fdb7..09107dcea 100644
--- a/redis.conf
+++ b/redis.conf
@@ -1881,13 +1881,13 @@ hz 10
 dynamic-hz yes
 
 # When a child rewrites the AOF file, if the following option is enabled
-# the file will be fsync-ed every 32 MB of data generated. This is useful
+# the file will be fsync-ed every 4 MB of data generated. This is useful
 # in order to commit the file to the disk more incrementally and avoid
 # big latency spikes.
 aof-rewrite-incremental-fsync yes
 
 # When redis saves RDB file, if the following option is enabled
-# the file will be fsync-ed every 32 MB of data generated. This is useful
+# the file will be fsync-ed every 4 MB of data generated. This is useful
 # in order to commit the file to the disk more incrementally and avoid
 # big latency spikes.
 rdb-save-incremental-fsync yes
diff --git a/src/config.h b/src/config.h
index 4700e7208..5a4223fbd 100644
--- a/src/config.h
+++ b/src/config.h
@@ -120,6 +120,7 @@
 /* Define rdb_fsync_range to sync_file_range() on Linux, otherwise we use
  * the plain fsync() call. */
 #if (defined(__linux__) && defined(SYNC_FILE_RANGE_WAIT_BEFORE))
+#define HAVE_SYNC_FILE_RANGE 1
 #define rdb_fsync_range(fd,off,size) sync_file_range(fd,off,size,SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE)
 #else
 #define rdb_fsync_range(fd,off,size) fsync(fd)
diff --git a/src/rio.c b/src/rio.c
index 2234ec6e3..70817347b 100644
--- a/src/rio.c
+++ b/src/rio.c
@@ -108,19 +108,52 @@ void rioInitWithBuffer(rio *r, sds s) {
 
 /* Returns 1 or 0 for success/failure. */
 static size_t rioFileWrite(rio *r, const void *buf, size_t len) {
-    size_t retval;
+    if (!r->io.file.autosync) return fwrite(buf,len,1,r->io.file.fp);
 
-    retval = fwrite(buf,len,1,r->io.file.fp);
-    r->io.file.buffered += len;
+    size_t nwritten = 0;
+    /* Incrementally write data to the file, avoid a single write larger than
+     * the autosync threshold (so that the kernel's buffer cache never has too
+     * many dirty pages at once). */
+    while (len != nwritten) {
+        serverAssert(r->io.file.autosync > r->io.file.buffered);
+        size_t nalign = (size_t)(r->io.file.autosync - r->io.file.buffered);
+        size_t towrite = nalign > len-nwritten ? len-nwritten : nalign;
+
+        if (fwrite((char*)buf+nwritten,towrite,1,r->io.file.fp) == 0) return 0;
+        nwritten += towrite;
+        r->io.file.buffered += towrite;
+
+        if (r->io.file.buffered >= r->io.file.autosync) {
+            fflush(r->io.file.fp);
+
+            size_t processed = r->processed_bytes + nwritten;
+            serverAssert(processed % r->io.file.autosync == 0);
+            serverAssert(r->io.file.buffered == r->io.file.autosync);
+
+#if HAVE_SYNC_FILE_RANGE
+            /* Start writeout asynchronously. */
+            if (sync_file_range(fileno(r->io.file.fp),
+                    processed - r->io.file.autosync, r->io.file.autosync,
+                    SYNC_FILE_RANGE_WRITE) == -1)
+                return 0;
 
-    if (r->io.file.autosync &&
-        r->io.file.buffered >= r->io.file.autosync)
-    {
-        fflush(r->io.file.fp);
-        if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
-        r->io.file.buffered = 0;
+            if (processed >= (size_t)r->io.file.autosync * 2) {
+                /* To keep the promise to 'autosync', we should make sure last
+                 * asynchronous writeout persists into disk. This call may block
+                 * if last writeout is not finished since disk is slow. */
+                if (sync_file_range(fileno(r->io.file.fp),
+                        processed - r->io.file.autosync*2,
+                        r->io.file.autosync, SYNC_FILE_RANGE_WAIT_BEFORE|
+                        SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER) == -1)
+                    return 0;
+            }
+#else
+            if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
+#endif
+            r->io.file.buffered = 0;
+        }
     }
-    return retval;
+    return 1;
 }
 
 /* Returns 1 or 0 for success/failure. */
diff --git a/src/server.h b/src/server.h
index 177525aa4..1d7a84615 100644
--- a/src/server.h
+++ b/src/server.h
@@ -151,7 +151,7 @@ typedef long long ustime_t; /* microsecond time type. */
 #define PROTO_MBULK_BIG_ARG     (1024*32)
 #define PROTO_RESIZE_THRESHOLD  (1024*32) /* Threshold for determining whether to resize query buffer */
 #define LONG_STR_SIZE      21          /* Bytes needed for long -> str + '\0' */
-#define REDIS_AUTOSYNC_BYTES (1024*1024*32) /* fdatasync every 32MB */
+#define REDIS_AUTOSYNC_BYTES (1024*1024*4) /* Sync file every 4MB. */
 
 #define LIMIT_PENDING_QUERYBUF (4*1024*1024) /* 4mb */
author	Wang Yuan <wangyuan21@baidu.com>	2021-08-30 15:24:53 +0800
committer	GitHub <noreply@github.com>	2021-08-30 10:24:53 +0300
commit	9a0c0617f10ebb6cd5bf82f5f4f7049e7ff0a3ce (patch)
tree	fbbf767d9291f4e0ef1e49064320246e9390cba9
parent	aefbc23451ae8d0c7503f2c3cb014bfd5b07b4ed (diff)
download	redis-9a0c0617f10ebb6cd5bf82f5f4f7049e7ff0a3ce.tar.gz