author     Wang Yuan <wangyuan21@baidu.com>    2021-08-30 15:24:53 +0800
committer  GitHub <noreply@github.com>         2021-08-30 10:24:53 +0300
commit     9a0c0617f10ebb6cd5bf82f5f4f7049e7ff0a3ce (patch)
tree       fbbf767d9291f4e0ef1e49064320246e9390cba9 /src/rio.c
parent     aefbc23451ae8d0c7503f2c3cb014bfd5b07b4ed (diff)
download   redis-9a0c0617f10ebb6cd5bf82f5f4f7049e7ff0a3ce.tar.gz
Use sync_file_range to optimize fsync if possible (#9409)
We implement incremental data sync in rio.c by calling fsync, which may cost a lot of time on a slow disk. sync_file_range can provide an asynchronous fsync, so we can serialize keys/values and sync file data at the same time.

> One tip for sync_file_range usage: http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01845.html

Additionally, this change avoids issuing one single large write, which can create a mass of dirty pages in the kernel (increasing the risk that someone else's write blocks).

On HDD, this roughly halves RDB dump time: dumping a 7.7GB RDB takes 50s with this PR versus 93s on the unstable branch. On NVMe SSD the gain is smaller: 40s with this PR versus 48s on unstable. Moreover, we found that syncing every 4MB performs better than every 32MB.
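To illustrate the pattern outside of rio.c, here is a minimal standalone sketch (not part of this commit) of the same sliding-window sync, assuming a Linux target; the CHUNK size, the /tmp/sfr_demo.bin path, and the write_with_incremental_sync helper are illustrative choices, not Redis names. Each chunk's writeout is started asynchronously with SYNC_FILE_RANGE_WRITE, and the chunk before it is waited on with SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER, which keeps serialization and disk writeback overlapped while bounding the dirty data in flight to roughly two chunks.

    #define _GNU_SOURCE           /* for sync_file_range() */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #define CHUNK (4*1024*1024)   /* 4MB sync window, per the commit message */

    /* Write 'len' bytes in CHUNK-sized pieces; after each chunk, start its
     * writeout asynchronously, then block until the chunk before it has
     * reached the disk. */
    static int write_with_incremental_sync(int fd, const char *buf, size_t len) {
        size_t done = 0;
        while (done < len) {
            size_t n = len - done < CHUNK ? len - done : CHUNK;
            /* For brevity this sketch treats short writes as errors. */
            if (write(fd, buf + done, n) != (ssize_t)n) return -1;
            done += n;

            /* Kick off writeout of the chunk just written (does not wait). */
            if (sync_file_range(fd, done - n, n, SYNC_FILE_RANGE_WRITE) == -1)
                return -1;

            /* Wait until the previous chunk is on disk; this is the call
             * that may block when the device is slow. */
            if (done >= 2*(size_t)CHUNK &&
                sync_file_range(fd, done - 2*(size_t)CHUNK, CHUNK,
                                SYNC_FILE_RANGE_WAIT_BEFORE |
                                SYNC_FILE_RANGE_WRITE |
                                SYNC_FILE_RANGE_WAIT_AFTER) == -1)
                return -1;
        }
        return 0;
    }

    int main(void) {
        static char buf[16*1024*1024];            /* 16MB of zeroes to dump */
        int fd = open("/tmp/sfr_demo.bin", O_WRONLY|O_CREAT|O_TRUNC, 0644);
        if (fd == -1) { perror("open"); return 1; }
        int rc = write_with_incremental_sync(fd, buf, sizeof(buf));
        if (rc != 0) perror("write_with_incremental_sync");
        close(fd);
        return rc ? 1 : 0;
    }

Note that sync_file_range() makes no guarantees about file metadata, so a final fsync/fdatasync is still needed for durability of the finished file; the patch below keeps redis_fsync as the fallback when sync_file_range is unavailable.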
Diffstat (limited to 'src/rio.c')
-rw-r--r--   src/rio.c   53
1 file changed, 43 insertions(+), 10 deletions(-)
diff --git a/src/rio.c b/src/rio.c
index 2234ec6e3..70817347b 100644
--- a/src/rio.c
+++ b/src/rio.c
@@ -108,19 +108,52 @@ void rioInitWithBuffer(rio *r, sds s) {
 
 /* Returns 1 or 0 for success/failure. */
 static size_t rioFileWrite(rio *r, const void *buf, size_t len) {
-    size_t retval;
+    if (!r->io.file.autosync) return fwrite(buf,len,1,r->io.file.fp);
 
-    retval = fwrite(buf,len,1,r->io.file.fp);
-    r->io.file.buffered += len;
+    size_t nwritten = 0;
+    /* Incrementally write data to the file, avoid a single write larger than
+     * the autosync threshold (so that the kernel's buffer cache never has too
+     * many dirty pages at once). */
+    while (len != nwritten) {
+        serverAssert(r->io.file.autosync > r->io.file.buffered);
+        size_t nalign = (size_t)(r->io.file.autosync - r->io.file.buffered);
+        size_t towrite = nalign > len-nwritten ? len-nwritten : nalign;
+
+        if (fwrite((char*)buf+nwritten,towrite,1,r->io.file.fp) == 0) return 0;
+        nwritten += towrite;
+        r->io.file.buffered += towrite;
 
-    if (r->io.file.autosync &&
-        r->io.file.buffered >= r->io.file.autosync)
-    {
-        fflush(r->io.file.fp);
-        if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
-        r->io.file.buffered = 0;
+        if (r->io.file.buffered >= r->io.file.autosync) {
+            fflush(r->io.file.fp);
+
+            size_t processed = r->processed_bytes + nwritten;
+            serverAssert(processed % r->io.file.autosync == 0);
+            serverAssert(r->io.file.buffered == r->io.file.autosync);
+
+#if HAVE_SYNC_FILE_RANGE
+            /* Start writeout asynchronously. */
+            if (sync_file_range(fileno(r->io.file.fp),
+                    processed - r->io.file.autosync, r->io.file.autosync,
+                    SYNC_FILE_RANGE_WRITE) == -1)
+                return 0;
+
+            if (processed >= (size_t)r->io.file.autosync * 2) {
+                /* To keep the promise to 'autosync', we should make sure last
+                 * asynchronous writeout persists into disk. This call may block
+                 * if last writeout is not finished since disk is slow. */
+                if (sync_file_range(fileno(r->io.file.fp),
+                        processed - r->io.file.autosync*2,
+                        r->io.file.autosync, SYNC_FILE_RANGE_WAIT_BEFORE|
+                        SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER) == -1)
+                    return 0;
+            }
+#else
+            if (redis_fsync(fileno(r->io.file.fp)) == -1) return 0;
+#endif
+            r->io.file.buffered = 0;
+        }
     }
-    return retval;
+    return 1;
 }
 
 /* Returns 1 or 0 for success/failure. */
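For context on where 'autosync' comes from: a caller opts in with rioSetAutoSync(). A sketch of typical use, modeled on how rdb.c enables incremental fsync (rioInitWithFile, rioSetAutoSync, and REDIS_AUTOSYNC_BYTES are Redis identifiers; the configuration guard shown is an assumption about the caller):

    rio rdb;
    rioInitWithFile(&rdb, fp);
    /* Flush and sync every REDIS_AUTOSYNC_BYTES of output, so that
     * rioFileWrite() above takes the incremental sync_file_range path. */
    if (server.rdb_save_incremental_fsync)
        rioSetAutoSync(&rdb, REDIS_AUTOSYNC_BYTES);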