summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzhaozhao.zz <276441700@qq.com>2021-01-29 14:35:10 +0800
committerGitHub <noreply@github.com>2021-01-29 14:35:10 +0800
commit49b36633324074edd3e06c334d23562edc49159d (patch)
treef00b86007d129bf004aba2d12ee9216c73c98087
parent0d18a1e85fdb9967419dc9f7a5905452d065c84a (diff)
downloadredis-49b36633324074edd3e06c334d23562edc49159d.tar.gz
AOF: recover from last write error after turn on appendonly again (#8030)
The key point is how to recover from last AOF write error, for example: 1. start redis with appendonly yes, and append some write commands 2. short write or something else error happen, `server.aof_last_write_status` changed to `C_ERR`, now redis doesn't accept write commands 3. execute `CONFIG SET appendonly no` to avoid the above problem, now redis can accept write commands again 4. disk error resolved, and execute `CONFIG SET appendonly yes` to reopen AOF, but `server.aof_last_write_status` cannot be changed to `C_OK` (if background aof rewrite run less then 1 second, it will free `server.aof_buf` and then serverCron cannot fix `aof_last_write_status`), then redis cannot accept write commands forever. This PR use a simple way to fix it: 1. just free `server.aof_buf` when stop appendonly to save memory, if error happens in `flushAppendOnlyFile(1)`, the `server.aof_buf` may contains some data which has not be written to aof, I think we can ignore it because we turn off the appendonly. 2. reset fsync status after stop appendonly and call `flushAppendOnlyFile` only when `aof_state` is ON 3. reset `server.last_write_status` when reopen aof to accept write commands
-rw-r--r--src/aof.c19
-rw-r--r--src/server.c8
2 files changed, 22 insertions, 5 deletions
diff --git a/src/aof.c b/src/aof.c
index f07f7ca2e..6753e8bcc 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -235,6 +235,8 @@ void stopAppendOnly(void) {
serverAssert(server.aof_state != AOF_OFF);
flushAppendOnlyFile(1);
redis_fsync(server.aof_fd);
+ server.aof_fsync_offset = server.aof_current_size;
+ server.aof_last_fsync = server.unixtime;
close(server.aof_fd);
server.aof_fd = -1;
@@ -242,6 +244,8 @@ void stopAppendOnly(void) {
server.aof_state = AOF_OFF;
server.aof_rewrite_scheduled = 0;
killAppendOnlyChild();
+ sdsfree(server.aof_buf);
+ server.aof_buf = sdsempty();
}
/* Called when the user switches from "appendonly no" to "appendonly yes"
@@ -285,6 +289,12 @@ int startAppendOnly(void) {
server.aof_state = AOF_WAIT_REWRITE;
server.aof_last_fsync = server.unixtime;
server.aof_fd = newfd;
+
+ /* If AOF was in error state, we just ignore it and log the event. */
+ if (server.aof_last_write_status == C_ERR) {
+ serverLog(LL_WARNING,"AOF reopen, just ignore the last error.");
+ server.aof_last_write_status = C_OK;
+ }
return C_OK;
}
@@ -1855,18 +1865,22 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
close(newfd);
goto cleanup;
}
+ latencyEndMonitor(latency);
+ latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);
+
if (server.aof_fsync == AOF_FSYNC_EVERYSEC) {
aof_background_fsync(newfd);
} else if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
+ latencyStartMonitor(latency);
if (redis_fsync(newfd) == -1) {
serverLog(LL_WARNING,
"Error trying to fsync the parent diff to the rewritten AOF: %s", strerror(errno));
close(newfd);
goto cleanup;
}
+ latencyEndMonitor(latency);
+ latencyAddSampleIfNeeded("aof-rewrite-done-fsync",latency);
}
- latencyEndMonitor(latency);
- latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);
serverLog(LL_NOTICE,
"Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));
@@ -1938,6 +1952,7 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
aofUpdateCurrentSize();
server.aof_rewrite_base_size = server.aof_current_size;
server.aof_fsync_offset = server.aof_current_size;
+ server.aof_last_fsync = server.unixtime;
/* Clear regular AOF buffer since its contents was just written to
* the new AOF from the background rewrite buffer. */
diff --git a/src/server.c b/src/server.c
index 1166c4b3e..0a12e2e31 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2174,14 +2174,15 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
/* AOF postponed flush: Try at every cron cycle if the slow fsync
* completed. */
- if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
+ if (server.aof_state == AOF_ON && server.aof_flush_postponed_start)
+ flushAppendOnlyFile(0);
/* AOF write errors: in this case we have a buffer to flush as well and
* clear the AOF error in case of success to make the DB writable again,
* however to try every second is enough in case of 'hz' is set to
* a higher frequency. */
run_with_period(1000) {
- if (server.aof_last_write_status == C_ERR)
+ if (server.aof_state == AOF_ON && server.aof_last_write_status == C_ERR)
flushAppendOnlyFile(0);
}
@@ -2418,7 +2419,8 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
trackingBroadcastInvalidationMessages();
/* Write the AOF buffer on disk */
- flushAppendOnlyFile(0);
+ if (server.aof_state == AOF_ON)
+ flushAppendOnlyFile(0);
/* Handle writes with pending output buffers. */
handleClientsWithPendingWritesUsingThreads();