summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author杨博东 <bodong.ybd@alibaba-inc.com>2020-08-20 13:59:02 +0800
committerOran Agra <oran@redislabs.com>2021-02-22 23:22:53 +0200
commite4ab38a3e440f945229fd9e4379b3f98aca2345e (patch)
tree9ed2b6012dafc2743a600c17c754de1326702e12
parentc86eabb0d03d068933d75fa67385d0f734abac6c (diff)
downloadredis-e4ab38a3e440f945229fd9e4379b3f98aca2345e.tar.gz
Fix flock cluster config may cause failure to restart after kill -9 (#7674)
After fork, the child process(redis-aof-rewrite) will get the fd opened by the parent process(redis), when redis killed by kill -9, it will not graceful exit(call prepareForShutdown()), so redis-aof-rewrite thread may still alive, the fd(lock) will still be held by redis-aof-rewrite thread, and redis restart will fail to get lock, means fail to start. This issue was causing failures in the cluster tests in github actions. Co-authored-by: Oran Agra <oran@redislabs.com> (cherry picked from commit cbaf3c5bbafd43e009a2d6b38dd0e9fc450a3e12)
-rw-r--r--src/aof.c2
-rw-r--r--src/cluster.c11
-rw-r--r--src/rdb.c4
-rw-r--r--src/scripting.c2
-rw-r--r--src/server.c10
-rw-r--r--src/server.h2
-rw-r--r--tests/instances.tcl14
7 files changed, 35 insertions, 10 deletions
diff --git a/src/aof.c b/src/aof.c
index e70c635fa..80a5f63e2 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -1578,7 +1578,7 @@ int rewriteAppendOnlyFileBackground(void) {
char tmpfile[256];
/* Child */
- closeListeningSockets(0);
+ closeClildUnusedResourceAfterFork();
redisSetProcTitle("redis-aof-rewrite");
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
diff --git a/src/cluster.c b/src/cluster.c
index 3e19f88eb..f8e03534d 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -418,7 +418,15 @@ int clusterLockConfig(char *filename) {
return C_ERR;
}
/* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the
- * lock to the file as long as the process exists. */
+ * lock to the file as long as the process exists.
+ *
+ * After fork, the child process will get the fd opened by the parent process,
+ * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(),
+ * it will be closed in the child process.
+ * If it is not closed, when the main process is killed -9, but the child process
+ * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the
+ * child process, and the main process will fail to get lock, means fail to start. */
+ server.cluster_config_file_lock_fd = fd;
#endif /* __sun */
return C_OK;
@@ -468,6 +476,7 @@ void clusterInit(void) {
/* Lock the cluster config file to make sure every node uses
* its own nodes.conf. */
+ server.cluster_config_file_lock_fd = -1;
if (clusterLockConfig(server.cluster_configfile) == C_ERR)
exit(1);
diff --git a/src/rdb.c b/src/rdb.c
index a1e966eae..afbbd8ca4 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -1340,7 +1340,7 @@ int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
int retval;
/* Child */
- closeListeningSockets(0);
+ closeClildUnusedResourceAfterFork();
redisSetProcTitle("redis-rdb-bgsave");
retval = rdbSave(filename,rsi);
if (retval == C_OK) {
@@ -2350,7 +2350,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
rioInitWithFdset(&slave_sockets,fds,numfds);
zfree(fds);
- closeListeningSockets(0);
+ closeClildUnusedResourceAfterFork();
redisSetProcTitle("redis-rdb-to-slaves");
retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi);
diff --git a/src/scripting.c b/src/scripting.c
index 1ad781dda..50c728180 100644
--- a/src/scripting.c
+++ b/src/scripting.c
@@ -1675,7 +1675,7 @@ int ldbStartSession(client *c) {
* socket to make sure if the parent crashes a reset is sent
* to the clients. */
serverLog(LL_WARNING,"Redis forked for debugging eval");
- closeListeningSockets(0);
+ closeClildUnusedResourceAfterFork();
} else {
/* Parent */
listAddNodeTail(ldb.children,(void*)(unsigned long)cp);
diff --git a/src/server.c b/src/server.c
index 1d6de48a0..68fa089ac 100644
--- a/src/server.c
+++ b/src/server.c
@@ -4035,6 +4035,16 @@ void setupSignalHandlers(void) {
return;
}
+/* After fork, the child process will inherit the resources
+ * of the parent process, e.g. fd(socket or flock) etc.
+ * should close the resources not used by the child process, so that if the
+ * parent restarts it can bind/lock despite the child possibly still running. */
+void closeClildUnusedResourceAfterFork() {
+ closeListeningSockets(0);
+ if (server.cluster_enabled && server.cluster_config_file_lock_fd != -1)
+ close(server.cluster_config_file_lock_fd); /* don't care if this fails */
+}
+
void memtest(size_t megabytes, int passes);
/* Returns 1 if there is --sentinel among the arguments or if
diff --git a/src/server.h b/src/server.h
index e6379cd54..f60dfa24c 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1260,6 +1260,7 @@ struct redisServer {
to set in order to suppress certain
native Redis Cluster features. Check the
REDISMODULE_CLUSTER_FLAG_*. */
+ int cluster_config_file_lock_fd; /* cluster config fd, will be flock */
/* Scripting */
lua_State *lua; /* The Lua interpreter. We use just one for all clients */
client *lua_client; /* The "fake client" to query Redis from Lua */
@@ -1757,6 +1758,7 @@ void populateCommandTable(void);
void resetCommandTableStats(void);
void adjustOpenFilesLimit(void);
void closeListeningSockets(int unlink_unix_socket);
+void closeClildUnusedResourceAfterFork();
void updateCachedTime(int update_daylight_info);
void resetServerStats(void);
void activeDefragCycle(void);
diff --git a/tests/instances.tcl b/tests/instances.tcl
index 352b2a2e5..8338d5741 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -84,7 +84,9 @@ proc spawn_instance {type base_port count {conf {}}} {
# Check availability
if {[server_is_up 127.0.0.1 $port 100] == 0} {
- abort_sentinel_test "Problems starting $type #$j: ping timeout"
+ set logfile [file join $dirname log.txt]
+ puts [exec tail $logfile]
+ abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile"
}
# Push the instance into the right list
@@ -462,12 +464,12 @@ proc kill_instance {type id} {
# Wait for the port it was using to be available again, so that's not
# an issue to start a new server ASAP with the same port.
- set retry 10
+ set retry 100
while {[incr retry -1]} {
- set port_is_free [catch {set s [socket 127.0.01 $port]}]
+ set port_is_free [catch {set s [socket 127.0.0.1 $port]}]
if {$port_is_free} break
catch {close $s}
- after 1000
+ after 100
}
if {$retry == 0} {
error "Port $port does not return available after killing instance."
@@ -494,7 +496,9 @@ proc restart_instance {type id} {
# Check that the instance is running
if {[server_is_up 127.0.0.1 $port 100] == 0} {
- abort_sentinel_test "Problems starting $type #$id: ping timeout"
+ set logfile [file join $dirname log.txt]
+ puts [exec tail $logfile]
+ abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
}
# Connect with it with a fresh link