diff options
author | 杨博东 <bodong.ybd@alibaba-inc.com> | 2020-08-20 13:59:02 +0800 |
---|---|---|
committer | Oran Agra <oran@redislabs.com> | 2021-02-22 23:22:53 +0200 |
commit | e4ab38a3e440f945229fd9e4379b3f98aca2345e (patch) | |
tree | 9ed2b6012dafc2743a600c17c754de1326702e12 | |
parent | c86eabb0d03d068933d75fa67385d0f734abac6c (diff) | |
download | redis-e4ab38a3e440f945229fd9e4379b3f98aca2345e.tar.gz |
Fix flock cluster config may cause failure to restart after kill -9 (#7674)
After fork, the child process(redis-aof-rewrite) will get the fd opened
by the parent process(redis), when redis killed by kill -9, it will not
graceful exit(call prepareForShutdown()), so redis-aof-rewrite thread may still
alive, the fd(lock) will still be held by redis-aof-rewrite thread, and
redis restart will fail to get lock, means fail to start.
This issue was causing failures in the cluster tests in github actions.
Co-authored-by: Oran Agra <oran@redislabs.com>
(cherry picked from commit cbaf3c5bbafd43e009a2d6b38dd0e9fc450a3e12)
-rw-r--r-- | src/aof.c | 2 | ||||
-rw-r--r-- | src/cluster.c | 11 | ||||
-rw-r--r-- | src/rdb.c | 4 | ||||
-rw-r--r-- | src/scripting.c | 2 | ||||
-rw-r--r-- | src/server.c | 10 | ||||
-rw-r--r-- | src/server.h | 2 | ||||
-rw-r--r-- | tests/instances.tcl | 14 |
7 files changed, 35 insertions, 10 deletions
@@ -1578,7 +1578,7 @@ int rewriteAppendOnlyFileBackground(void) { char tmpfile[256]; /* Child */ - closeListeningSockets(0); + closeClildUnusedResourceAfterFork(); redisSetProcTitle("redis-aof-rewrite"); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); if (rewriteAppendOnlyFile(tmpfile) == C_OK) { diff --git a/src/cluster.c b/src/cluster.c index 3e19f88eb..f8e03534d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -418,7 +418,15 @@ int clusterLockConfig(char *filename) { return C_ERR; } /* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the - * lock to the file as long as the process exists. */ + * lock to the file as long as the process exists. + * + * After fork, the child process will get the fd opened by the parent process, + * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), + * it will be closed in the child process. + * If it is not closed, when the main process is killed -9, but the child process + * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the + * child process, and the main process will fail to get lock, means fail to start. */ + server.cluster_config_file_lock_fd = fd; #endif /* __sun */ return C_OK; @@ -468,6 +476,7 @@ void clusterInit(void) { /* Lock the cluster config file to make sure every node uses * its own nodes.conf. */ + server.cluster_config_file_lock_fd = -1; if (clusterLockConfig(server.cluster_configfile) == C_ERR) exit(1); @@ -1340,7 +1340,7 @@ int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) { int retval; /* Child */ - closeListeningSockets(0); + closeClildUnusedResourceAfterFork(); redisSetProcTitle("redis-rdb-bgsave"); retval = rdbSave(filename,rsi); if (retval == C_OK) { @@ -2350,7 +2350,7 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { rioInitWithFdset(&slave_sockets,fds,numfds); zfree(fds); - closeListeningSockets(0); + closeClildUnusedResourceAfterFork(); redisSetProcTitle("redis-rdb-to-slaves"); retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi); diff --git a/src/scripting.c b/src/scripting.c index 1ad781dda..50c728180 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1675,7 +1675,7 @@ int ldbStartSession(client *c) { * socket to make sure if the parent crashes a reset is sent * to the clients. */ serverLog(LL_WARNING,"Redis forked for debugging eval"); - closeListeningSockets(0); + closeClildUnusedResourceAfterFork(); } else { /* Parent */ listAddNodeTail(ldb.children,(void*)(unsigned long)cp); diff --git a/src/server.c b/src/server.c index 1d6de48a0..68fa089ac 100644 --- a/src/server.c +++ b/src/server.c @@ -4035,6 +4035,16 @@ void setupSignalHandlers(void) { return; } +/* After fork, the child process will inherit the resources + * of the parent process, e.g. fd(socket or flock) etc. + * should close the resources not used by the child process, so that if the + * parent restarts it can bind/lock despite the child possibly still running. */ +void closeClildUnusedResourceAfterFork() { + closeListeningSockets(0); + if (server.cluster_enabled && server.cluster_config_file_lock_fd != -1) + close(server.cluster_config_file_lock_fd); /* don't care if this fails */ +} + void memtest(size_t megabytes, int passes); /* Returns 1 if there is --sentinel among the arguments or if diff --git a/src/server.h b/src/server.h index e6379cd54..f60dfa24c 100644 --- a/src/server.h +++ b/src/server.h @@ -1260,6 +1260,7 @@ struct redisServer { to set in order to suppress certain native Redis Cluster features. Check the REDISMODULE_CLUSTER_FLAG_*. */ + int cluster_config_file_lock_fd; /* cluster config fd, will be flock */ /* Scripting */ lua_State *lua; /* The Lua interpreter. We use just one for all clients */ client *lua_client; /* The "fake client" to query Redis from Lua */ @@ -1757,6 +1758,7 @@ void populateCommandTable(void); void resetCommandTableStats(void); void adjustOpenFilesLimit(void); void closeListeningSockets(int unlink_unix_socket); +void closeClildUnusedResourceAfterFork(); void updateCachedTime(int update_daylight_info); void resetServerStats(void); void activeDefragCycle(void); diff --git a/tests/instances.tcl b/tests/instances.tcl index 352b2a2e5..8338d5741 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -84,7 +84,9 @@ proc spawn_instance {type base_port count {conf {}}} { # Check availability if {[server_is_up 127.0.0.1 $port 100] == 0} { - abort_sentinel_test "Problems starting $type #$j: ping timeout" + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile" } # Push the instance into the right list @@ -462,12 +464,12 @@ proc kill_instance {type id} { # Wait for the port it was using to be available again, so that's not # an issue to start a new server ASAP with the same port. - set retry 10 + set retry 100 while {[incr retry -1]} { - set port_is_free [catch {set s [socket 127.0.01 $port]}] + set port_is_free [catch {set s [socket 127.0.0.1 $port]}] if {$port_is_free} break catch {close $s} - after 1000 + after 100 } if {$retry == 0} { error "Port $port does not return available after killing instance." @@ -494,7 +496,9 @@ proc restart_instance {type id} { # Check that the instance is running if {[server_is_up 127.0.0.1 $port 100] == 0} { - abort_sentinel_test "Problems starting $type #$id: ping timeout" + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile" } # Connect with it with a fresh link |