diskless master, avoid bgsave child hung when fork parent crashes (#11463)

During a diskless sync, if the master main process crashes, the child would have hung in `write`. This fix closes the read fd on the child side, so that if the parent crashes, the child will get a write error and exit. This change also fixes disk-based replication, BGSAVE and AOFRW. In those cases the child wouldn't have hung; it would have just kept running until done, which may be pointless. There is a certain degree of risk here: in case there's a BGSAVE child that could maybe succeed and the parent dies for some reason, the old code would have let the child keep running and maybe succeed and avoid data loss. On the other hand, if the parent is restarted, it would have loaded an old rdb file (or none), and then the child could reach the end and rename the rdb file (data conflicting with what the parent has), or also have a race with another BGSAVE child that the new parent started. Note that I removed a comment saying a write error will be ignored in the child and handled by the parent (this comment was very old and I don't think it is relevant).
author: Oran Agra <oran@redislabs.com> 2022-11-09 10:02:18 +0200
committer: GitHub <noreply@github.com> 2022-11-09 10:02:18 +0200
commit: ccaef5c923a14dc183c50530f52ada0fda012179 (patch)
tree: f8af010a6549d77db8d90465fbff65459a557d51 /tests/integration
parent: f92899185367ab08a084c501ba54759390c92e63 (diff)
download: redis-ccaef5c923a14dc183c50530f52ada0fda012179.tar.gz
1 files changed, 40 insertions, 1 deletions
diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl
index 153aa8620..617b9f78e 100644
--- a/tests/integration/replication.tcl
+++ b/tests/integration/replication.tcl
@@ -992,7 +992,7 @@ test "diskless replication child being killed is collected" {
             # wait for the replicas to start reading the rdb
             wait_for_log_messages 0 {"*Loading DB in memory*"} $loglines 800 10
 
-            # wait to be sure the eplica is hung and the master is blocked on write
+            # wait to be sure the replica is hung and the master is blocked on write
             after 500
 
             # simulate the OOM killer or anyone else kills the child
@@ -1012,6 +1012,45 @@ test "diskless replication child being killed is collected" {
     }
 } {} {external:skip}
 
+foreach mdl {yes no} {
+    test "replication dies when parent is killed - diskless: $mdl" {
+        # when master is killed, make sure the fork child can detect that and exit
+        start_server {tags {"repl"}} {
+            set master [srv 0 client]
+            set master_host [srv 0 host]
+            set master_port [srv 0 port]
+            set master_pid [srv 0 pid]
+            $master config set repl-diskless-sync $mdl
+            $master config set repl-diskless-sync-delay 0
+            # create keys that will take 10 seconds to save
+            $master config set rdb-key-save-delay 1000
+            $master debug populate 10000
+            start_server {} {
+                set replica [srv 0 client]
+                $replica replicaof $master_host $master_port
+
+                # wait for rdb child to start
+                wait_for_condition 5000 10 {
+                    [s -1 rdb_bgsave_in_progress] == 1
+                } else {
+                    fail "rdb child didn't start"
+                }
+                set fork_child_pid [get_child_pid -1]
+
+                # simulate the OOM killer or anyone else kills the parent
+                exec kill -9 $master_pid
+
+                # wait for the child to notice the parent died and exit
+                wait_for_condition 500 10 {
+                    [process_is_alive $fork_child_pid] == 0
+                } else {
+                    fail "rdb child didn't terminate"
+                }
+            }
+        }
+    } {} {external:skip}
+}
+
 test "diskless replication read pipe cleanup" {
     # In diskless replication, we create a read pipe for the RDB, between the child and the parent.
     # When we close this pipe (fd), the read handler also needs to be removed from the event loop (if it still registered).
author	Oran Agra <oran@redislabs.com>	2022-11-09 10:02:18 +0200
committer	GitHub <noreply@github.com>	2022-11-09 10:02:18 +0200
commit	ccaef5c923a14dc183c50530f52ada0fda012179 (patch)
tree	f8af010a6549d77db8d90465fbff65459a557d51 /tests/integration
parent	f92899185367ab08a084c501ba54759390c92e63 (diff)
download	redis-ccaef5c923a14dc183c50530f52ada0fda012179.tar.gz