From e81bd15e992362062e2d60b614407667431cf248 Mon Sep 17 00:00:00 2001 From: Madelyn Olson <34459052+madolson@users.noreply.github.com> Date: Wed, 30 Mar 2022 22:15:00 -0700 Subject: Prevent replica failover during manual takeover test (#10499) During 11-manual-takeover.tcl, if killing the primary instances happens too slowly, one of the replicas might be able to promote itself. I'm not sure why it was slow, but it was observed taking 6 seconds, which is enough time to hold an election. I was able to verify the error locally by adding a small delay (1 second) during ASAN CI. The fix is to disable automatic replica failover (cluster-replica-no-failover yes on replicas 5, 6 and 7) until all of the killed primaries are confirmed dead, and then re-enable it. --- tests/cluster/tests/11-manual-takeover.tcl | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'tests/cluster') diff --git a/tests/cluster/tests/11-manual-takeover.tcl b/tests/cluster/tests/11-manual-takeover.tcl index f567c6962..78a0f858b 100644 --- a/tests/cluster/tests/11-manual-takeover.tcl +++ b/tests/cluster/tests/11-manual-takeover.tcl @@ -14,20 +14,32 @@ test "Cluster is writable" { cluster_write_test 0 } +# For this test, disable replica failover until +# all of the primaries are confirmed killed. Otherwise +# there might be enough time to elect a replica. 
+set replica_ids { 5 6 7 } +foreach id $replica_ids { + R $id config set cluster-replica-no-failover yes +} + test "Killing majority of master nodes" { kill_instance redis 0 kill_instance redis 1 kill_instance redis 2 } +foreach id $replica_ids { + R $id config set cluster-replica-no-failover no +} + test "Cluster should eventually be down" { assert_cluster_state fail } test "Use takeover to bring slaves back" { - R 5 cluster failover takeover - R 6 cluster failover takeover - R 7 cluster failover takeover + foreach id $replica_ids { + R $id cluster failover takeover + } } test "Cluster should eventually be up again" { @@ -39,9 +51,9 @@ test "Cluster is writable" { } test "Instance #5, #6, #7 are now masters" { - assert {[RI 5 role] eq {master}} - assert {[RI 6 role] eq {master}} - assert {[RI 7 role] eq {master}} + foreach id $replica_ids { + assert {[RI $id role] eq {master}} + } } test "Restarting the previously killed master nodes" { -- cgit v1.2.1