summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWenbin Zhu <wenbin.zhu@mongodb.com>2021-03-16 00:27:22 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-03-18 22:36:59 +0000
commit2cd5154b070f58191762979ad0014dfe78297b70 (patch)
treecbe6ec558e4c6fc9ad87e7fcdb71fe5cdae8ca75
parent927b3a649c421bac2f58f617702c138a13dff85c (diff)
downloadmongo-2cd5154b070f58191762979ad0014dfe78297b70.tar.gz
SERVER-48179 Allow transition to SECONDARY at the end of rollback even it was changed to REMOVED.
(cherry picked from commit 0ae1138bbfc066c4c7eb9f857cf4e29447743a3c)
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/reconfig_removes_node_in_rollback.js93
-rw-r--r--src/mongo/db/repl/rollback_impl.cpp6
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp7
4 files changed, 105 insertions, 3 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 58f351c0675..1fe400bfd16 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -123,6 +123,8 @@ all:
test_file: jstests/sharding/mongos_helloOk_protocol.js
- ticket: SERVER-50414
test_file: jstests/replsets/not_primary_errors_returned_during_rollback_if_helloOk.js
+ - ticket: SERVER-48179
+ test_file: jstests/replsets/reconfig_removes_node_in_rollback.js
suites:
diff --git a/jstests/replsets/reconfig_removes_node_in_rollback.js b/jstests/replsets/reconfig_removes_node_in_rollback.js
new file mode 100644
index 00000000000..eaca012332c
--- /dev/null
+++ b/jstests/replsets/reconfig_removes_node_in_rollback.js
@@ -0,0 +1,93 @@
+/*
+ * Test that a node in rollback state can safely be removed from the replica set
+ * config via reconfig. See SERVER-48179.
+ *
+ * @tags: [requires_majority_read_concern]
+ */
+(function() {
+"use strict";
+
+load("jstests/replsets/libs/rollback_test.js");
+load('jstests/replsets/rslib.js');
+
+const dbName = "test";
+const collName = "rollbackColl";
+
+// RollbackTest begins with a 3 node replSet in steady-state replication.
+let rollbackTest = new RollbackTest(jsTestName());
+let rollbackNode = rollbackTest.getPrimary();
+let secondTermPrimary = rollbackTest.getSecondary();
+let tieBreakerNode = rollbackTest.getTieBreaker();
+
+// Isolate the current primary from the replica sets. Ops applied here will get rolled back.
+rollbackTest.transitionToRollbackOperations();
+assert.commandWorked(rollbackNode.getDB(dbName)[collName].insert({"num1": 123}));
+
+// Elect the previous secondary as the new primary.
+rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+assert.commandWorked(secondTermPrimary.getDB(dbName)[collName].insert({"num2": 123}));
+
+// Enable a failpoint to hang after transitioning to rollback mode.
+const rollbackHangFailPoint =
+ configureFailPoint(rollbackNode, "rollbackHangAfterTransitionToRollback");
+
+// Reconnect the isolated node and rollback should start.
+rollbackTest.transitionToSyncSourceOperationsDuringRollback();
+
+// The connection will be closed during rollback, so retries are needed here.
+assert.soonNoExcept(() => {
+ rollbackHangFailPoint.wait();
+ return true;
+}, `failed to wait for fail point ${rollbackHangFailPoint.failPointName}`);
+
+// Enable a failpoint to hang after processing heartbeat reconfig, so we can
+// verify that the old primary was successfully removed while rolling back.
+const postHbReconfigFailPoint =
+ configureFailPoint(rollbackNode, "waitForPostActionCompleteInHbReconfig");
+
+// RollbackTest stopped replication on tie breaker node, need to restart it.
+// Otherwise the new config, which contains only the new primary and the tie
+// breaker, cannot be majority committed.
+restartServerReplication(tieBreakerNode);
+
+// Now, we'll remove the node going through rollback from the replica set.
+// The new primary will then propagate the new config to the other nodes via heartbeat.
+// When the node in rollback processes a heartbeat from the primary with a new config,
+// it will learn it is removed and transition to the RS_REMOVED state.
+let newConfig = rollbackTest.getTestFixture().getReplSetConfigFromNode();
+const initialMembers = newConfig.members;
+newConfig.members = newConfig.members.filter((node) => node.host !== rollbackNode.host);
+newConfig.version++;
+assert.commandWorked(secondTermPrimary.adminCommand({replSetReconfig: newConfig}));
+waitForConfigReplication(secondTermPrimary);
+
+assert.soonNoExcept(() => {
+ postHbReconfigFailPoint.wait();
+ return true;
+}, `failed to wait for fail point ${postHbReconfigFailPoint.failPointName}`);
+
+// Verify the rollback node is removed from replica set config.
+assert.commandFailedWithCode(rollbackNode.adminCommand({replSetGetStatus: 1}),
+ ErrorCodes.InvalidReplicaSetConfig);
+
+// Now we disable the fail points, allowing the rollback to continue.
+postHbReconfigFailPoint.off();
+rollbackHangFailPoint.off();
+
+// Add the removed node back the replica set config, so that we can execute the last
+// step of the RollbackTest to transition back to steady state.
+newConfig = Object.assign({}, rollbackTest.getTestFixture().getReplSetConfigFromNode());
+newConfig.members = initialMembers;
+newConfig.version++;
+
+assert.commandWorked(secondTermPrimary.adminCommand({replSetReconfig: newConfig}));
+waitForConfigReplication(secondTermPrimary);
+
+// Verify the removed node is added back and primary sees its state as SECONDARY.
+rollbackTest.getTestFixture().waitForState(rollbackNode, ReplSetTest.State.SECONDARY);
+
+// Transition back to steady state.
+rollbackTest.transitionToSteadyStateOperations();
+
+rollbackTest.stop();
+})();
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index 94f66713b4c..9adc6ec8a98 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -1211,8 +1211,12 @@ Status RollbackImpl::_triggerOpObserver(OperationContext* opCtx) {
void RollbackImpl::_transitionFromRollbackToSecondary(OperationContext* opCtx) {
invariant(opCtx);
- invariant(_replicationCoordinator->getMemberState() == MemberState(MemberState::RS_ROLLBACK));
+ // It is possible that this node has actually been removed due to a reconfig via
+ // heartbeat during rollback. But it should be fine to transition to SECONDARY
+ // and this won't change how the node reports its member state since topology
+ // coordinator will always check if the node exists in its local config when
+ // returning member state.
LOGV2(21611, "Transition to SECONDARY");
ReplicationStateTransitionLockGuard transitionGuard(opCtx, MODE_X);
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index aa1943760bd..ea679f51ba0 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -903,8 +903,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse(
nextAction = HeartbeatResponseAction::makeReconfigAction();
nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
- // TODO(SERVER-48178) Only continue processing heartbeat in primary state to avoid
- // concurrent reconfig and rollback.
+ // Only continue processing heartbeat in primary state. In other states it is not
+ // safe to continue processing heartbeat and should start reconfig right away.
+ // e.g. if this node was removed from replSet, _selfIndex is -1, and a following
+ // check on _selfIndex will keep retrying heartbeat to fetch new config, preventing
+ // the new config to be installed.
if (_role != Role::kLeader) {
return nextAction;
}