diff options
author | Wenbin Zhu <wenbin.zhu@mongodb.com> | 2021-03-16 00:27:22 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-03-18 22:36:59 +0000 |
commit | 2cd5154b070f58191762979ad0014dfe78297b70 (patch) | |
tree | cbe6ec558e4c6fc9ad87e7fcdb71fe5cdae8ca75 | |
parent | 927b3a649c421bac2f58f617702c138a13dff85c (diff) | |
download | mongo-2cd5154b070f58191762979ad0014dfe78297b70.tar.gz |
SERVER-48179 Allow transition to SECONDARY at the end of rollback even it was changed to REMOVED.
(cherry picked from commit 0ae1138bbfc066c4c7eb9f857cf4e29447743a3c)
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 2 | ||||
-rw-r--r-- | jstests/replsets/reconfig_removes_node_in_rollback.js | 93 | ||||
-rw-r--r-- | src/mongo/db/repl/rollback_impl.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 7 |
4 files changed, 105 insertions, 3 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 58f351c0675..1fe400bfd16 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -123,6 +123,8 @@ all: test_file: jstests/sharding/mongos_helloOk_protocol.js - ticket: SERVER-50414 test_file: jstests/replsets/not_primary_errors_returned_during_rollback_if_helloOk.js + - ticket: SERVER-48179 + test_file: jstests/replsets/reconfig_removes_node_in_rollback.js suites: diff --git a/jstests/replsets/reconfig_removes_node_in_rollback.js b/jstests/replsets/reconfig_removes_node_in_rollback.js new file mode 100644 index 00000000000..eaca012332c --- /dev/null +++ b/jstests/replsets/reconfig_removes_node_in_rollback.js @@ -0,0 +1,93 @@ +/* + * Test that a node in rollback state can safely be removed from the replica set + * config via reconfig. See SERVER-48179. + * + * @tags: [requires_majority_read_concern] + */ +(function() { +"use strict"; + +load("jstests/replsets/libs/rollback_test.js"); +load('jstests/replsets/rslib.js'); + +const dbName = "test"; +const collName = "rollbackColl"; + +// RollbackTest begins with a 3 node replSet in steady-state replication. +let rollbackTest = new RollbackTest(jsTestName()); +let rollbackNode = rollbackTest.getPrimary(); +let secondTermPrimary = rollbackTest.getSecondary(); +let tieBreakerNode = rollbackTest.getTieBreaker(); + +// Isolate the current primary from the replica sets. Ops applied here will get rolled back. +rollbackTest.transitionToRollbackOperations(); +assert.commandWorked(rollbackNode.getDB(dbName)[collName].insert({"num1": 123})); + +// Elect the previous secondary as the new primary. +rollbackTest.transitionToSyncSourceOperationsBeforeRollback(); +assert.commandWorked(secondTermPrimary.getDB(dbName)[collName].insert({"num2": 123})); + +// Enable a failpoint to hang after transitioning to rollback mode. +const rollbackHangFailPoint = + configureFailPoint(rollbackNode, "rollbackHangAfterTransitionToRollback"); + +// Reconnect the isolated node and rollback should start. +rollbackTest.transitionToSyncSourceOperationsDuringRollback(); + +// The connection will be closed during rollback, so retries are needed here. +assert.soonNoExcept(() => { + rollbackHangFailPoint.wait(); + return true; +}, `failed to wait for fail point ${rollbackHangFailPoint.failPointName}`); + +// Enable a failpoint to hang after processing heartbeat reconfig, so we can +// verify that the old primary was successfully removed while rolling back. +const postHbReconfigFailPoint = + configureFailPoint(rollbackNode, "waitForPostActionCompleteInHbReconfig"); + +// RollbackTest stopped replication on tie breaker node, need to restart it. +// Otherwise the new config, which contains only the new primary and the tie +// breaker, cannot be majority committed. +restartServerReplication(tieBreakerNode); + +// Now, we'll remove the node going through rollback from the replica set. +// The new primary will then propagate the new config to the other nodes via heartbeat. +// When the node in rollback processes a heartbeat from the primary with a new config, +// it will learn it is removed and transition to the RS_REMOVED state. +let newConfig = rollbackTest.getTestFixture().getReplSetConfigFromNode(); +const initialMembers = newConfig.members; +newConfig.members = newConfig.members.filter((node) => node.host !== rollbackNode.host); +newConfig.version++; +assert.commandWorked(secondTermPrimary.adminCommand({replSetReconfig: newConfig})); +waitForConfigReplication(secondTermPrimary); + +assert.soonNoExcept(() => { + postHbReconfigFailPoint.wait(); + return true; +}, `failed to wait for fail point ${postHbReconfigFailPoint.failPointName}`); + +// Verify the rollback node is removed from replica set config. +assert.commandFailedWithCode(rollbackNode.adminCommand({replSetGetStatus: 1}), + ErrorCodes.InvalidReplicaSetConfig); + +// Now we disable the fail points, allowing the rollback to continue. +postHbReconfigFailPoint.off(); +rollbackHangFailPoint.off(); + +// Add the removed node back the replica set config, so that we can execute the last +// step of the RollbackTest to transition back to steady state. +newConfig = Object.assign({}, rollbackTest.getTestFixture().getReplSetConfigFromNode()); +newConfig.members = initialMembers; +newConfig.version++; + +assert.commandWorked(secondTermPrimary.adminCommand({replSetReconfig: newConfig})); +waitForConfigReplication(secondTermPrimary); + +// Verify the removed node is added back and primary sees its state as SECONDARY. +rollbackTest.getTestFixture().waitForState(rollbackNode, ReplSetTest.State.SECONDARY); + +// Transition back to steady state. +rollbackTest.transitionToSteadyStateOperations(); + +rollbackTest.stop(); +})(); diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp index 94f66713b4c..9adc6ec8a98 100644 --- a/src/mongo/db/repl/rollback_impl.cpp +++ b/src/mongo/db/repl/rollback_impl.cpp @@ -1211,8 +1211,12 @@ Status RollbackImpl::_triggerOpObserver(OperationContext* opCtx) { void RollbackImpl::_transitionFromRollbackToSecondary(OperationContext* opCtx) { invariant(opCtx); - invariant(_replicationCoordinator->getMemberState() == MemberState(MemberState::RS_ROLLBACK)); + // It is possible that this node has actually been removed due to a reconfig via + // heartbeat during rollback. But it should be fine to transition to SECONDARY + // and this won't change how the node reports its member state since topology + // coordinator will always check if the node exists in its local config when + // returning member state. LOGV2(21611, "Transition to SECONDARY"); ReplicationStateTransitionLockGuard transitionGuard(opCtx, MODE_X); diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index aa1943760bd..ea679f51ba0 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -903,8 +903,11 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( nextAction = HeartbeatResponseAction::makeReconfigAction(); nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); - // TODO(SERVER-48178) Only continue processing heartbeat in primary state to avoid - // concurrent reconfig and rollback. + // Only continue processing heartbeat in primary state. In other states it is not + // safe to continue processing heartbeat and should start reconfig right away. + // e.g. if this node was removed from replSet, _selfIndex is -1, and a following + // check on _selfIndex will keep retrying heartbeat to fetch new config, preventing + // the new config to be installed. if (_role != Role::kLeader) { return nextAction; } |