diff options
author | William Schultz <william.schultz@mongodb.com> | 2016-11-01 14:47:07 -0400 |
---|---|---|
committer | William Schultz <william.schultz@mongodb.com> | 2016-11-14 15:40:12 -0500 |
commit | 627f25d2e64078a6de32116aa496ffc3c461ec67 (patch) | |
tree | 1bf837681b0e13a389ae7d467ca14f9ee6a43c73 | |
parent | 8b07dc8933006c20de3111eae5face52320ab45a (diff) | |
download | mongo-627f25d2e64078a6de32116aa496ffc3c461ec67.tar.gz |
SERVER-26747 replSetStepDown waits for caught up majority with electable secondary
-rw-r--r-- | jstests/replsets/stepdown_needs_electable_secondary.js | 146 | ||||
-rw-r--r-- | jstests/replsets/stepdown_wrt_electable.js | 40 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_test.cpp | 198 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.h | 20 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.cpp | 42 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.h | 5 |
7 files changed, 363 insertions(+), 91 deletions(-)
diff --git a/jstests/replsets/stepdown_needs_electable_secondary.js b/jstests/replsets/stepdown_needs_electable_secondary.js new file mode 100644 index 00000000000..88490ee803f --- /dev/null +++ b/jstests/replsets/stepdown_needs_electable_secondary.js @@ -0,0 +1,146 @@ +/** + * Test to ensure that replSetStepDown called on a primary will only succeed if a majority of nodes + * are caught up to it and that at least one node in this majority is electable. Tests this with a + * 5 node replica set. + * + * 1. Initiate a 5-node replica set + * 2. Disable replication to all secondaries + * 3. Execute some writes on primary + * 4. Try to step down primary and expect to fail + * 5. Enable replication to one unelectable secondary, secondary B + * 6. Await replication to secondary B by executing primary write with writeConcern:2 + * 7. Try to step down primary and expect failure + * 8. Enable replication to a different unelectable secondary, secondary C + * 9. Await replication to secondary C by executing primary write with writeConcern:3 + * 10. Try to step down primary and expect failure + * 11. Enable replication to an electable secondary, secondary A + * 12. Await replication to secondary A by executing primary write with writeConcern:4 + * 13. Try to step down primary and expect success + * 14. 
Assert that original primary is now a secondary + * + */ +(function() { + 'use strict'; + var name = 'stepdown_needs_electable_secondary'; + + var replTest = new ReplSetTest({name: name, nodes: 5}); + var nodes = replTest.nodeList(); + + replTest.startSet(); + replTest.initiate({ + "_id": name, + "members": [ + {"_id": 0, "host": nodes[0]}, + {"_id": 1, "host": nodes[1]}, + {"_id": 2, "host": nodes[2]}, + {"_id": 3, "host": nodes[3], "priority": 0}, // unelectable + {"_id": 4, "host": nodes[4], "priority": 0} // unelectable + ] + }); + + /* Disable all incoming writes to a node (secondary) */ + function disableReplicationToNode(node) { + assert.commandWorked(node.getDB('admin').runCommand( + {configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}), + 'Failed to enable rsSyncApplyStop failpoint.'); + } + + /* Re-enable all incoming writes to a node (secondary) */ + function enableReplicationToNode(node) { + assert.commandWorked( + node.getDB('admin').runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}), + 'Failed to disable rsSyncApplyStop failpoint.'); + } + + function assertStepDownFailsWithExceededTimeLimit(node) { + assert.commandFailedWithCode( + node.getDB("admin").runCommand({replSetStepDown: 5, secondaryCatchUpPeriodSecs: 5}), + ErrorCodes.ExceededTimeLimit, + "step down did not fail with 'ExceededTimeLimit'"); + } + + function assertStepDownSucceeds(node) { + assert.throws(function() { + node.adminCommand({replSetStepDown: 60, secondaryCatchUpPeriodSecs: 60}); + }); + } + + var primary = replTest.getPrimary(); + + jsTestLog("Blocking writes to all secondaries."); + replTest.liveNodes.slaves.forEach(disableReplicationToNode); + + jsTestLog("Doing a write to primary."); + var testDB = replTest.getPrimary().getDB('testdb'); + var coll = testDB.stepdown_needs_electable_secondary; + assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 1}})); + + // Try to step down with only the primary caught up (1 node out of 5). 
+ // stepDown should fail. + jsTestLog("Trying to step down primary with only 1 node out of 5 caught up."); + assertStepDownFailsWithExceededTimeLimit(primary); + + // Get the two unelectable secondaries + var secondaryB_unelectable = replTest.nodes[3]; + var secondaryC_unelectable = replTest.nodes[4]; + + // Get an electable secondary + var secondaryA_electable = replTest.getSecondaries().find(function(s) { + var nodeId = replTest.getNodeId(s); + return (nodeId !== 3 && nodeId !== 4); // nodes 3 and 4 are set to be unelectable + }); + + // Enable writes to Secondary B (unelectable). Await replication. + // (2 out of 5 nodes caught up, 0 electable) + // stepDown should fail due to no caught up majority. + jsTestLog("Re-enabling writes to unelectable secondary: node #" + + replTest.getNodeId(secondaryB_unelectable) + ", " + secondaryB_unelectable); + enableReplicationToNode(secondaryB_unelectable); + + // Wait for this secondary to catch up by issuing a write that must be replicated to 2 nodes + assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 2}})); + + // Try to step down and fail + jsTestLog("Trying to step down primary with only 2 nodes out of 5 caught up."); + assertStepDownFailsWithExceededTimeLimit(primary); + + // Enable writes to Secondary C (unelectable). Await replication. + // (3 out of 5 nodes caught up, 0 electable) + // stepDown should fail due to caught up majority without electable node. 
+ jsTestLog("Re-enabling writes to unelectable secondary: node #" + + replTest.getNodeId(secondaryC_unelectable) + ", " + secondaryC_unelectable); + enableReplicationToNode(secondaryC_unelectable); + + // Wait for this secondary to catch up by issuing a write that must be replicated to 3 nodes + assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 3}})); + + // Try to step down and fail + jsTestLog("Trying to step down primary with a caught up majority that " + + "doesn't contain an electable node."); + assertStepDownFailsWithExceededTimeLimit(primary); + + // Enable writes to Secondary A (electable). Await replication. + // (4 out of 5 nodes caught up, 1 electable) + // stepDown should succeed due to caught up majority containing an electable node. + jsTestLog("Re-enabling writes to electable secondary: node #" + + replTest.getNodeId(secondaryA_electable) + ", " + secondaryA_electable); + enableReplicationToNode(secondaryA_electable); + + // Wait for this secondary to catch up by issuing a write that must be replicated to 4 nodes + assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 4}})); + + // Try to step down. We expect success, so catch the exception thrown by 'replSetStepDown'. 
+ jsTestLog("Trying to step down primary with a caught up majority that " + + "does contain an electable node."); + + assertStepDownSucceeds(primary); + + // Make sure that original primary has transitioned to SECONDARY state + jsTestLog("Wait for PRIMARY " + primary.host + " to completely step down."); + replTest.waitForState(primary, ReplSetTest.State.SECONDARY); + + // Disable all fail points for clean shutdown + replTest.liveNodes.slaves.forEach(enableReplicationToNode); + replTest.stopSet(); + +}()); diff --git a/jstests/replsets/stepdown_wrt_electable.js b/jstests/replsets/stepdown_wrt_electable.js deleted file mode 100644 index c929f2a2c56..00000000000 --- a/jstests/replsets/stepdown_wrt_electable.js +++ /dev/null @@ -1,40 +0,0 @@ -// Test that replSetStepDown filters out non-electable nodes -var replTest = new ReplSetTest({name: 'testSet', nodes: 2}); -var nodes = replTest.startSet(); - -// setup config -var c = replTest.getReplSetConfig(); -c.members[1].priority = 0; // not electable -replTest.initiate(c); - -var master = replTest.getPrimary(); -var testDB = master.getDB('test'); -var firstPrimary = testDB.isMaster().primary; - -// do a write to allow stepping down of the primary; -// otherwise, the primary will refuse to step down -testDB.foo.insert({x: 1}); -replTest.awaitReplication(); - -// stepdown should fail since there is no-one to elect within 10 secs -testDB.adminCommand({replSetStepDown: 5}); -assert(master.getDB("a").isMaster().ismaster, "not master"); - -// step down the primary asyncronously so it doesn't kill this test -var wait = startParallelShell("db.adminCommand({replSetStepDown:1000, force:true})", master.port); -var exitCode = wait({checkExitSuccess: false}); -assert.neq(0, exitCode, "expected replSetStepDown to close the shell's connection"); - -// check that the old primary is no longer master -assert.soon(function() { - try { - var isMaster = master.getDB("a").isMaster(); - printjson(isMaster); - return !(isMaster.ismaster); - } 
catch (e) { - return false; - } -}, "they shouldn't be master, but are"); - -// stop -replTest.stopSet(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 2e722da577e..11b44773711 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -1750,7 +1750,8 @@ bool ReplicationCoordinatorImpl::_tryToStepDown(const Date_t waitUntil, } const bool forceNow = now >= waitUntil ? force : false; - if (!_topCoord->stepDown(stepDownUntil, forceNow, getMyLastAppliedOpTime())) { + if (!_topCoord->stepDown( + stepDownUntil, forceNow, getMyLastAppliedOpTime(), getLastCommittedOpTime())) { if (now >= waitUntil) { uasserted(ErrorCodes::ExceededTimeLimit, str::stream() << "No electable secondaries caught up as of " diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp index ac5d384539d..1c43172f83c 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp @@ -1419,7 +1419,6 @@ private: virtual void setUp() { ReplCoordTest::setUp(); init("mySet/test1:1234,test2:1234,test3:1234"); - assertStartSuccess(BSON("_id" << "mySet" << "version" @@ -1437,6 +1436,7 @@ private: } }; + TEST_F(ReplCoordTest, NodeReturnsBadValueWhenUpdateTermIsRunAgainstANonReplNode) { init(ReplSettings()); ASSERT_TRUE(ReplicationCoordinator::modeNone == getReplCoord()->getReplicationMode()); @@ -1608,44 +1608,175 @@ TEST_F(StepDownTest, ASSERT_TRUE(getReplCoord()->getMemberState().primary()); } -TEST_F(StepDownTest, - NodeTransitionsToSecondaryImmediatelyWhenStepDownIsRunAndAnUpToDateElectableNodeExists) { - OpTimeWithTermZero optime1(100, 1); +/* Step Down Test for a 5-node replica set */ +class StepDownTestFiveNode : public StepDownTest { +protected: + /* + * Simulate a round of heartbeat requests from the primary by manually setting + 
* the heartbeat response messages from each node. 'numNodesCaughtUp' will + * determine how many nodes return an optime that is up to date with the + * primary's optime. Sets electability of all caught up nodes to 'caughtUpAreElectable' + */ + void simulateHeartbeatResponses(OpTime optimePrimary, + OpTime optimeLagged, + int numNodesCaughtUp, + bool caughtUpAreElectable) { + int hbNum = 1; + while (getNet()->hasReadyRequests()) { + NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); + RemoteCommandRequest request = noi->getRequest(); + + // Only process heartbeat requests. + ASSERT_EQ(request.cmdObj.firstElement().fieldNameStringData().toString(), + "replSetHeartbeat"); + + ReplSetHeartbeatArgsV1 hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + + log() << request.target.toString() << " processing " << request.cmdObj; + + // Catch up 'numNodesCaughtUp' nodes out of 5. + OpTime optimeResponse = (hbNum <= numNodesCaughtUp) ? optimePrimary : optimeLagged; + bool isElectable = (hbNum <= numNodesCaughtUp) ? 
caughtUpAreElectable : true; + + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(hbArgs.getSetName()); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setConfigVersion(hbArgs.getConfigVersion()); + hbResp.setDurableOpTime(optimeResponse); + hbResp.setAppliedOpTime(optimeResponse); + hbResp.setElectable(isElectable); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj, false); + getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj())); + hbNum += 1; + } + } + +private: + virtual void setUp() { + ReplCoordTest::setUp(); + init("mySet/test1:1234,test2:1234,test3:1234,test4:1234,test5:1234"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234") + << BSON("_id" << 1 << "host" + << "test2:1234") + << BSON("_id" << 2 << "host" + << "test3:1234") + << BSON("_id" << 3 << "host" + << "test4:1234") + << BSON("_id" << 4 << "host" + << "test5:1234"))), + HostAndPort("test1", 1234)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + myRid = getReplCoord()->getMyRID(); + } +}; + +TEST_F( + StepDownTestFiveNode, + NodeReturnsExceededTimeLimitWhenStepDownIsRunAndCaughtUpMajorityExistsButWithoutElectableNode) { + OpTime optimeLagged(Timestamp(100, 1), 1); + OpTime optimePrimary(Timestamp(100, 2), 1); + // All nodes are caught up - getReplCoord()->setMyLastAppliedOpTime(optime1); - getReplCoord()->setMyLastDurableOpTime(optime1); - ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1)); - ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1)); + getReplCoord()->setMyLastAppliedOpTime(optimePrimary); + getReplCoord()->setMyLastDurableOpTime(optimePrimary); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, 
optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged)); simulateSuccessfulV1Election(); enterNetwork(); getNet()->runUntil(getNet()->now() + Seconds(2)); ASSERT(getNet()->hasReadyRequests()); - NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); - RemoteCommandRequest request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - ReplSetHeartbeatArgsV1 hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName(hbArgs.getSetName()); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setConfigVersion(hbArgs.getConfigVersion()); - hbResp.setDurableOpTime(optime1); - hbResp.setAppliedOpTime(optime1); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj, false); - getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj())); - } - while (getNet()->hasReadyRequests()) { - getNet()->blackHole(getNet()->getNextReadyRequest()); - } + + // Make sure a majority are caught up (i.e. 3 out of 5). 
We catch up two secondaries since + // the primary counts as one towards majority + int numNodesCaughtUp = 2; + simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, false); getNet()->runReadyNetworkOperations(); exitNetwork(); const auto txn = makeOperationContext(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + auto status = getReplCoord()->stepDown(txn.get(), false, Milliseconds(0), Milliseconds(1000)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); +} + +TEST_F(StepDownTestFiveNode, + NodeReturnsExceededTimeLimitWhenStepDownIsRunAndNoCaughtUpMajorityExists) { + OpTime optimeLagged(Timestamp(100, 1), 1); + OpTime optimePrimary(Timestamp(100, 2), 1); + + // All nodes are caught up + getReplCoord()->setMyLastAppliedOpTime(optimePrimary); + getReplCoord()->setMyLastDurableOpTime(optimePrimary); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged)); + + simulateSuccessfulV1Election(); + + enterNetwork(); + getNet()->runUntil(getNet()->now() + Seconds(2)); + ASSERT(getNet()->hasReadyRequests()); + + // Make sure less than a majority are caught up (i.e. 
2 out of 5) We catch up one secondary + // since the primary counts as one towards majority + int numNodesCaughtUp = 1; + simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, true); + getNet()->runReadyNetworkOperations(); + exitNetwork(); + + const auto txn = makeOperationContext(); + + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + auto status = getReplCoord()->stepDown(txn.get(), false, Milliseconds(0), Milliseconds(1000)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); +} + +TEST_F( + StepDownTestFiveNode, + NodeTransitionsToSecondaryImmediatelyWhenStepDownIsRunAndAnUpToDateMajorityWithElectableNodeExists) { + OpTime optimeLagged(Timestamp(100, 1), 1); + OpTime optimePrimary(Timestamp(100, 2), 1); + + // All nodes are caught up + getReplCoord()->setMyLastAppliedOpTime(optimePrimary); + getReplCoord()->setMyLastDurableOpTime(optimePrimary); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged)); + + simulateSuccessfulV1Election(); + + enterNetwork(); + getNet()->runUntil(getNet()->now() + Seconds(2)); + ASSERT(getNet()->hasReadyRequests()); + + // Make sure a majority are caught up (i.e. 3 out of 5). 
We catch up two secondaries since + // the primary counts as one towards majority + int numNodesCaughtUp = 2; + simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, true); + getNet()->runReadyNetworkOperations(); + exitNetwork(); + + const auto txn = makeOperationContext(); ASSERT_TRUE(getReplCoord()->getMemberState().primary()); ASSERT_OK(getReplCoord()->stepDown(txn.get(), false, Milliseconds(0), Milliseconds(1000))); @@ -1724,8 +1855,9 @@ TEST_F(StepDownTest, TEST_F(StepDownTest, NodeTransitionsToSecondaryWhenASecondaryCatchesUpAfterTheFirstRoundOfHeartbeats) { - OpTimeWithTermZero optime1(100, 1); - OpTimeWithTermZero optime2(100, 2); + OpTime optime1(Timestamp(100, 1), 1); + OpTime optime2(Timestamp(100, 2), 1); + // No secondary is caught up auto repl = getReplCoord(); repl->setMyLastAppliedOpTime(optime2); @@ -1735,6 +1867,8 @@ TEST_F(StepDownTest, simulateSuccessfulV1Election(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + // Step down where the secondary actually has to catch up before the stepDown can succeed. // On entering the network, _stepDownContinue should cancel the heartbeats scheduled for // T + 2 seconds and send out a new round of heartbeats immediately. 
@@ -1766,6 +1900,7 @@ TEST_F(StepDownTest, log() << "Blackholing network request " << noi->getRequest().cmdObj; getNet()->blackHole(noi); } + getNet()->runReadyNetworkOperations(); exitNetwork(); @@ -1775,8 +1910,9 @@ TEST_F(StepDownTest, TEST_F(StepDownTest, NodeTransitionsToSecondaryWhenASecondaryCatchesUpDuringStepDownsSecondaryCatchupPeriod) { - OpTimeWithTermZero optime1(100, 1); - OpTimeWithTermZero optime2(100, 2); + OpTime optime1(Timestamp(100, 1), 1); + OpTime optime2(Timestamp(100, 2), 1); + // No secondary is caught up auto repl = getReplCoord(); repl->setMyLastAppliedOpTime(optime2); diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index 32c98c2c3e1..3dbd739f9cd 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -394,12 +394,24 @@ public: /** * Tries to transition the coordinator from the leader role to the follower role. * - * Fails if "force" is not set and no follower is known to be up. It is illegal - * to call this method if the node is not leader. + * If force==true, step down this node and return true immediately. Else, a step down + * succeeds only if the following conditions are met: * - * Returns whether or not the step down succeeded. + * C1. A majority set of nodes, M, in the replica set have optimes greater than or + * equal to the last applied optime of the primary. + * + * C2. If C1 holds, then there must exist at least one electable secondary node in the + * majority set M. + * + * If C1 and C2 hold, a step down occurs and this method returns true. Else, the step down + * fails and this method returns false. + * + * NOTE: It is illegal to call this method if the node is not a primary. 
*/ - virtual bool stepDown(Date_t until, bool force, const OpTime& lastOpApplied) = 0; + virtual bool stepDown(Date_t until, + bool force, + const OpTime& lastOpApplied, + const OpTime& lastOpCommitted) = 0; /** * Sometimes a request to step down comes in (like via a heartbeat), but we don't have the diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 9a82f78b1d0..8062a122e5c 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -2272,24 +2272,38 @@ void TopologyCoordinatorImpl::processLoseElection() { } } -bool TopologyCoordinatorImpl::stepDown(Date_t until, bool force, const OpTime& lastOpApplied) { - bool canStepDown = force; - for (int i = 0; !canStepDown && i < _rsConfig.getNumMembers(); ++i) { - if (i == _selfIndex) { - continue; - } - UnelectableReasonMask reason = _getUnelectableReason(i, lastOpApplied); - if (!reason && _hbdata.at(i).getAppliedOpTime() >= lastOpApplied) { - canStepDown = true; - } +bool TopologyCoordinatorImpl::stepDown(Date_t until, + bool force, + const OpTime& lastOpApplied, + const OpTime& lastOpCommitted) { + + // force==true overrides all other checks. + if (force) { + _stepDownUntil = until; + _stepDownSelfAndReplaceWith(-1); + return true; } - if (!canStepDown) { + // Ensure a majority of caught up nodes. + if (lastOpCommitted < lastOpApplied) { return false; } - _stepDownUntil = until; - _stepDownSelfAndReplaceWith(-1); - return true; + + // Now make sure we also have at least one caught up node that is also electable. + for (int memberIndex = 0; memberIndex < _rsConfig.getNumMembers(); memberIndex++) { + // ignore your self + if (memberIndex == _selfIndex) { + continue; + } + UnelectableReasonMask reason = _getUnelectableReason(memberIndex, lastOpApplied); + if (!reason && _hbdata.at(memberIndex).getAppliedOpTime() >= lastOpApplied) { + // Found a caught up and electable node, succeed with step down. 
+ _stepDownUntil = until; + _stepDownSelfAndReplaceWith(-1); + return true; + } + } + return false; } void TopologyCoordinatorImpl::setFollowerMode(MemberState::MS newMode) { diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index 21b8d84ba1b..9d57ffe0812 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -217,7 +217,10 @@ public: virtual void processLoseElection(); virtual Status checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied) const; virtual void setMyHeartbeatMessage(const Date_t now, const std::string& message); - virtual bool stepDown(Date_t until, bool force, const OpTime& lastOpApplied); + virtual bool stepDown(Date_t until, + bool force, + const OpTime& lastOpApplied, + const OpTime& lastOpCommitted); virtual bool stepDownIfPending(); virtual Date_t getStepDownTime() const; virtual void prepareReplMetadata(rpc::ReplSetMetadata* metadata, |