summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilliam Schultz <william.schultz@mongodb.com>2016-11-01 14:47:07 -0400
committerJudah Schvimer <judah@mongodb.com>2016-12-28 14:33:03 -0500
commit4075543b3ef659c26798e291615d5271438b7fc2 (patch)
treee66c59d1f45089dbdad1bdd20677656092a0ba33
parent1c43b90c0483bc7574fecd0eaeee610480cc9217 (diff)
downloadmongo-4075543b3ef659c26798e291615d5271438b7fc2.tar.gz
SERVER-26747 replSetStepDown waits for caught up majority with electable secondary
(cherry picked from commit 627f25d2e64078a6de32116aa496ffc3c461ec67)
-rw-r--r--jstests/replsets/stepdown_needs_electable_secondary.js146
-rw-r--r--jstests/replsets/stepdown_wrt_electable.js40
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp3
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_test.cpp194
-rw-r--r--src/mongo/db/repl/topology_coordinator.h20
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.cpp41
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.h5
7 files changed, 358 insertions, 91 deletions
diff --git a/jstests/replsets/stepdown_needs_electable_secondary.js b/jstests/replsets/stepdown_needs_electable_secondary.js
new file mode 100644
index 00000000000..88490ee803f
--- /dev/null
+++ b/jstests/replsets/stepdown_needs_electable_secondary.js
@@ -0,0 +1,146 @@
+/**
+ * Test to ensure that replSetStepDown called on a primary will only succeed if a majority of nodes
+ * are caught up to it and that at least one node in this majority is electable. Tests this with a
+ * 5 node replica set.
+ *
+ * 1. Initiate a 5-node replica set
+ * 2. Disable replication to all secondaries
+ * 3. Execute some writes on primary
+ * 4. Try to step down primary and expect to fail
+ * 5. Enable replication to one unelectable secondary, secondary B
+ * 6. Await replication to secondary B by executing primary write with writeConcern:2
+ * 7. Try to step down primary and expect failure
+ * 8. Enable replication to a different unelectable secondary, secondary C
+ * 9. Await replication to secondary C by executing primary write with writeConcern:3
+ * 10. Try to step down primary and expect failure
+ * 11. Enable replication to an electable secondary, secondary A
+ * 12. Await replication to secondary A by executing primary write with writeConcern:4
+ * 13. Try to step down primary and expect success
+ * 14. Assert that original primary is now a secondary
+ *
+ */
+(function() {
+ 'use strict';
+ var name = 'stepdown_needs_electable_secondary';
+
+ var replTest = new ReplSetTest({name: name, nodes: 5});
+ var nodes = replTest.nodeList();
+
+ replTest.startSet();
+ replTest.initiate({
+ "_id": name,
+ "members": [
+ {"_id": 0, "host": nodes[0]},
+ {"_id": 1, "host": nodes[1]},
+ {"_id": 2, "host": nodes[2]},
+ {"_id": 3, "host": nodes[3], "priority": 0}, // unelectable
+ {"_id": 4, "host": nodes[4], "priority": 0} // unelectable
+ ]
+ });
+
+ /* Disable all incoming writes to a node (secondary) */
+ function disableReplicationToNode(node) {
+ assert.commandWorked(node.getDB('admin').runCommand(
+ {configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}),
+ 'Failed to enable rsSyncApplyStop failpoint.');
+ }
+
+ /* Re-enable all incoming writes to a node (secondary) */
+ function enableReplicationToNode(node) {
+ assert.commandWorked(
+ node.getDB('admin').runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}),
+ 'Failed to disable rsSyncApplyStop failpoint.');
+ }
+
+ function assertStepDownFailsWithExceededTimeLimit(node) {
+ assert.commandFailedWithCode(
+ node.getDB("admin").runCommand({replSetStepDown: 5, secondaryCatchUpPeriodSecs: 5}),
+ ErrorCodes.ExceededTimeLimit,
+ "step down did not fail with 'ExceededTimeLimit'");
+ }
+
+ function assertStepDownSucceeds(node) {
+ assert.throws(function() {
+ node.adminCommand({replSetStepDown: 60, secondaryCatchUpPeriodSecs: 60});
+ });
+ }
+
+ var primary = replTest.getPrimary();
+
+ jsTestLog("Blocking writes to all secondaries.");
+ replTest.liveNodes.slaves.forEach(disableReplicationToNode);
+
+ jsTestLog("Doing a write to primary.");
+ var testDB = replTest.getPrimary().getDB('testdb');
+ var coll = testDB.stepdown_needs_electable_secondary;
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 1}}));
+
+ // Try to step down with only the primary caught up (1 node out of 5).
+ // stepDown should fail.
+ jsTestLog("Trying to step down primary with only 1 node out of 5 caught up.");
+ assertStepDownFailsWithExceededTimeLimit(primary);
+
+ // Get the two unelectable secondaries
+ var secondaryB_unelectable = replTest.nodes[3];
+ var secondaryC_unelectable = replTest.nodes[4];
+
+ // Get an electable secondary
+ var secondaryA_electable = replTest.getSecondaries().find(function(s) {
+ var nodeId = replTest.getNodeId(s);
+ return (nodeId !== 3 && nodeId !== 4); // nodes 3 and 4 are set to be unelectable
+ });
+
+ // Enable writes to Secondary B (unelectable). Await replication.
+ // (2 out of 5 nodes caught up, 0 electable)
+ // stepDown should fail due to no caught up majority.
+ jsTestLog("Re-enabling writes to unelectable secondary: node #" +
+ replTest.getNodeId(secondaryB_unelectable) + ", " + secondaryB_unelectable);
+ enableReplicationToNode(secondaryB_unelectable);
+
+ // Wait for this secondary to catch up by issuing a write that must be replicated to 2 nodes
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 2}}));
+
+ // Try to step down and fail
+ jsTestLog("Trying to step down primary with only 2 nodes out of 5 caught up.");
+ assertStepDownFailsWithExceededTimeLimit(primary);
+
+ // Enable writes to Secondary C (unelectable). Await replication.
+ // (3 out of 5 nodes caught up, 0 electable)
+ // stepDown should fail due to caught up majority without electable node.
+ jsTestLog("Re-enabling writes to unelectable secondary: node #" +
+ replTest.getNodeId(secondaryC_unelectable) + ", " + secondaryC_unelectable);
+ enableReplicationToNode(secondaryC_unelectable);
+
+ // Wait for this secondary to catch up by issuing a write that must be replicated to 3 nodes
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 3}}));
+
+ // Try to step down and fail
+ jsTestLog("Trying to step down primary with a caught up majority that " +
+ "doesn't contain an electable node.");
+ assertStepDownFailsWithExceededTimeLimit(primary);
+
+ // Enable writes to Secondary A (electable). Await replication.
+ // (4 out of 5 nodes caught up, 1 electable)
+ // stepDown should succeed due to caught up majority containing an electable node.
+ jsTestLog("Re-enabling writes to electable secondary: node #" +
+ replTest.getNodeId(secondaryA_electable) + ", " + secondaryA_electable);
+ enableReplicationToNode(secondaryA_electable);
+
+ // Wait for this secondary to catch up by issuing a write that must be replicated to 4 nodes
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 4}}));
+
+ // Try to step down. We expect success, so catch the exception thrown by 'replSetStepDown'.
+ jsTestLog("Trying to step down primary with a caught up majority that " +
+ "does contain an electable node.");
+
+ assertStepDownSucceeds(primary);
+
+ // Make sure that original primary has transitioned to SECONDARY state
+ jsTestLog("Wait for PRIMARY " + primary.host + " to completely step down.");
+ replTest.waitForState(primary, ReplSetTest.State.SECONDARY);
+
+ // Disable all fail points for clean shutdown
+ replTest.liveNodes.slaves.forEach(enableReplicationToNode);
+ replTest.stopSet();
+
+}());
diff --git a/jstests/replsets/stepdown_wrt_electable.js b/jstests/replsets/stepdown_wrt_electable.js
deleted file mode 100644
index c929f2a2c56..00000000000
--- a/jstests/replsets/stepdown_wrt_electable.js
+++ /dev/null
@@ -1,40 +0,0 @@
-// Test that replSetStepDown filters out non-electable nodes
-var replTest = new ReplSetTest({name: 'testSet', nodes: 2});
-var nodes = replTest.startSet();
-
-// setup config
-var c = replTest.getReplSetConfig();
-c.members[1].priority = 0; // not electable
-replTest.initiate(c);
-
-var master = replTest.getPrimary();
-var testDB = master.getDB('test');
-var firstPrimary = testDB.isMaster().primary;
-
-// do a write to allow stepping down of the primary;
-// otherwise, the primary will refuse to step down
-testDB.foo.insert({x: 1});
-replTest.awaitReplication();
-
-// stepdown should fail since there is no-one to elect within 10 secs
-testDB.adminCommand({replSetStepDown: 5});
-assert(master.getDB("a").isMaster().ismaster, "not master");
-
-// step down the primary asyncronously so it doesn't kill this test
-var wait = startParallelShell("db.adminCommand({replSetStepDown:1000, force:true})", master.port);
-var exitCode = wait({checkExitSuccess: false});
-assert.neq(0, exitCode, "expected replSetStepDown to close the shell's connection");
-
-// check that the old primary is no longer master
-assert.soon(function() {
- try {
- var isMaster = master.getDB("a").isMaster();
- printjson(isMaster);
- return !(isMaster.ismaster);
- } catch (e) {
- return false;
- }
-}, "they shouldn't be master, but are");
-
-// stop
-replTest.stopSet();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 5dcf7b9b1c3..5439c40f46b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1638,7 +1638,8 @@ void ReplicationCoordinatorImpl::_stepDownContinue(
return;
}
bool forceNow = now >= waitUntil ? force : false;
- if (_topCoord->stepDown(stepDownUntil, forceNow, getMyLastAppliedOpTime())) {
+ if (_topCoord->stepDown(
+ stepDownUntil, forceNow, getMyLastAppliedOpTime(), getLastCommittedOpTime())) {
// Schedule work to (potentially) step back up once the stepdown period has ended.
_replExecutor.scheduleWorkAt(stepDownUntil,
stdx::bind(&ReplicationCoordinatorImpl::_handleTimePassing,
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index f730e4deccd..2c791030e76 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -1377,7 +1377,6 @@ private:
virtual void setUp() {
ReplCoordTest::setUp();
init("mySet/test1:1234,test2:1234,test3:1234");
-
assertStartSuccess(BSON("_id"
<< "mySet"
<< "version" << 1 << "members"
@@ -1393,6 +1392,7 @@ private:
}
};
+
TEST_F(ReplCoordTest, NodeReturnsBadValueWhenUpdateTermIsRunAgainstANonReplNode) {
init(ReplSettings());
ASSERT_TRUE(ReplicationCoordinator::modeNone == getReplCoord()->getReplicationMode());
@@ -1554,43 +1554,170 @@ TEST_F(StepDownTest,
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
}
-TEST_F(StepDownTest,
- NodeTransitionsToSecondaryImmediatelyWhenStepDownIsRunAndAnUpToDateElectableNodeExists) {
+/* Step Down Test for a 5-node replica set */
+class StepDownTestFiveNode : public StepDownTest {
+protected:
+ /*
+ * Simulate a round of heartbeat requests from the primary by manually setting
+ * the heartbeat response messages from each node. 'numNodesCaughtUp' will
+ * determine how many nodes return an optime that is up to date with the
+ * primary's optime. Sets electability of all caught up nodes to 'caughtUpAreElectable'
+ */
+ void simulateHeartbeatResponses(OpTime optimePrimary,
+ OpTime optimeLagged,
+ int numNodesCaughtUp,
+ bool caughtUpAreElectable) {
+ int hbNum = 1;
+ while (getNet()->hasReadyRequests()) {
+ NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
+ RemoteCommandRequest request = noi->getRequest();
+
+ // Only process heartbeat requests.
+ ASSERT_EQ(request.cmdObj.firstElement().fieldNameStringData().toString(),
+ "replSetHeartbeat");
+
+ ReplSetHeartbeatArgsV1 hbArgs;
+ ASSERT_OK(hbArgs.initialize(request.cmdObj));
+
+ log() << request.target.toString() << " processing " << request.cmdObj;
+
+ // Catch up 'numNodesCaughtUp' nodes out of 5.
+ OpTime optimeResponse = (hbNum <= numNodesCaughtUp) ? optimePrimary : optimeLagged;
+ bool isElectable = (hbNum <= numNodesCaughtUp) ? caughtUpAreElectable : true;
+
+ ReplSetHeartbeatResponse hbResp;
+ hbResp.setSetName(hbArgs.getSetName());
+ hbResp.setState(MemberState::RS_SECONDARY);
+ hbResp.setConfigVersion(hbArgs.getConfigVersion());
+ hbResp.setDurableOpTime(optimeResponse);
+ hbResp.setAppliedOpTime(optimeResponse);
+ hbResp.setElectable(isElectable);
+ BSONObjBuilder respObj;
+ respObj << "ok" << 1;
+ hbResp.addToBSON(&respObj, false);
+ getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj()));
+ hbNum += 1;
+ }
+ }
+
+private:
+ virtual void setUp() {
+ ReplCoordTest::setUp();
+ init("mySet/test1:1234,test2:1234,test3:1234,test4:1234,test5:1234");
+
+ assertStartSuccess(BSON("_id"
+ << "mySet"
+ << "version" << 1 << "members"
+ << BSON_ARRAY(BSON("_id" << 0 << "host"
+ << "test1:1234")
+ << BSON("_id" << 1 << "host"
+ << "test2:1234")
+ << BSON("_id" << 2 << "host"
+ << "test3:1234")
+ << BSON("_id" << 3 << "host"
+ << "test4:1234")
+ << BSON("_id" << 4 << "host"
+ << "test5:1234"))),
+ HostAndPort("test1", 1234));
+ ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+ myRid = getReplCoord()->getMyRID();
+ }
+};
+
+TEST_F(
+ StepDownTestFiveNode,
+ NodeReturnsExceededTimeLimitWhenStepDownIsRunAndCaughtUpMajorityExistsButWithoutElectableNode) {
OperationContextReplMock txn;
- OpTimeWithTermZero optime1(100, 1);
+ OpTime optimeLagged(Timestamp(100, 1), 1);
+ OpTime optimePrimary(Timestamp(100, 2), 1);
+
+ // Only the primary is caught up; all secondaries are lagged
- getReplCoord()->setMyLastAppliedOpTime(optime1);
- getReplCoord()->setMyLastDurableOpTime(optime1);
- ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1));
- ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1));
+ getReplCoord()->setMyLastAppliedOpTime(optimePrimary);
+ getReplCoord()->setMyLastDurableOpTime(optimePrimary);
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged));
simulateSuccessfulV1Election();
enterNetwork();
getNet()->runUntil(getNet()->now() + Seconds(2));
ASSERT(getNet()->hasReadyRequests());
- NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
- RemoteCommandRequest request = noi->getRequest();
- log() << request.target.toString() << " processing " << request.cmdObj;
- ReplSetHeartbeatArgsV1 hbArgs;
- if (hbArgs.initialize(request.cmdObj).isOK()) {
- ReplSetHeartbeatResponse hbResp;
- hbResp.setSetName(hbArgs.getSetName());
- hbResp.setState(MemberState::RS_SECONDARY);
- hbResp.setConfigVersion(hbArgs.getConfigVersion());
- hbResp.setDurableOpTime(optime1);
- hbResp.setAppliedOpTime(optime1);
- BSONObjBuilder respObj;
- respObj << "ok" << 1;
- hbResp.addToBSON(&respObj, false);
- getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj()));
- }
- while (getNet()->hasReadyRequests()) {
- getNet()->blackHole(getNet()->getNextReadyRequest());
- }
+
+ // Make sure a majority are caught up (i.e. 3 out of 5). We catch up two secondaries since
+ // the primary counts as one towards majority
+ int numNodesCaughtUp = 2;
+ simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, false);
+ getNet()->runReadyNetworkOperations();
+ exitNetwork();
+
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+ auto status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000));
+ ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status);
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+}
+
+TEST_F(StepDownTestFiveNode,
+ NodeReturnsExceededTimeLimitWhenStepDownIsRunAndNoCaughtUpMajorityExists) {
+ OperationContextReplMock txn;
+ OpTime optimeLagged(Timestamp(100, 1), 1);
+ OpTime optimePrimary(Timestamp(100, 2), 1);
+
+ // Only the primary is caught up; all secondaries are lagged
+ getReplCoord()->setMyLastAppliedOpTime(optimePrimary);
+ getReplCoord()->setMyLastDurableOpTime(optimePrimary);
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged));
+
+ simulateSuccessfulV1Election();
+
+ enterNetwork();
+ getNet()->runUntil(getNet()->now() + Seconds(2));
+ ASSERT(getNet()->hasReadyRequests());
+
+ // Make sure less than a majority are caught up (i.e. 2 out of 5). We catch up one secondary
+ // since the primary counts as one towards majority
+ int numNodesCaughtUp = 1;
+ simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, true);
getNet()->runReadyNetworkOperations();
exitNetwork();
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+ auto status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000));
+ ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status);
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+}
+
+TEST_F(
+ StepDownTestFiveNode,
+ NodeTransitionsToSecondaryImmediatelyWhenStepDownIsRunAndAnUpToDateMajorityWithElectableNodeExists) {
+ OperationContextReplMock txn;
+ OpTime optimeLagged(Timestamp(100, 1), 1);
+ OpTime optimePrimary(Timestamp(100, 2), 1);
+
+ // Only the primary is caught up; all secondaries are lagged
+ getReplCoord()->setMyLastAppliedOpTime(optimePrimary);
+ getReplCoord()->setMyLastDurableOpTime(optimePrimary);
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged));
+
+ simulateSuccessfulV1Election();
+
+ enterNetwork();
+ getNet()->runUntil(getNet()->now() + Seconds(2));
+ ASSERT(getNet()->hasReadyRequests());
+
+ // Make sure a majority are caught up (i.e. 3 out of 5). We catch up two secondaries since
+ // the primary counts as one towards majority
+ int numNodesCaughtUp = 2;
+ simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, true);
+ getNet()->runReadyNetworkOperations();
+ exitNetwork();
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
ASSERT_OK(getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000)));
@@ -1666,8 +1793,9 @@ TEST_F(StepDownTest,
TEST_F(StepDownTest,
NodeTransitionsToSecondaryWhenASecondaryCatchesUpAfterTheFirstRoundOfHeartbeats) {
OperationContextReplMock txn;
- OpTimeWithTermZero optime1(100, 1);
- OpTimeWithTermZero optime2(100, 2);
+ OpTime optime1(Timestamp(100, 1), 1);
+ OpTime optime2(Timestamp(100, 2), 1);
+
// No secondary is caught up
auto repl = getReplCoord();
repl->setMyLastAppliedOpTime(optime2);
@@ -1677,6 +1805,8 @@ TEST_F(StepDownTest,
simulateSuccessfulV1Election();
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+
// Step down where the secondary actually has to catch up before the stepDown can succeed.
// On entering the network, _stepDownContinue should cancel the heartbeats scheduled for
// T + 2 seconds and send out a new round of heartbeats immediately.
@@ -1713,6 +1843,7 @@ TEST_F(StepDownTest,
log() << "Blackholing network request " << noi->getRequest().cmdObj;
getNet()->blackHole(noi);
}
+
getNet()->runReadyNetworkOperations();
exitNetwork();
@@ -1724,8 +1855,9 @@ TEST_F(StepDownTest,
TEST_F(StepDownTest,
NodeTransitionsToSecondaryWhenASecondaryCatchesUpDuringStepDownsSecondaryCatchupPeriod) {
OperationContextReplMock txn;
- OpTimeWithTermZero optime1(100, 1);
- OpTimeWithTermZero optime2(100, 2);
+ OpTime optime1(Timestamp(100, 1), 1);
+ OpTime optime2(Timestamp(100, 2), 1);
+
// No secondary is caught up
auto repl = getReplCoord();
repl->setMyLastAppliedOpTime(optime2);
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index de01144f796..5e8cda75505 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -379,12 +379,24 @@ public:
/**
* Tries to transition the coordinator from the leader role to the follower role.
*
- * Fails if "force" is not set and no follower is known to be up. It is illegal
- * to call this method if the node is not leader.
+ * If force==true, step down this node and return true immediately. Else, a step down
+ * succeeds only if the following conditions are met:
*
- * Returns whether or not the step down succeeded.
+ * C1. A majority set of nodes, M, in the replica set have optimes greater than or
+ * equal to the last applied optime of the primary.
+ *
+ * C2. If C1 holds, then there must exist at least one electable secondary node in the
+ * majority set M.
+ *
+ * If C1 and C2 hold, a step down occurs and this method returns true. Else, the step down
+ * fails and this method returns false.
+ *
+ * NOTE: It is illegal to call this method if the node is not a primary.
*/
- virtual bool stepDown(Date_t until, bool force, const OpTime& lastOpApplied) = 0;
+ virtual bool stepDown(Date_t until,
+ bool force,
+ const OpTime& lastOpApplied,
+ const OpTime& lastOpCommitted) = 0;
/**
* Sometimes a request to step down comes in (like via a heartbeat), but we don't have the
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index e245ddb3276..afd720d7869 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -2228,24 +2228,37 @@ void TopologyCoordinatorImpl::processLoseElection() {
}
}
-bool TopologyCoordinatorImpl::stepDown(Date_t until, bool force, const OpTime& lastOpApplied) {
- bool canStepDown = force;
- for (int i = 0; !canStepDown && i < _rsConfig.getNumMembers(); ++i) {
- if (i == _selfIndex) {
- continue;
- }
- UnelectableReasonMask reason = _getUnelectableReason(i, lastOpApplied);
- if (!reason && _hbdata[i].getAppliedOpTime() >= lastOpApplied) {
- canStepDown = true;
- }
+bool TopologyCoordinatorImpl::stepDown(Date_t until,
+ bool force,
+ const OpTime& lastOpApplied,
+ const OpTime& lastOpCommitted) {
+ // force==true overrides all other checks.
+ if (force) {
+ _stepDownUntil = until;
+ _stepDownSelfAndReplaceWith(-1);
+ return true;
}
- if (!canStepDown) {
+ // Ensure a majority of caught up nodes.
+ if (lastOpCommitted < lastOpApplied) {
return false;
}
- _stepDownUntil = until;
- _stepDownSelfAndReplaceWith(-1);
- return true;
+
+ // Now make sure we also have at least one caught up node that is also electable.
+ for (int memberIndex = 0; memberIndex < _rsConfig.getNumMembers(); memberIndex++) {
+ // ignore ourself
+ if (memberIndex == _selfIndex) {
+ continue;
+ }
+ UnelectableReasonMask reason = _getUnelectableReason(memberIndex, lastOpApplied);
+ if (!reason && _hbdata.at(memberIndex).getAppliedOpTime() >= lastOpApplied) {
+ // Found a caught up and electable node, succeed with step down.
+ _stepDownUntil = until;
+ _stepDownSelfAndReplaceWith(-1);
+ return true;
+ }
+ }
+ return false;
}
void TopologyCoordinatorImpl::setFollowerMode(MemberState::MS newMode) {
diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h
index fb4b7786cb3..21b9799fca2 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.h
+++ b/src/mongo/db/repl/topology_coordinator_impl.h
@@ -219,7 +219,10 @@ public:
virtual void processLoseElection();
virtual Status checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied) const;
virtual void setMyHeartbeatMessage(const Date_t now, const std::string& message);
- virtual bool stepDown(Date_t until, bool force, const OpTime& lastOpApplied);
+ virtual bool stepDown(Date_t until,
+ bool force,
+ const OpTime& lastOpApplied,
+ const OpTime& lastOpCommitted);
virtual bool stepDownIfPending();
virtual Date_t getStepDownTime() const;
virtual void prepareReplResponseMetadata(rpc::ReplSetMetadata* metadata,