summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilliam Schultz <william.schultz@mongodb.com>2016-11-01 14:47:07 -0400
committerJudah Schvimer <judah@mongodb.com>2016-12-28 14:33:03 -0500
commit4075543b3ef659c26798e291615d5271438b7fc2 (patch)
treee66c59d1f45089dbdad1bdd20677656092a0ba33
parent1c43b90c0483bc7574fecd0eaeee610480cc9217 (diff)
downloadmongo-4075543b3ef659c26798e291615d5271438b7fc2.tar.gz
SERVER-26747 replSetStepDown waits for caught up majority with electable secondary
(cherry picked from commit 627f25d2e64078a6de32116aa496ffc3c461ec67)
-rw-r--r--jstests/replsets/stepdown_needs_electable_secondary.js146
-rw-r--r--jstests/replsets/stepdown_wrt_electable.js40
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp3
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_test.cpp194
-rw-r--r--src/mongo/db/repl/topology_coordinator.h20
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.cpp41
-rw-r--r--src/mongo/db/repl/topology_coordinator_impl.h5
7 files changed, 358 insertions, 91 deletions
diff --git a/jstests/replsets/stepdown_needs_electable_secondary.js b/jstests/replsets/stepdown_needs_electable_secondary.js
new file mode 100644
index 00000000000..88490ee803f
--- /dev/null
+++ b/jstests/replsets/stepdown_needs_electable_secondary.js
@@ -0,0 +1,146 @@
+/**
+ * Test to ensure that replSetStepDown called on a primary will only succeed if a majority of nodes
+ * are caught up to it and that at least one node in this majority is electable. Tests this with a
+ * 5 node replica set.
+ *
+ * 1. Initiate a 5-node replica set
+ * 2. Disable replication to all secondaries
+ * 3. Execute some writes on primary
+ * 4. Try to step down primary and expect to fail
+ * 5. Enable replication to one unelectable secondary, secondary B
+ * 6. Await replication to secondary B by executing primary write with writeConcern:2
+ * 7. Try to step down primary and expect failure
+ * 8. Enable replication to a different unelectable secondary, secondary C
+ * 9. Await replication to secondary C by executing primary write with writeConcern:3
+ * 10. Try to step down primary and expect failure
+ * 11. Enable replication to an electable secondary, secondary A
+ * 12. Await replication to secondary A by executing primary write with writeConcern:4
+ * 13. Try to step down primary and expect success
+ * 14. Assert that original primary is now a secondary
+ *
+ */
+(function() {
+ 'use strict';
+ var name = 'stepdown_needs_electable_secondary';
+
+ var replTest = new ReplSetTest({name: name, nodes: 5});
+ var nodes = replTest.nodeList();
+
+ replTest.startSet();
+ replTest.initiate({
+ "_id": name,
+ "members": [
+ {"_id": 0, "host": nodes[0]},
+ {"_id": 1, "host": nodes[1]},
+ {"_id": 2, "host": nodes[2]},
+ {"_id": 3, "host": nodes[3], "priority": 0}, // unelectable
+ {"_id": 4, "host": nodes[4], "priority": 0} // unelectable
+ ]
+ });
+
+ /* Disable all incoming writes to a node (secondary) */
+ function disableReplicationToNode(node) {
+ assert.commandWorked(node.getDB('admin').runCommand(
+ {configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}),
+ 'Failed to enable rsSyncApplyStop failpoint.');
+ }
+
+ /* Re-enable all incoming writes to a node (secondary) */
+ function enableReplicationToNode(node) {
+ assert.commandWorked(
+ node.getDB('admin').runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}),
+ 'Failed to disable rsSyncApplyStop failpoint.');
+ }
+
+ function assertStepDownFailsWithExceededTimeLimit(node) {
+ assert.commandFailedWithCode(
+ node.getDB("admin").runCommand({replSetStepDown: 5, secondaryCatchUpPeriodSecs: 5}),
+ ErrorCodes.ExceededTimeLimit,
+ "step down did not fail with 'ExceededTimeLimit'");
+ }
+
+ function assertStepDownSucceeds(node) {
+ assert.throws(function() {
+ node.adminCommand({replSetStepDown: 60, secondaryCatchUpPeriodSecs: 60});
+ });
+ }
+
+ var primary = replTest.getPrimary();
+
+ jsTestLog("Blocking writes to all secondaries.");
+ replTest.liveNodes.slaves.forEach(disableReplicationToNode);
+
+ jsTestLog("Doing a write to primary.");
+ var testDB = replTest.getPrimary().getDB('testdb');
+ var coll = testDB.stepdown_needs_electable_secondary;
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 1}}));
+
+ // Try to step down with only the primary caught up (1 node out of 5).
+ // stepDown should fail.
+ jsTestLog("Trying to step down primary with only 1 node out of 5 caught up.");
+ assertStepDownFailsWithExceededTimeLimit(primary);
+
+ // Get the two unelectable secondaries
+ var secondaryB_unelectable = replTest.nodes[3];
+ var secondaryC_unelectable = replTest.nodes[4];
+
+ // Get an electable secondary
+ var secondaryA_electable = replTest.getSecondaries().find(function(s) {
+ var nodeId = replTest.getNodeId(s);
+ return (nodeId !== 3 && nodeId !== 4); // nodes 3 and 4 are set to be unelectable
+ });
+
+ // Enable writes to Secondary B (unelectable). Await replication.
+ // (2 out of 5 nodes caught up, 0 electable)
+ // stepDown should fail due to no caught up majority.
+ jsTestLog("Re-enabling writes to unelectable secondary: node #" +
+ replTest.getNodeId(secondaryB_unelectable) + ", " + secondaryB_unelectable);
+ enableReplicationToNode(secondaryB_unelectable);
+
+ // Wait for this secondary to catch up by issuing a write that must be replicated to 2 nodes
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 2}}));
+
+ // Try to step down and fail
+ jsTestLog("Trying to step down primary with only 2 nodes out of 5 caught up.");
+ assertStepDownFailsWithExceededTimeLimit(primary);
+
+ // Enable writes to Secondary C (unelectable). Await replication.
+ // (3 out of 5 nodes caught up, 0 electable)
+ // stepDown should fail due to caught up majority without electable node.
+ jsTestLog("Re-enabling writes to unelectable secondary: node #" +
+ replTest.getNodeId(secondaryC_unelectable) + ", " + secondaryC_unelectable);
+ enableReplicationToNode(secondaryC_unelectable);
+
+ // Wait for this secondary to catch up by issuing a write that must be replicated to 3 nodes
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 3}}));
+
+ // Try to step down and fail
+ jsTestLog("Trying to step down primary with a caught up majority that " +
+ "doesn't contain an electable node.");
+ assertStepDownFailsWithExceededTimeLimit(primary);
+
+ // Enable writes to Secondary A (electable). Await replication.
+ // (4 out of 5 nodes caught up, 1 electable)
+ // stepDown should succeed due to caught up majority containing an electable node.
+ jsTestLog("Re-enabling writes to electable secondary: node #" +
+ replTest.getNodeId(secondaryA_electable) + ", " + secondaryA_electable);
+ enableReplicationToNode(secondaryA_electable);
+
+ // Wait for this secondary to catch up by issuing a write that must be replicated to 4 nodes
+ assert.writeOK(coll.insert({"dummy_key": "dummy_val"}, {writeConcern: {w: 4}}));
+
+ // Try to step down. We expect success, so catch the exception thrown by 'replSetStepDown'.
+ jsTestLog("Trying to step down primary with a caught up majority that " +
+ "does contain an electable node.");
+
+ assertStepDownSucceeds(primary);
+
+ // Make sure that original primary has transitioned to SECONDARY state
+ jsTestLog("Wait for PRIMARY " + primary.host + " to completely step down.");
+ replTest.waitForState(primary, ReplSetTest.State.SECONDARY);
+
+ // Disable all fail points for clean shutdown
+ replTest.liveNodes.slaves.forEach(enableReplicationToNode);
+ replTest.stopSet();
+
+}());
diff --git a/jstests/replsets/stepdown_wrt_electable.js b/jstests/replsets/stepdown_wrt_electable.js
deleted file mode 100644
index c929f2a2c56..00000000000
--- a/jstests/replsets/stepdown_wrt_electable.js
+++ /dev/null
@@ -1,40 +0,0 @@
-// Test that replSetStepDown filters out non-electable nodes
-var replTest = new ReplSetTest({name: 'testSet', nodes: 2});
-var nodes = replTest.startSet();
-
-// setup config
-var c = replTest.getReplSetConfig();
-c.members[1].priority = 0; // not electable
-replTest.initiate(c);
-
-var master = replTest.getPrimary();
-var testDB = master.getDB('test');
-var firstPrimary = testDB.isMaster().primary;
-
-// do a write to allow stepping down of the primary;
-// otherwise, the primary will refuse to step down
-testDB.foo.insert({x: 1});
-replTest.awaitReplication();
-
-// stepdown should fail since there is no-one to elect within 10 secs
-testDB.adminCommand({replSetStepDown: 5});
-assert(master.getDB("a").isMaster().ismaster, "not master");
-
-// step down the primary asyncronously so it doesn't kill this test
-var wait = startParallelShell("db.adminCommand({replSetStepDown:1000, force:true})", master.port);
-var exitCode = wait({checkExitSuccess: false});
-assert.neq(0, exitCode, "expected replSetStepDown to close the shell's connection");
-
-// check that the old primary is no longer master
-assert.soon(function() {
- try {
- var isMaster = master.getDB("a").isMaster();
- printjson(isMaster);
- return !(isMaster.ismaster);
- } catch (e) {
- return false;
- }
-}, "they shouldn't be master, but are");
-
-// stop
-replTest.stopSet();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 5dcf7b9b1c3..5439c40f46b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1638,7 +1638,8 @@ void ReplicationCoordinatorImpl::_stepDownContinue(
return;
}
bool forceNow = now >= waitUntil ? force : false;
- if (_topCoord->stepDown(stepDownUntil, forceNow, getMyLastAppliedOpTime())) {
+ if (_topCoord->stepDown(
+ stepDownUntil, forceNow, getMyLastAppliedOpTime(), getLastCommittedOpTime())) {
// Schedule work to (potentially) step back up once the stepdown period has ended.
_replExecutor.scheduleWorkAt(stepDownUntil,
stdx::bind(&ReplicationCoordinatorImpl::_handleTimePassing,
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index f730e4deccd..2c791030e76 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -1377,7 +1377,6 @@ private:
virtual void setUp() {
ReplCoordTest::setUp();
init("mySet/test1:1234,test2:1234,test3:1234");
-
assertStartSuccess(BSON("_id"
<< "mySet"
<< "version" << 1 << "members"
@@ -1393,6 +1392,7 @@ private:
}
};
+
TEST_F(ReplCoordTest, NodeReturnsBadValueWhenUpdateTermIsRunAgainstANonReplNode) {
init(ReplSettings());
ASSERT_TRUE(ReplicationCoordinator::modeNone == getReplCoord()->getReplicationMode());
@@ -1554,43 +1554,170 @@ TEST_F(StepDownTest,
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
}
-TEST_F(StepDownTest,
- NodeTransitionsToSecondaryImmediatelyWhenStepDownIsRunAndAnUpToDateElectableNodeExists) {
+/* Step Down Test for a 5-node replica set */
+class StepDownTestFiveNode : public StepDownTest {
+protected:
+ /*
+ * Simulate a round of heartbeat requests from the primary by manually setting
+ * the heartbeat response messages from each node. 'numNodesCaughtUp' will
+ * determine how many nodes return an optime that is up to date with the
+ * primary's optime. Sets electability of all caught up nodes to 'caughtUpAreElectable'
+ */
+ void simulateHeartbeatResponses(OpTime optimePrimary,
+ OpTime optimeLagged,
+ int numNodesCaughtUp,
+ bool caughtUpAreElectable) {
+ int hbNum = 1;
+ while (getNet()->hasReadyRequests()) {
+ NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
+ RemoteCommandRequest request = noi->getRequest();
+
+ // Only process heartbeat requests.
+ ASSERT_EQ(request.cmdObj.firstElement().fieldNameStringData().toString(),
+ "replSetHeartbeat");
+
+ ReplSetHeartbeatArgsV1 hbArgs;
+ ASSERT_OK(hbArgs.initialize(request.cmdObj));
+
+ log() << request.target.toString() << " processing " << request.cmdObj;
+
+ // Catch up 'numNodesCaughtUp' nodes out of 5.
+ OpTime optimeResponse = (hbNum <= numNodesCaughtUp) ? optimePrimary : optimeLagged;
+ bool isElectable = (hbNum <= numNodesCaughtUp) ? caughtUpAreElectable : true;
+
+ ReplSetHeartbeatResponse hbResp;
+ hbResp.setSetName(hbArgs.getSetName());
+ hbResp.setState(MemberState::RS_SECONDARY);
+ hbResp.setConfigVersion(hbArgs.getConfigVersion());
+ hbResp.setDurableOpTime(optimeResponse);
+ hbResp.setAppliedOpTime(optimeResponse);
+ hbResp.setElectable(isElectable);
+ BSONObjBuilder respObj;
+ respObj << "ok" << 1;
+ hbResp.addToBSON(&respObj, false);
+ getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj()));
+ hbNum += 1;
+ }
+ }
+
+private:
+ virtual void setUp() {
+ ReplCoordTest::setUp();
+ init("mySet/test1:1234,test2:1234,test3:1234,test4:1234,test5:1234");
+
+ assertStartSuccess(BSON("_id"
+ << "mySet"
+ << "version" << 1 << "members"
+ << BSON_ARRAY(BSON("_id" << 0 << "host"
+ << "test1:1234")
+ << BSON("_id" << 1 << "host"
+ << "test2:1234")
+ << BSON("_id" << 2 << "host"
+ << "test3:1234")
+ << BSON("_id" << 3 << "host"
+ << "test4:1234")
+ << BSON("_id" << 4 << "host"
+ << "test5:1234"))),
+ HostAndPort("test1", 1234));
+ ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+ myRid = getReplCoord()->getMyRID();
+ }
+};
+
+TEST_F(
+ StepDownTestFiveNode,
+ NodeReturnsExceededTimeLimitWhenStepDownIsRunAndCaughtUpMajorityExistsButWithoutElectableNode) {
OperationContextReplMock txn;
- OpTimeWithTermZero optime1(100, 1);
+ OpTime optimeLagged(Timestamp(100, 1), 1);
+ OpTime optimePrimary(Timestamp(100, 2), 1);
+
+ // Only the primary is caught up; all secondaries are lagged
- getReplCoord()->setMyLastAppliedOpTime(optime1);
- getReplCoord()->setMyLastDurableOpTime(optime1);
- ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1));
- ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1));
+ getReplCoord()->setMyLastAppliedOpTime(optimePrimary);
+ getReplCoord()->setMyLastDurableOpTime(optimePrimary);
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged));
simulateSuccessfulV1Election();
enterNetwork();
getNet()->runUntil(getNet()->now() + Seconds(2));
ASSERT(getNet()->hasReadyRequests());
- NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
- RemoteCommandRequest request = noi->getRequest();
- log() << request.target.toString() << " processing " << request.cmdObj;
- ReplSetHeartbeatArgsV1 hbArgs;
- if (hbArgs.initialize(request.cmdObj).isOK()) {
- ReplSetHeartbeatResponse hbResp;
- hbResp.setSetName(hbArgs.getSetName());
- hbResp.setState(MemberState::RS_SECONDARY);
- hbResp.setConfigVersion(hbArgs.getConfigVersion());
- hbResp.setDurableOpTime(optime1);
- hbResp.setAppliedOpTime(optime1);
- BSONObjBuilder respObj;
- respObj << "ok" << 1;
- hbResp.addToBSON(&respObj, false);
- getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj()));
- }
- while (getNet()->hasReadyRequests()) {
- getNet()->blackHole(getNet()->getNextReadyRequest());
- }
+
+ // Make sure a majority are caught up (i.e. 3 out of 5). We catch up two secondaries since
+ // the primary counts as one towards majority
+ int numNodesCaughtUp = 2;
+ simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, false);
+ getNet()->runReadyNetworkOperations();
+ exitNetwork();
+
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+ auto status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000));
+ ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status);
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+}
+
+TEST_F(StepDownTestFiveNode,
+ NodeReturnsExceededTimeLimitWhenStepDownIsRunAndNoCaughtUpMajorityExists) {
+ OperationContextReplMock txn;
+ OpTime optimeLagged(Timestamp(100, 1), 1);
+ OpTime optimePrimary(Timestamp(100, 2), 1);
+
+ // Only the primary is caught up; all secondaries are lagged
+ getReplCoord()->setMyLastAppliedOpTime(optimePrimary);
+ getReplCoord()->setMyLastDurableOpTime(optimePrimary);
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged));
+
+ simulateSuccessfulV1Election();
+
+ enterNetwork();
+ getNet()->runUntil(getNet()->now() + Seconds(2));
+ ASSERT(getNet()->hasReadyRequests());
+
+ // Make sure less than a majority are caught up (i.e. 2 out of 5). We catch up one secondary
+ // since the primary counts as one towards majority
+ int numNodesCaughtUp = 1;
+ simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, true);
getNet()->runReadyNetworkOperations();
exitNetwork();
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+ auto status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000));
+ ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status);
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+}
+
+TEST_F(
+ StepDownTestFiveNode,
+ NodeTransitionsToSecondaryImmediatelyWhenStepDownIsRunAndAnUpToDateMajorityWithElectableNodeExists) {
+ OperationContextReplMock txn;
+ OpTime optimeLagged(Timestamp(100, 1), 1);
+ OpTime optimePrimary(Timestamp(100, 2), 1);
+
+ // Only the primary is caught up; all secondaries are lagged
+ getReplCoord()->setMyLastAppliedOpTime(optimePrimary);
+ getReplCoord()->setMyLastDurableOpTime(optimePrimary);
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optimeLagged));
+ ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 4, optimeLagged));
+
+ simulateSuccessfulV1Election();
+
+ enterNetwork();
+ getNet()->runUntil(getNet()->now() + Seconds(2));
+ ASSERT(getNet()->hasReadyRequests());
+
+ // Make sure a majority are caught up (i.e. 3 out of 5). We catch up two secondaries since
+ // the primary counts as one towards majority
+ int numNodesCaughtUp = 2;
+ simulateHeartbeatResponses(optimePrimary, optimeLagged, numNodesCaughtUp, true);
+ getNet()->runReadyNetworkOperations();
+ exitNetwork();
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
ASSERT_OK(getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000)));
@@ -1666,8 +1793,9 @@ TEST_F(StepDownTest,
TEST_F(StepDownTest,
NodeTransitionsToSecondaryWhenASecondaryCatchesUpAfterTheFirstRoundOfHeartbeats) {
OperationContextReplMock txn;
- OpTimeWithTermZero optime1(100, 1);
- OpTimeWithTermZero optime2(100, 2);
+ OpTime optime1(Timestamp(100, 1), 1);
+ OpTime optime2(Timestamp(100, 2), 1);
+
// No secondary is caught up
auto repl = getReplCoord();
repl->setMyLastAppliedOpTime(optime2);
@@ -1677,6 +1805,8 @@ TEST_F(StepDownTest,
simulateSuccessfulV1Election();
+ ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+
// Step down where the secondary actually has to catch up before the stepDown can succeed.
// On entering the network, _stepDownContinue should cancel the heartbeats scheduled for
// T + 2 seconds and send out a new round of heartbeats immediately.
@@ -1713,6 +1843,7 @@ TEST_F(StepDownTest,
log() << "Blackholing network request " << noi->getRequest().cmdObj;
getNet()->blackHole(noi);
}
+
getNet()->runReadyNetworkOperations();
exitNetwork();
@@ -1724,8 +1855,9 @@ TEST_F(StepDownTest,
TEST_F(StepDownTest,
NodeTransitionsToSecondaryWhenASecondaryCatchesUpDuringStepDownsSecondaryCatchupPeriod) {
OperationContextReplMock txn;
- OpTimeWithTermZero optime1(100, 1);
- OpTimeWithTermZero optime2(100, 2);
+ OpTime optime1(Timestamp(100, 1), 1);
+ OpTime optime2(Timestamp(100, 2), 1);
+
// No secondary is caught up
auto repl = getReplCoord();
repl->setMyLastAppliedOpTime(optime2);
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index de01144f796..5e8cda75505 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -379,12 +379,24 @@ public:
/**
* Tries to transition the coordinator from the leader role to the follower role.
*
- * Fails if "force" is not set and no follower is known to be up. It is illegal
- * to call this method if the node is not leader.
+ * If force==true, step down this node and return true immediately. Else, a step down
+ * succeeds only if the following conditions are met:
*
- * Returns whether or not the step down succeeded.
+ * C1. A majority set of nodes, M, in the replica set have optimes greater than or
+ * equal to the last applied optime of the primary.
+ *
+ * C2. If C1 holds, then there must exist at least one electable secondary node in the
+ * majority set M.
+ *
+ * If C1 and C2 hold, a step down occurs and this method returns true. Else, the step down
+ * fails and this method returns false.
+ *
+ * NOTE: It is illegal to call this method if the node is not a primary.
*/
- virtual bool stepDown(Date_t until, bool force, const OpTime& lastOpApplied) = 0;
+ virtual bool stepDown(Date_t until,
+ bool force,
+ const OpTime& lastOpApplied,
+ const OpTime& lastOpCommitted) = 0;
/**
* Sometimes a request to step down comes in (like via a heartbeat), but we don't have the
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index e245ddb3276..afd720d7869 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -2228,24 +2228,37 @@ void TopologyCoordinatorImpl::processLoseElection() {
}
}
-bool TopologyCoordinatorImpl::stepDown(Date_t until, bool force, const OpTime& lastOpApplied) {
- bool canStepDown = force;
- for (int i = 0; !canStepDown && i < _rsConfig.getNumMembers(); ++i) {
- if (i == _selfIndex) {
- continue;
- }
- UnelectableReasonMask reason = _getUnelectableReason(i, lastOpApplied);
- if (!reason && _hbdata[i].getAppliedOpTime() >= lastOpApplied) {
- canStepDown = true;
- }
+bool TopologyCoordinatorImpl::stepDown(Date_t until,
+ bool force,
+ const OpTime& lastOpApplied,
+ const OpTime& lastOpCommitted) {
+ // force==true overrides all other checks.
+ if (force) {
+ _stepDownUntil = until;
+ _stepDownSelfAndReplaceWith(-1);
+ return true;
}
- if (!canStepDown) {
+ // Ensure a majority of caught up nodes.
+ if (lastOpCommitted < lastOpApplied) {
return false;
}
- _stepDownUntil = until;
- _stepDownSelfAndReplaceWith(-1);
- return true;
+
+ // Now make sure we also have at least one caught up node that is also electable.
+ for (int memberIndex = 0; memberIndex < _rsConfig.getNumMembers(); memberIndex++) {
+ // ignore ourself
+ if (memberIndex == _selfIndex) {
+ continue;
+ }
+ UnelectableReasonMask reason = _getUnelectableReason(memberIndex, lastOpApplied);
+ if (!reason && _hbdata.at(memberIndex).getAppliedOpTime() >= lastOpApplied) {
+ // Found a caught up and electable node, succeed with step down.
+ _stepDownUntil = until;
+ _stepDownSelfAndReplaceWith(-1);
+ return true;
+ }
+ }
+ return false;
}
void TopologyCoordinatorImpl::setFollowerMode(MemberState::MS newMode) {
diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h
index fb4b7786cb3..21b9799fca2 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.h
+++ b/src/mongo/db/repl/topology_coordinator_impl.h
@@ -219,7 +219,10 @@ public:
virtual void processLoseElection();
virtual Status checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied) const;
virtual void setMyHeartbeatMessage(const Date_t now, const std::string& message);
- virtual bool stepDown(Date_t until, bool force, const OpTime& lastOpApplied);
+ virtual bool stepDown(Date_t until,
+ bool force,
+ const OpTime& lastOpApplied,
+ const OpTime& lastOpCommitted);
virtual bool stepDownIfPending();
virtual Date_t getStepDownTime() const;
virtual void prepareReplResponseMetadata(rpc::ReplSetMetadata* metadata,