SERVER-29500 Call for catchup takeover election when catchup takeover timeout fires

author: Samy Lanka <samy.lanka@10gen.com> 2017-07-05 15:18:06 -0400
committer: Samy Lanka <samy.lanka@10gen.com> 2017-08-02 15:29:26 -0400
commit: 5f1ce8b6765a25d45ba5e35063db417b3069c8d6 (patch)
tree: de942dd067b10bb2ae70f3a5b0f5601cd7fa87ec
parent: 743119c879ab2e5d1d8ca05aadf6fe29b1526a94 (diff)
download: mongo-5f1ce8b6765a25d45ba5e35063db417b3069c8d6.tar.gz
14 files changed, 682 insertions, 58 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml b/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml
index a5e8a37c1cd..b78e6fbb13e 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml
@@ -4,7 +4,6 @@ selector:
   roots:
   - jstests/replsets/*.js
   exclude_files:
-  - jstests/replsets/catchup.js
   - jstests/replsets/config_server_checks.js
   - jstests/replsets/disallow_adding_initialized_node1.js
   - jstests/replsets/disallow_adding_initialized_node2.js
@@ -32,6 +31,10 @@ selector:
   - jstests/replsets/command_response_operation_time.js
   # Majority read concern not supported in PV0.
   - jstests/replsets/operation_time_read_and_write_concern.js
+  # Catchup not supported in PV0.
+  - jstests/replsets/catchup.js
+  - jstests/replsets/catchup_takeover_one_high_priority.js
+  - jstests/replsets/catchup_takeover_two_nodes_ahead.js
 
 executor:
   config:
diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js
index 8a846cd7f83..4bbf6ad4d29 100644
--- a/jstests/libs/write_concern_util.js
+++ b/jstests/libs/write_concern_util.js
@@ -35,8 +35,12 @@ function stopServerReplication(conn) {
     assert.commandWorked(
         conn.adminCommand({configureFailPoint: 'stopReplProducer', mode: 'alwaysOn'}), errMsg);
 
-    // Wait until the fail point is actually hit.
-    checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled');
+    // Wait until the fail point is actually hit. Don't wait if the node is the primary, because
+    // the fail point won't be hit until the node transitions from being the primary.
+    if (assert.commandWorked(conn.adminCommand('replSetGetStatus')).myState !=
+        ReplSetTest.State.PRIMARY) {
+        checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled');
+    }
 }
 
 // Stops replication at all replicaset secondaries.
diff --git a/jstests/replsets/catchup_takeover_one_high_priority.js b/jstests/replsets/catchup_takeover_one_high_priority.js
new file mode 100644
index 00000000000..ed6d7faee50
--- /dev/null
+++ b/jstests/replsets/catchup_takeover_one_high_priority.js
@@ -0,0 +1,84 @@
+// Test to ensure that a node can run a catchup takeover even if it isn't the
+// highest priority node, and that once the high priority node is caught up,
+// it becomes primary again.
+
+// 3-node replica set with one high priority node.
+// Start replica set and wait for the high priority node (node 2) to become
+// primary. Stop replication on node 2 and isolate it, then make node 0
+// primary. Have the primary write something so node 2 is more than 2 seconds
+// behind. Stop replication on node 1 and write again so node 1 also lags.
+// Reconnect the high priority node to the other nodes and make
+// the lagged node (node 1) the next primary.
+// Confirm that the most up-to-date node becomes primary.
+// Let the highest priority node catch up and then confirm
+// that it becomes primary.
+
+(function() {
+    'use strict';
+
+    load('jstests/replsets/rslib.js');
+
+    var name = 'catchup_takeover_one_high_priority';
+    var replSet =
+        new ReplSetTest({name: name, nodes: [{}, {}, {rsConfig: {priority: 2}}], useBridge: true});
+    var nodes = replSet.startSet();
+    replSet.initiate();
+
+    // Wait until node 2 becomes primary.
+    replSet.waitForState(2, ReplSetTest.State.PRIMARY, replSet.kDefaultTimeoutMS);
+    jsTestLog('node 2 is now primary');
+
+    replSet.awaitReplication();
+
+    // Stop replication and disconnect node 2 so that it cannot do a priority takeover.
+    stopServerReplication(nodes[2]);
+    nodes[2].disconnect(nodes[1]);
+    nodes[2].disconnect(nodes[0]);
+
+    // Ensure that node 0 becomes primary.
+    assert.commandWorked(nodes[0].adminCommand({replSetStepUp: 1}));
+    replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes.slice(0, 2));
+    assert.eq(ReplSetTest.State.PRIMARY,
+              assert.commandWorked(nodes[0].adminCommand('replSetGetStatus')).myState,
+              nodes[0].host + " was not primary after step-up");
+    jsTestLog('node 0 is now primary');
+
+    // Sleep for a few seconds to ensure that node 2's optime is more than 2 seconds behind.
+    // This will ensure it can't do a priority takeover until it catches up.
+    sleep(3000);
+
+    var primary = replSet.getPrimary();
+    var writeConcern = {writeConcern: {w: 2, wtimeout: replSet.kDefaultTimeoutMS}};
+    assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern));
+
+    // Write something so that node 0 is ahead of node 1.
+    stopServerReplication(nodes[1]);
+    writeConcern = {writeConcern: {w: 1, wtimeout: replSet.kDefaultTimeoutMS}};
+    assert.writeOK(primary.getDB(name).bar.insert({x: 100}, writeConcern));
+
+    nodes[2].reconnect(nodes[0]);
+    nodes[2].reconnect(nodes[1]);
+
+    // Step up a lagged node.
+    assert.commandWorked(nodes[1].adminCommand({replSetStepUp: 1}));
+    replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes);
+    assert.eq(ReplSetTest.State.PRIMARY,
+              assert.commandWorked(nodes[1].adminCommand('replSetGetStatus')).myState,
+              nodes[1].host + " was not primary after step-up");
+    jsTestLog('node 1 is now primary, but cannot accept writes');
+
+    // Confirm that the most up-to-date node becomes primary
+    // after the default catchup takeover delay.
+    replSet.waitForState(0, ReplSetTest.State.PRIMARY, 60 * 1000);
+    jsTestLog('node 0 performed catchup takeover and is now primary');
+
+    // Let the nodes catchup.
+    restartServerReplication(nodes[1]);
+    restartServerReplication(nodes[2]);
+
+    // Confirm that the highest priority node becomes primary
+    // after catching up.
+    replSet.waitForState(2, ReplSetTest.State.PRIMARY, 30 * 1000);
+    jsTestLog('node 2 performed priority takeover and is now primary');
+
+})();
+\ No newline at end of file
diff --git a/jstests/replsets/catchup_takeover_two_nodes_ahead.js b/jstests/replsets/catchup_takeover_two_nodes_ahead.js
new file mode 100644
index 00000000000..4be8fee9007
--- /dev/null
+++ b/jstests/replsets/catchup_takeover_two_nodes_ahead.js
@@ -0,0 +1,56 @@
+// Test to ensure that a catchup takeover happens when the primary is lagged.
+// Make sure that when two nodes are more caught up than the primary,
+// the most up-to-date node becomes the primary.
+
+// 5-node replica set
+// Start replica set. Ensure that node 0 becomes primary.
+// Stop the replication for some nodes and have the primary write something.
+// Stop replication for an up-to-date node and have the primary write something.
+// Now the primary is most-up-to-date and another node is more up-to-date than others.
+// Make a lagged node the next primary.
+// Confirm that the most up-to-date node becomes primary.
+
+(function() {
+    'use strict';
+
+    load('jstests/replsets/rslib.js');
+
+    var name = 'catchup_takeover_two_nodes_ahead';
+    var replSet = new ReplSetTest({name: name, nodes: 5});
+    var nodes = replSet.startSet();
+    replSet.initiate();
+
+    // Wait until all nodes get the "no-op" of "new primary" after initial sync.
+    waitUntilAllNodesCaughtUp(nodes);
+
+    // Write something so that nodes 0 and 1 are ahead.
+    stopServerReplication(nodes.slice(2, 5));
+    var primary = replSet.getPrimary();
+    var writeConcern = {writeConcern: {w: 2, wtimeout: replSet.kDefaultTimeoutMS}};
+    assert.writeOK(primary.getDB(name).bar.insert({x: 100}, writeConcern));
+
+    // Write something so that node 0 is ahead of node 1.
+    stopServerReplication(nodes[1]);
+    writeConcern = {writeConcern: {w: 1, wtimeout: replSet.kDefaultTimeoutMS}};
+    assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern));
+
+    // Step up one of the lagged nodes.
+    assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1}));
+    replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes);
+    assert.eq(ReplSetTest.State.PRIMARY,
+              assert.commandWorked(nodes[2].adminCommand('replSetGetStatus')).myState,
+              nodes[2].host + " was not primary after step-up");
+    jsTestLog('node 2 is now primary, but cannot accept writes');
+
+    // Make sure that node 2 cannot write anything. Because it is lagged and replication
+    // has been stopped, it is in PRIMARY state but should still reject writes.
+    assert.writeErrorWithCode(nodes[2].getDB(name).bar.insert({z: 100}, writeConcern),
+                              ErrorCodes.NotMaster);
+
+    // Confirm that the most up-to-date node becomes primary
+    // after the default catchup takeover delay.
+    replSet.waitForState(0, ReplSetTest.State.PRIMARY, 60 * 1000);
+
+    // Let the nodes catchup
+    restartServerReplication(nodes.slice(1, 5));
+})();
+\ No newline at end of file
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index e1f995f4d6f..84f5cb58031 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -3375,7 +3375,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible() {
                       "Step-up command is only supported by Protocol Version 1");
     }
 
-    _startElectSelfIfEligibleV1(StartElectionV1Reason::kStepUpRequest);
+    _startElectSelfIfEligibleV1(TopologyCoordinator::StartElectionReason::kStepUpRequest);
     EventHandle finishEvent;
     {
         stdx::lock_guard<stdx::mutex> lk(_mutex);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 1fbc2d059fc..27d698feb69 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1056,12 +1056,8 @@ private:
 
     /**
      * Callback which starts an election if this node is electable and using protocolVersion 1.
-     * "isPriorityTakeover" is used to determine if the caller was a priority takeover or not and
-     * log messages accordingly.
      */
-    enum StartElectionV1Reason { kElectionTimeout, kPriorityTakeover, kStepUpRequest };
-
-    void _startElectSelfIfEligibleV1(StartElectionV1Reason reason);
+    void _startElectSelfIfEligibleV1(TopologyCoordinator::StartElectionReason reason);
 
     /**
      * Resets the term of last vote to 0 to prevent any node from voting for term 0.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp
index 91b1c5dfc15..727d652630a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp
@@ -284,7 +284,9 @@ void ReplicationCoordinatorImpl::_recoverFromElectionTie(
     if (!status.isOK()) {
         LOG(2) << "ReplicationCoordinatorImpl::_recoverFromElectionTie -- " << status.reason();
     } else {
-        fassertStatusOK(28817, _topCoord->becomeCandidateIfElectable(now, false));
+        fassertStatusOK(28817,
+                        _topCoord->becomeCandidateIfElectable(
+                            now, TopologyCoordinator::StartElectionReason::kElectionTimeout));
         _startElectSelf_inlock();
     }
 }
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 1f75f7c9a75..ca7ef2f5d86 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -859,9 +859,11 @@ public:
         return net->now();
     }
 
-    void performSuccessfulPriorityTakeover(Date_t priorityTakeoverTime) {
+    void performSuccessfulTakeover(Date_t takeoverTime,
+                                   TopologyCoordinator::StartElectionReason reason,
+                                   const LastVote& lastVoteExpected) {
         startCapturingLogMessages();
-        simulateSuccessfulV1ElectionAt(priorityTakeoverTime);
+        simulateSuccessfulV1ElectionAt(takeoverTime);
         getReplCoord()->waitForElectionFinish_forTest();
         stopCapturingLogMessages();
 
@@ -870,10 +872,13 @@ public:
         // Check last vote
         auto lastVote = getExternalState()->loadLocalLastVoteDocument(nullptr);
         ASSERT(lastVote.isOK());
-        ASSERT_EQ(0, lastVote.getValue().getCandidateIndex());
-        ASSERT_EQ(1, lastVote.getValue().getTerm());
+        ASSERT_EQ(lastVoteExpected.getCandidateIndex(), lastVote.getValue().getCandidateIndex());
+        ASSERT_EQ(lastVoteExpected.getTerm(), lastVote.getValue().getTerm());
 
-        ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a priority takeover"));
+        if (reason == TopologyCoordinator::StartElectionReason::kPriorityTakeover) {
+            ASSERT_EQUALS(1,
+                          countLogLinesContaining("Starting an election for a priority takeover"));
+        }
         ASSERT_EQUALS(1, countLogLinesContaining("election succeeded"));
     }
 
@@ -894,11 +899,22 @@ private:
 
         // Process all requests queued at the present time.
         while (net->hasReadyRequests()) {
-            auto noi = net->getNextReadyRequest();
+
+            // If we see that the next request isn't for a heartbeat, exit the function.
+            // This allows us to mock heartbeat responses with whatever info we want
+            // right up until another event happens (like an election). This is
+            // particularly important for simulating a catchup takeover because
+            // we need to know specific info about the primary.
+            auto noi = net->getFrontOfUnscheduledQueue();
+            auto&& nextRequest = noi->getRequest();
+            if (nextRequest.cmdObj.firstElement().fieldNameStringData() != "replSetHeartbeat") {
+                return;
+            }
+
+            noi = net->getNextReadyRequest();
             auto&& request = noi->getRequest();
 
             log() << request.target << " processing " << request.cmdObj;
-            ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData());
 
             // Make sure the heartbeat request is valid.
             ReplSetHeartbeatArgsV1 hbArgs;
@@ -1290,7 +1306,7 @@ TEST_F(TakeoverTest, CatchupTakeoverCanceledIfTransitionToRollback) {
     ASSERT_EQUALS(0, countLogLinesContaining("Starting an election for a catchup takeover"));
 }
 
-TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) {
+TEST_F(TakeoverTest, SuccessfulCatchupTakeover) {
     BSONObj configObj = BSON("_id"
                              << "mySet"
                              << "version"
@@ -1318,6 +1334,11 @@ TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) {
     replCoord->setMyLastAppliedOpTime(currentOptime);
     replCoord->setMyLastDurableOpTime(currentOptime);
 
+    // Update the term so that the current term is ahead of the term of
+    // the last applied op time. This means that the primary is still in
+    // catchup mode since it hasn't written anything this term.
+    ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, replCoord->getTerm() + 1));
+
     // Make sure we're secondary and that no takeover has been scheduled.
     ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
     ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
@@ -1333,12 +1354,164 @@ TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) {
     ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
 
     startCapturingLogMessages();
-    now = respondToHeartbeatsUntil(config, catchupTakeoverTime, primaryHostAndPort, behindOptime);
+
+    // The catchup takeover will be scheduled at a time later than one election
+    // timeout after our initial heartbeat responses, so mock a few rounds of
+    // heartbeat responses to prevent a normal election timeout.
+    now = respondToHeartbeatsUntil(
+        config, catchupTakeoverTime, HostAndPort("node2", 12345), behindOptime);
+    stopCapturingLogMessages();
+
+    // Since the heartbeats go through the catchupTakeoverTimeout, this log
+    // message happens already (otherwise it would happen in performSuccessfulTakeover).
+    ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a catchup takeover"));
+
+    LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0);
+    performSuccessfulTakeover(catchupTakeoverTime,
+                              TopologyCoordinator::StartElectionReason::kCatchupTakeover,
+                              lastVoteExpected);
+}
+
+TEST_F(TakeoverTest, PrimaryCatchesUpBeforeCatchupTakeover) {
+    BSONObj configObj = BSON("_id"
+                             << "mySet"
+                             << "version"
+                             << 1
+                             << "members"
+                             << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                      << "node1:12345")
+                                           << BSON("_id" << 2 << "host"
+                                                         << "node2:12345")
+                                           << BSON("_id" << 3 << "host"
+                                                         << "node3:12345"))
+                             << "protocolVersion"
+                             << 1);
+    assertStartSuccess(configObj, HostAndPort("node1", 12345));
+    ReplSetConfig config = assertMakeRSConfig(configObj);
+
+    auto replCoord = getReplCoord();
+    auto now = getNet()->now();
+
+    OperationContextNoop opCtx;
+    OpTime currentOptime(Timestamp(200, 1), 0);
+    replCoord->setMyLastAppliedOpTime(currentOptime);
+    replCoord->setMyLastDurableOpTime(currentOptime);
+    OpTime behindOptime(Timestamp(100, 1), 0);
+
+    // Update the term so that the current term is ahead of the term of
+    // the last applied op time.
+    ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, replCoord->getTerm() + 1));
+
+    // Make sure we're secondary and that no catchup takeover has been scheduled.
+    ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+    ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
+
+    startCapturingLogMessages();
+
+    // Mock a first round of heartbeat responses, which should give us enough information to know
+    // that we are fresher than the current primary, prompting the scheduling of a catchup
+    // takeover.
+    now = respondToHeartbeatsUntil(config, now, HostAndPort("node2", 12345), behindOptime);
+
+    // Make sure that the catchup takeover has actually been scheduled and at the
+    // correct time.
+    ASSERT(replCoord->getCatchupTakeover_forTest());
+    auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
+    Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
+    ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
+
+    // Mock another heartbeat where the primary is now up to date
+    // and run time through when catchup takeover was supposed to happen.
+    now = respondToHeartbeatsUntil(
+        config, now + catchupTakeoverDelay, HostAndPort("node2", 12345), currentOptime);
+
+    stopCapturingLogMessages();
+
+    // Make sure we're secondary and that no catchup takeover election happened.
+    ASSERT(replCoord->getMemberState().secondary());
+    ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
+    ASSERT_EQUALS(1, countLogLinesContaining("Not starting an election for a catchup takeover"));
+}
+
+TEST_F(TakeoverTest, PrimaryCatchesUpBeforeHighPriorityNodeCatchupTakeover) {
+    BSONObj configObj = BSON("_id"
+                             << "mySet"
+                             << "version"
+                             << 1
+                             << "members"
+                             << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                      << "node1:12345"
+                                                      << "priority"
+                                                      << 2)
+                                           << BSON("_id" << 2 << "host"
+                                                         << "node2:12345")
+                                           << BSON("_id" << 3 << "host"
+                                                         << "node3:12345"))
+                             << "protocolVersion"
+                             << 1);
+    assertStartSuccess(configObj, HostAndPort("node1", 12345));
+    ReplSetConfig config = assertMakeRSConfig(configObj);
+
+    auto replCoord = getReplCoord();
+    auto now = getNet()->now();
+
+    OperationContextNoop opCtx;
+    OpTime currentOptime(Timestamp(200, 1), 0);
+    replCoord->setMyLastAppliedOpTime(currentOptime);
+    replCoord->setMyLastDurableOpTime(currentOptime);
+    OpTime behindOptime(Timestamp(100, 1), 0);
+
+    // Update the term so that the current term is ahead of the term of
+    // the last applied op time.
+    ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, replCoord->getTerm() + 1));
+
+    // Make sure we're secondary and that no catchup takeover has been scheduled.
+    ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+    ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
+
+    startCapturingLogMessages();
+
+    // Mock a first round of heartbeat responses, which should give us enough information to know
+    // that we are fresher than the current primary, prompting the scheduling of a catchup
+    // takeover.
+    now = respondToHeartbeatsUntil(config, now, HostAndPort("node2", 12345), behindOptime);
+
+    // Make sure that the catchup takeover has actually been scheduled and at the
+    // correct time.
+    ASSERT(replCoord->getCatchupTakeover_forTest());
+    auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get();
+    Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now;
+    ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay);
+
+    // Mock another heartbeat where the primary is now up to date
+    // and run time through when catchup takeover was supposed to happen.
+    now = respondToHeartbeatsUntil(
+        config, now + catchupTakeoverDelay, HostAndPort("node2", 12345), currentOptime);
+
     stopCapturingLogMessages();
 
-    // Make sure that the catchup takeover fired as a NOOP.
+    // Make sure we're secondary and that no catchup takeover election happens.
     ASSERT(replCoord->getMemberState().secondary());
-    ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a catchup takeover [NOOP]"));
+    ASSERT_FALSE(replCoord->getCatchupTakeover_forTest());
+    ASSERT_EQUALS(1, countLogLinesContaining("Not starting an election for a catchup takeover"));
+
+    // Make sure that the priority takeover has now been scheduled and at the
+    // correct time.
+    ASSERT(replCoord->getPriorityTakeover_forTest());
+    auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest().get();
+    assertValidPriorityTakeoverDelay(config, now, priorityTakeoverTime, 0);
+
+    // The priority takeover might be scheduled at a time later than one election
+    // timeout after our initial heartbeat responses, so mock another round of
+    // heartbeat responses to prevent a normal election timeout.
+    Milliseconds halfElectionTimeout = config.getElectionTimeoutPeriod() / 2;
+    now = respondToHeartbeatsUntil(
+        config, now + halfElectionTimeout, HostAndPort("node2", 12345), currentOptime);
+
+    LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0);
+    performSuccessfulTakeover(priorityTakeoverTime,
+                              TopologyCoordinator::StartElectionReason::kPriorityTakeover,
+                              lastVoteExpected);
 }
 
 TEST_F(TakeoverTest, SchedulesPriorityTakeoverIfNodeHasHigherPriorityThanCurrentPrimary) {
@@ -1437,7 +1610,10 @@ TEST_F(TakeoverTest, SuccessfulPriorityTakeover) {
     now = respondToHeartbeatsUntil(
         config, now + halfElectionTimeout, HostAndPort("node2", 12345), myOptime);
 
-    performSuccessfulPriorityTakeover(priorityTakeoverTime);
+    LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0);
+    performSuccessfulTakeover(priorityTakeoverTime,
+                              TopologyCoordinator::StartElectionReason::kPriorityTakeover,
+                              lastVoteExpected);
 }
 
 TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) {
@@ -1513,7 +1689,10 @@ TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) {
     replCoord->setMyLastAppliedOpTime(closeEnoughOpTime);
     replCoord->setMyLastDurableOpTime(closeEnoughOpTime);
 
-    performSuccessfulPriorityTakeover(priorityTakeoverTime);
+    LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0);
+    performSuccessfulTakeover(priorityTakeoverTime,
+                              TopologyCoordinator::StartElectionReason::kPriorityTakeover,
+                              lastVoteExpected);
 }
 
 TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedDifferentSecond) {
@@ -1588,7 +1767,10 @@ TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedDifferentSecond) {
     replCoord->setMyLastAppliedOpTime(closeEnoughOpTime);
     replCoord->setMyLastDurableOpTime(closeEnoughOpTime);
 
-    performSuccessfulPriorityTakeover(priorityTakeoverTime);
+    LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0);
+    performSuccessfulTakeover(priorityTakeoverTime,
+                              TopologyCoordinator::StartElectionReason::kPriorityTakeover,
+                              lastVoteExpected);
 }
 
 TEST_F(ReplCoordTest, NodeCancelsElectionUponReceivingANewConfigDuringDryRun) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index fbaeb9dca44..35199798605 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -286,7 +286,7 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo
                     _priorityTakeoverWhen,
                     stdx::bind(&ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1,
                                this,
-                               StartElectionV1Reason::kPriorityTakeover));
+                               TopologyCoordinator::StartElectionReason::kPriorityTakeover));
             }
             break;
         }
@@ -297,11 +297,10 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo
                 _catchupTakeoverWhen = _replExecutor->now() + catchupTakeoverDelay;
                 log() << "Scheduling catchup takeover at " << _catchupTakeoverWhen;
                 _catchupTakeoverCbh = _scheduleWorkAt(
-                    _catchupTakeoverWhen, [this](const executor::TaskExecutor::CallbackArgs& args) {
-                        stdx::lock_guard<stdx::mutex> lock(_mutex);
-                        _cancelCatchupTakeover_inlock();
-                        log() << "Starting an election for a catchup takeover [NOOP]";
-                    });
+                    _catchupTakeoverWhen,
+                    stdx::bind(&ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1,
+                               this,
+                               TopologyCoordinator::StartElectionReason::kCatchupTakeover));
             }
             break;
         }
@@ -767,10 +766,11 @@ void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() {
         _scheduleWorkAt(when,
                         stdx::bind(&ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1,
                                    this,
-                                   StartElectionV1Reason::kElectionTimeout));
+                                   TopologyCoordinator::StartElectionReason::kElectionTimeout));
 }
 
-void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(StartElectionV1Reason reason) {
+void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(
+    TopologyCoordinator::StartElectionReason reason) {
     if (!isV1ElectionProtocol()) {
         return;
     }
@@ -788,37 +788,43 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(StartElectionV1Reas
         }
     }
 
-    const auto status = _topCoord->becomeCandidateIfElectable(
-        _replExecutor->now(), reason == StartElectionV1Reason::kPriorityTakeover);
+    const auto status = _topCoord->becomeCandidateIfElectable(_replExecutor->now(), reason);
     if (!status.isOK()) {
         switch (reason) {
-            case StartElectionV1Reason::kElectionTimeout:
+            case TopologyCoordinator::TopologyCoordinator::StartElectionReason::kElectionTimeout:
                 log() << "Not starting an election, since we are not electable due to: "
                       << status.reason();
                 break;
-            case StartElectionV1Reason::kPriorityTakeover:
+            case TopologyCoordinator::StartElectionReason::kPriorityTakeover:
                 log() << "Not starting an election for a priority takeover, "
                       << "since we are not electable due to: " << status.reason();
                 break;
-            case StartElectionV1Reason::kStepUpRequest:
+            case TopologyCoordinator::StartElectionReason::kStepUpRequest:
                 log() << "Not starting an election for a replSetStepUp request, "
                       << "since we are not electable due to: " << status.reason();
                 break;
+            case TopologyCoordinator::StartElectionReason::kCatchupTakeover:
+                log() << "Not starting an election for a catchup takeover, "
+                      << "since we are not electable due to: " << status.reason();
+                break;
         }
         return;
     }
 
     switch (reason) {
-        case StartElectionV1Reason::kElectionTimeout:
+        case TopologyCoordinator::StartElectionReason::kElectionTimeout:
             log() << "Starting an election, since we've seen no PRIMARY in the past "
                   << _rsConfig.getElectionTimeoutPeriod();
             break;
-        case StartElectionV1Reason::kPriorityTakeover:
+        case TopologyCoordinator::StartElectionReason::kPriorityTakeover:
             log() << "Starting an election for a priority takeover";
             break;
-        case StartElectionV1Reason::kStepUpRequest:
+        case TopologyCoordinator::StartElectionReason::kStepUpRequest:
             log() << "Starting an election due to step up request";
             break;
+        case TopologyCoordinator::StartElectionReason::kCatchupTakeover:
+            log() << "Starting an election for a catchup takeover";
+            break;
     }
 
     _startElectSelfV1_inlock();
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index 377959fc47e..3c2df705d93 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -587,10 +587,17 @@ public:
      */
     virtual void setPrimaryIndex(long long primaryIndex) = 0;
 
+    enum StartElectionReason {
+        kElectionTimeout,
+        kPriorityTakeover,
+        kStepUpRequest,
+        kCatchupTakeover
+    };
+
     /**
      * Transitions to the candidate role if the node is electable.
      */
-    virtual Status becomeCandidateIfElectable(const Date_t now, bool isPriorityTakeover) = 0;
+    virtual Status becomeCandidateIfElectable(const Date_t now, StartElectionReason reason) = 0;
 
     /**
      * Updates the storage engine read committed support in the TopologyCoordinator options after
diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp
index 8d707362fda..2d17af00d7a 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl.cpp
@@ -740,7 +740,7 @@ Status TopologyCoordinatorImpl::prepareHeartbeatResponse(Date_t now,
     const OpTime lastOpDurable = getMyLastDurableOpTime();
 
     // Are we electable
-    response->setElectable(!_getMyUnelectableReason(now, false));
+    response->setElectable(!_getMyUnelectableReason(now, StartElectionReason::kElectionTimeout));
 
     // Heartbeat status message
     response->setHbMsg(_getHbmsg(now));
@@ -1182,7 +1182,7 @@ HeartbeatResponseAction TopologyCoordinatorImpl::setMemberAsDown(Date_t now,
     MemberData& hbData = _memberData.at(memberIndex);
     hbData.setDownValues(now, "no response within election timeout period");
 
-    if (CannotSeeMajority & _getMyUnelectableReason(now, false)) {
+    if (CannotSeeMajority & _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout)) {
         if (_stepDownPending) {
             return HeartbeatResponseAction::makeNoAction();
         }
@@ -1487,7 +1487,8 @@ HeartbeatResponseAction TopologyCoordinatorImpl::_updatePrimaryFromHBData(
     // If we are primary, check if we can still see majority of the set;
     // stepdown if we can't.
     if (_iAmPrimary()) {
-        if (CannotSeeMajority & _getMyUnelectableReason(now, false)) {
+        if (CannotSeeMajority &
+            _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout)) {
             if (_stepDownPending) {
                 return HeartbeatResponseAction::makeNoAction();
             }
@@ -1522,7 +1523,7 @@ HeartbeatResponseAction TopologyCoordinatorImpl::_updatePrimaryFromHBData(
         LOG(2) << "TopologyCoordinatorImpl::_updatePrimaryFromHBData - " << status.reason();
         return HeartbeatResponseAction::makeNoAction();
     }
-    fassertStatusOK(28816, becomeCandidateIfElectable(now, false));
+    fassertStatusOK(28816, becomeCandidateIfElectable(now, StartElectionReason::kElectionTimeout));
     return HeartbeatResponseAction::makeElectAction();
 }
 
@@ -1536,7 +1537,8 @@ Status TopologyCoordinatorImpl::checkShouldStandForElection(Date_t now) const {
         return {ErrorCodes::NodeNotElectable, "Not standing for election again; already candidate"};
     }
 
-    const UnelectableReasonMask unelectableReason = _getMyUnelectableReason(now, false);
+    const UnelectableReasonMask unelectableReason =
+        _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout);
     if (NotCloseEnoughToLatestOptime & unelectableReason) {
         return {ErrorCodes::NodeNotElectable,
                 str::stream() << "Not standing for election because "
@@ -1633,6 +1635,40 @@ bool TopologyCoordinatorImpl::_amIFreshEnoughForPriorityTakeover() const {
     }
 }
 
+bool TopologyCoordinatorImpl::_amIFreshEnoughForCatchupTakeover() const {
+    // Returns true only when every catchup takeover rule listed below holds for this node.
+    const OpTime latestKnownOpTime = _latestKnownOpTime();
+
+    // Rules are:
+    // - We must have the freshest optime of all the up nodes.
+    // - There must be a current primary, and our optime must be strictly fresher than its optime.
+    // - The term of our last applied op must be less than the current term. This ensures that no
+    //   writes have happened since the most recent election and that the primary is still in
+    //   catchup mode.
+
+    // There is no point to a catchup takeover if we aren't the freshest node because
+    // another node would immediately perform another catchup takeover when we become primary.
+    const OpTime ourLastOpApplied = getMyLastAppliedOpTime();
+    if (ourLastOpApplied < latestKnownOpTime) {
+        return false;
+    }
+
+    if (_currentPrimaryIndex == -1) {
+        return false;
+    }
+
+    // If we aren't ahead of the primary, there is no point to having a catchup takeover.
+    const OpTime primaryLastOpApplied = _memberData[_currentPrimaryIndex].getLastAppliedOpTime();
+
+    if (ourLastOpApplied <= primaryLastOpApplied) {
+        return false;
+    }
+
+    // If the term of our last applied op is less than the current term, the primary didn't write
+    // anything and it is still in catchup mode.
+    return ourLastOpApplied.getTerm() < _term;
+}
+
 bool TopologyCoordinatorImpl::_iAmPrimary() const {
     if (_role == Role::leader) {
         invariant(_currentPrimaryIndex == _selfIndex);
@@ -1685,7 +1721,7 @@ int TopologyCoordinatorImpl::_getHighestPriorityElectableIndex(Date_t now) const
     int maxIndex = -1;
     for (int currentIndex = 0; currentIndex < _rsConfig.getNumMembers(); currentIndex++) {
         UnelectableReasonMask reason = currentIndex == _selfIndex
-            ? _getMyUnelectableReason(now, false)
+            ? _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout)
             : _getUnelectableReason(currentIndex);
         if (None == reason && _isMemberHigherPriority(currentIndex, maxIndex)) {
             maxIndex = currentIndex;
@@ -2310,7 +2346,7 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getUnel
 }
 
 TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUnelectableReason(
-    const Date_t now, bool isPriorityTakeover) const {
+    const Date_t now, StartElectionReason reason) const {
     UnelectableReasonMask result = None;
     const OpTime lastApplied = getMyLastAppliedOpTime();
     if (lastApplied.isNull()) {
@@ -2351,9 +2387,15 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUn
     } else {
         // Election rules only for protocol version 1.
         invariant(_rsConfig.getProtocolVersion() == 1);
-        if (isPriorityTakeover && !_amIFreshEnoughForPriorityTakeover()) {
+        if (reason == StartElectionReason::kPriorityTakeover &&
+            !_amIFreshEnoughForPriorityTakeover()) {
             result |= NotCloseEnoughToLatestForPriorityTakeover;
         }
+
+        if (reason == StartElectionReason::kCatchupTakeover &&
+            !_amIFreshEnoughForCatchupTakeover()) {
+            result |= NotFreshEnoughForCatchupTakeover;
+        }
     }
     return result;
 }
@@ -2426,6 +2468,14 @@ std::string TopologyCoordinatorImpl::_getUnelectableReasonString(
               "takeover - must be within "
            << priorityTakeoverFreshnessWindowSeconds << " seconds";
     }
+    if (ur & NotFreshEnoughForCatchupTakeover) {
+        if (hasWrittenToStream) {
+            ss << "; ";
+        }
+        hasWrittenToStream = true;
+        ss << "member is either not the most up-to-date member or not ahead of the primary, and "
+              "therefore cannot call for catchup takeover";
+    }
     if (ur & NotInitialized) {
         if (hasWrittenToStream) {
             ss << "; ";
@@ -2986,7 +3036,7 @@ void TopologyCoordinatorImpl::setPrimaryIndex(long long primaryIndex) {
 }
 
 Status TopologyCoordinatorImpl::becomeCandidateIfElectable(const Date_t now,
-                                                           bool isPriorityTakeover) {
+                                                           StartElectionReason reason) {
     if (_role == Role::leader) {
         return {ErrorCodes::NodeNotElectable, "Not standing for election again; already primary"};
     }
@@ -2995,8 +3045,7 @@ Status TopologyCoordinatorImpl::becomeCandidateIfElectable(const Date_t now,
         return {ErrorCodes::NodeNotElectable, "Not standing for election again; already candidate"};
     }
 
-    const UnelectableReasonMask unelectableReason =
-        _getMyUnelectableReason(now, isPriorityTakeover);
+    const UnelectableReasonMask unelectableReason = _getMyUnelectableReason(now, reason);
     if (unelectableReason) {
         return {ErrorCodes::NodeNotElectable,
                 str::stream() << "Not standing for election because "
diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h
index f9f50f62871..5114f012be2 100644
--- a/src/mongo/db/repl/topology_coordinator_impl.h
+++ b/src/mongo/db/repl/topology_coordinator_impl.h
@@ -247,7 +247,7 @@ public:
     virtual MemberData* findMemberDataByMemberId(const int memberId);
     virtual MemberData* findMemberDataByRid(const OID rid);
     virtual MemberData* addSlaveMemberData(const OID rid);
-    virtual Status becomeCandidateIfElectable(const Date_t now, bool isPriorityTakeover);
+    virtual Status becomeCandidateIfElectable(const Date_t now, StartElectionReason reason);
     virtual void setStorageEngineSupportsReadCommitted(bool supported);
 
     virtual void restartHeartbeats();
@@ -295,6 +295,7 @@ private:
         VotedTooRecently = 1 << 8,
         RefusesToStand = 1 << 9,
         NotCloseEnoughToLatestForPriorityTakeover = 1 << 10,
+        NotFreshEnoughForCatchupTakeover = 1 << 11,
     };
     typedef int UnelectableReasonMask;
 
@@ -327,8 +328,13 @@ private:
     // Is our optime close enough to the latest known optime to call for a priority takeover.
     bool _amIFreshEnoughForPriorityTakeover() const;
 
+    // Is the primary node still in catchup mode and is our optime the latest
+    // known optime of all the up nodes.
+    bool _amIFreshEnoughForCatchupTakeover() const;
+
     // Returns reason why "self" member is unelectable
-    UnelectableReasonMask _getMyUnelectableReason(const Date_t now, bool isPriorityTakeover) const;
+    UnelectableReasonMask _getMyUnelectableReason(const Date_t now,
+                                                  StartElectionReason reason) const;
 
     // Returns reason why memberIndex is unelectable
     UnelectableReasonMask _getUnelectableReason(int memberIndex) const;
diff --git a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp
index 72dbefe0def..8370f3595de 100644
--- a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp
@@ -3829,6 +3829,232 @@ TEST_F(HeartbeatResponseTestV1, UpdateHeartbeatDataTermPreventsPriorityTakeover)
     ASSERT_EQUALS(2, getCurrentPrimaryIndex());
 }
 
+TEST_F(TopoCoordTest, FreshestNodeDoesCatchupTakeover) {
+    updateConfig(BSON("_id"
+                      << "rs0"
+                      << "version"
+                      << 5
+                      << "members"
+                      << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                               << "host1:27017")
+                                    << BSON("_id" << 2 << "host"
+                                                  << "host2:27017")
+                                    << BSON("_id" << 3 << "host"
+                                                  << "host3:27017"))
+                      << "protocolVersion"
+                      << 1
+                      << "settings"
+
+                      << BSON("heartbeatTimeoutSecs" << 5)),
+                 0);
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+
+    OpTime currentOptime(Timestamp(200, 1), 0);  // Op term 0; the node's term is set to 1 below.
+    OpTime behindOptime(Timestamp(100, 1), 0);
+
+    // Create a mock heartbeat response to be able to compare who is the freshest node.
+    // The latest heartbeat responses are looked at for determining the latest optime
+    // and therefore freshness for catchup takeover.
+    ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse();
+    hbResp.setState(MemberState::RS_SECONDARY);
+    hbResp.setAppliedOpTime(currentOptime);
+    hbResp.setTerm(1);
+
+    Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z"));
+
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017"));
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017"));
+
+    // Set optimes so that I am the freshest node and strictly ahead of the primary.
+    getTopoCoord().getMyMemberData()->setLastAppliedOpTime(currentOptime, Date_t());
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host3:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    hbResp.setAppliedOpTime(behindOptime);
+    hbResp.setState(MemberState::RS_PRIMARY);  // host2 reports as primary with the older optime.
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host2:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    getTopoCoord().updateTerm(1, Date_t());
+
+    ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(
+        Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover));
+}
+
+TEST_F(TopoCoordTest, StaleNodeDoesntDoCatchupTakeover) {
+    updateConfig(BSON("_id"
+                      << "rs0"
+                      << "version"
+                      << 5
+                      << "members"
+                      << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                               << "host1:27017")
+                                    << BSON("_id" << 2 << "host"
+                                                  << "host2:27017")
+                                    << BSON("_id" << 3 << "host"
+                                                  << "host3:27017"))
+                      << "protocolVersion"
+                      << 1
+                      << "settings"
+
+                      << BSON("heartbeatTimeoutSecs" << 5)),
+                 0);
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+
+    OpTime currentOptime(Timestamp(200, 1), 0);
+    OpTime behindOptime(Timestamp(100, 1), 0);
+
+    // Create a mock heartbeat response to be able to compare who is the freshest node.
+    ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse();
+    hbResp.setState(MemberState::RS_SECONDARY);
+    hbResp.setAppliedOpTime(currentOptime);
+    hbResp.setTerm(1);
+
+    Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z"));
+
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017"));
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017"));
+
+    // Set optimes so that the other (non-primary) node is ahead of me.
+    getTopoCoord().getMyMemberData()->setLastAppliedOpTime(behindOptime, Date_t());
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host3:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    hbResp.setAppliedOpTime(behindOptime);  // The primary (host2) is not ahead of me either.
+    hbResp.setState(MemberState::RS_PRIMARY);
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host2:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    getTopoCoord().updateTerm(1, Date_t());
+
+    Status result = getTopoCoord().becomeCandidateIfElectable(
+        Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover);
+    ASSERT_NOT_OK(result);
+    ASSERT_STRING_CONTAINS(result.reason(),
+                           "member is either not the most up-to-date member or not ahead of the "
+                           "primary, and therefore cannot call for catchup takeover");
+}
+
+TEST_F(TopoCoordTest, NodeDoesntDoCatchupTakeoverHeartbeatSaysPrimaryCaughtUp) {
+    updateConfig(BSON("_id"
+                      << "rs0"
+                      << "version"
+                      << 5
+                      << "members"
+                      << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                               << "host1:27017")
+                                    << BSON("_id" << 2 << "host"
+                                                  << "host2:27017")
+                                    << BSON("_id" << 3 << "host"
+                                                  << "host3:27017"))
+                      << "protocolVersion"
+                      << 1
+                      << "settings"
+
+                      << BSON("heartbeatTimeoutSecs" << 5)),
+                 0);
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+
+    OpTime currentOptime(Timestamp(200, 1), 0);
+
+    // Create a mock heartbeat response to be able to compare who is the freshest node.
+    ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse();
+    hbResp.setState(MemberState::RS_SECONDARY);
+    hbResp.setAppliedOpTime(currentOptime);
+    hbResp.setTerm(1);
+
+    Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z"));
+
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017"));
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017"));
+
+    // Set optimes so that the primary node is caught up with me.
+    getTopoCoord().getMyMemberData()->setLastAppliedOpTime(currentOptime, Date_t());
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host3:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    hbResp.setState(MemberState::RS_PRIMARY);  // Applied optime stays at currentOptime.
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host2:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    getTopoCoord().updateTerm(1, Date_t());
+
+    Status result = getTopoCoord().becomeCandidateIfElectable(
+        Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover);
+    ASSERT_NOT_OK(result);
+    ASSERT_STRING_CONTAINS(result.reason(),
+                           "member is either not the most up-to-date member or not ahead of the "
+                           "primary, and therefore cannot call for catchup takeover");
+}
+
+TEST_F(TopoCoordTest, NodeDoesntDoCatchupTakeoverIfTermNumbersSayPrimaryCaughtUp) {
+    updateConfig(BSON("_id"
+                      << "rs0"
+                      << "version"
+                      << 5
+                      << "members"
+                      << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                               << "host1:27017")
+                                    << BSON("_id" << 2 << "host"
+                                                  << "host2:27017")
+                                    << BSON("_id" << 3 << "host"
+                                                  << "host3:27017"))
+                      << "protocolVersion"
+                      << 1
+                      << "settings"
+
+                      << BSON("heartbeatTimeoutSecs" << 5)),
+                 0);
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+
+    OpTime currentOptime(Timestamp(200, 1), 1);  // Op term 1 equals the term set below.
+    OpTime behindOptime(Timestamp(100, 1), 0);
+
+    // Create a mock heartbeat response to be able to compare who is the freshest node.
+    ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse();
+    hbResp.setState(MemberState::RS_SECONDARY);
+    hbResp.setAppliedOpTime(currentOptime);
+    hbResp.setTerm(1);
+
+    Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z"));
+
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017"));
+    getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017"));
+
+    // Simulates a scenario where the node hasn't received a heartbeat from the primary in a while
+    // but the primary is caught up and has written something. The node has already applied an op
+    // from the current term, so it realizes the primary is caught up.
+    getTopoCoord().getMyMemberData()->setLastAppliedOpTime(currentOptime, Date_t());
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host3:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    hbResp.setAppliedOpTime(behindOptime);
+    hbResp.setState(MemberState::RS_PRIMARY);
+    getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000),
+                                            Milliseconds(999),
+                                            HostAndPort("host2:27017"),
+                                            StatusWith<ReplSetHeartbeatResponse>(hbResp));
+    getTopoCoord().updateTerm(1, Date_t());
+
+    Status result = getTopoCoord().becomeCandidateIfElectable(
+        Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover);
+    ASSERT_NOT_OK(result);
+    ASSERT_STRING_CONTAINS(result.reason(),
+                           "member is either not the most up-to-date member or not ahead of the "
+                           "primary, and therefore cannot call for catchup takeover");
+}
+
 TEST_F(HeartbeatResponseTestV1,
        ScheduleACatchupTakeoverWhenElectableAndReceiveHeartbeatFromPrimaryInCatchup) {
     updateConfig(BSON("_id"
@@ -3973,7 +4199,8 @@ TEST_F(HeartbeatResponseTestV1,
     ASSERT_NO_ACTION(nextAction.getAction());
     ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole());
     // We are electable now.
-    ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(now(), false));
+    ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(
+        now(), TopologyCoordinator::StartElectionReason::kElectionTimeout));
     ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
 }
 
@@ -3998,7 +4225,8 @@ TEST_F(HeartbeatResponseTestV1, ScheduleElectionWhenPrimaryIsMarkedDownAndWeAreE
     ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
     ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole());
     // We are electable now.
-    ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(now(), false));
+    ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(
+        now(), TopologyCoordinator::StartElectionReason::kElectionTimeout));
     ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
 }
 
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index d0ceb2520d1..3666f6e1d21 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -602,8 +602,9 @@ var ReplSetTest = function(opts) {
     };
 
     /**
-     * Blocking call, which will wait for a primary to be elected for some pre-defined timeout and
-     * if primary is available will return a connection to it. Otherwise throws an exception.
+     * Blocking call, which waits up to 'timeout' (default: self.kDefaultTimeoutMS) for a
+     * primary to be elected and become master. If a primary is available, returns a connection
+     * to it. Otherwise throws an exception.
      */
     this.getPrimary = function(timeout) {
         timeout = timeout || self.kDefaultTimeoutMS;
author	Samy Lanka <samy.lanka@10gen.com>	2017-07-05 15:18:06 -0400
committer	Samy Lanka <samy.lanka@10gen.com>	2017-08-02 15:29:26 -0400
commit	5f1ce8b6765a25d45ba5e35063db417b3069c8d6 (patch)
tree	de942dd067b10bb2ae70f3a5b0f5601cd7fa87ec
parent	743119c879ab2e5d1d8ca05aadf6fe29b1526a94 (diff)
download	mongo-5f1ce8b6765a25d45ba5e35063db417b3069c8d6.tar.gz