author    Samyukta Lanka <samy.lanka@mongodb.com>  2019-09-26 17:19:01 +0000
committer evergreen <evergreen@mongodb.com>  2019-09-26 17:19:01 +0000
commit    1467632ac13b8f55f3ec53299619db44e8b2037a (patch)
tree      33b018f35124a52c2e0b9e3e5fd50ae51310404d
parent    39bc70a2df250696b983bb26c7aba4e0c8f2d3ac (diff)
download  mongo-1467632ac13b8f55f3ec53299619db44e8b2037a.tar.gz
SERVER-41500 Track number of successful elections for each reason in serverStatus
(cherry picked from commit bf4f91a6087227295007535bf143a8dd20e6a6d5)

SERVER-41510 Track the number of failed stepDowns in serverStatus
(cherry picked from commit 4e8fbd52ca8cda811dc53bb160f3ca5074355dab)

SERVER-41502 Track the number of times primary catchup concludes for each reason in serverStatus
(cherry picked from commit 796214e8426b3036f4eb4bdcc4af9d218589778f)
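Note: all of the counters touched by this change are reported under the 'electionMetrics' section of serverStatus. A minimal shell check of the new fields might look like the following; the field names are the ones exercised by the tests in this diff, and the output shape shown in the comments is an assumption.

// Hypothetical shell snippet: inspect the election metrics added by this change.
const status = db.adminCommand({serverStatus: 1});
const metrics = status.electionMetrics;

// Election reason counters now carry both 'called' and 'successful' subfields,
// e.g. { called: NumberLong(1), successful: NumberLong(1) } (shape is an assumption).
printjson(metrics.stepUpCmd);
printjson(metrics.freezeTimeout);

// New counters for how primary catchup concluded.
print(metrics.numCatchUpsSucceeded);
print(metrics.numCatchUpsAlreadyCaughtUp);
print(metrics.numCatchUpsSkipped);
print(metrics.numCatchUpsTimedOut);
print(metrics.numCatchUpsFailedWithError);
print(metrics.numCatchUpsFailedWithNewTerm);
print(metrics.numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd);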
-rw-r--r--  jstests/replsets/catchup.js  59
-rw-r--r--  jstests/replsets/catchup_takeover_two_nodes_ahead.js  19
-rw-r--r--  jstests/replsets/freeze_timeout.js  56
-rw-r--r--  jstests/replsets/libs/election_metrics.js  27
-rw-r--r--  jstests/replsets/priority_takeover_one_node_higher_priority.js  12
-rw-r--r--  jstests/replsets/replset1.js  7
-rw-r--r--  jstests/replsets/server_election_metrics.js  16
-rw-r--r--  jstests/replsets/stepdown.js  94
-rw-r--r--  jstests/replsets/stepup.js  37
-rw-r--r--  src/mongo/db/repl/bgsync.cpp  12
-rw-r--r--  src/mongo/db/repl/election_reason_counter.h  6
-rw-r--r--  src/mongo/db/repl/repl_set_commands.cpp  18
-rw-r--r--  src/mongo/db/repl/replication_coordinator.h  12
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp  33
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h  13
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp  25
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp  240
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_test.cpp  39
-rw-r--r--  src/mongo/db/repl/replication_coordinator_mock.cpp  2
-rw-r--r--  src/mongo/db/repl/replication_coordinator_mock.h  2
-rw-r--r--  src/mongo/db/repl/replication_metrics.cpp  132
-rw-r--r--  src/mongo/db/repl/replication_metrics.h  15
-rw-r--r--  src/mongo/db/repl/replication_metrics.idl  33
-rw-r--r--  src/mongo/embedded/replication_coordinator_embedded.cpp  2
-rw-r--r--  src/mongo/embedded/replication_coordinator_embedded.h  2
25 files changed, 826 insertions(+), 87 deletions(-)
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 5b5dabf57ae..d8a8b3a5bf6 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -99,14 +99,29 @@
rst.awaitReplication();
jsTest.log("Case 1: The primary is up-to-date after refreshing heartbeats.");
+ let initialNewPrimaryStatus =
+ assert.commandWorked(rst.getSecondary().adminCommand({serverStatus: 1}));
+
// Should complete transition to primary immediately.
var newPrimary = stepUpNode(rst.getSecondary());
// Should win an election and finish the transition very quickly.
assert.eq(newPrimary, rst.getPrimary());
rst.awaitReplication();
+ // Check that the 'numCatchUps' field has not been incremented in serverStatus.
+ let newNewPrimaryStatus = assert.commandWorked(newPrimary.adminCommand({serverStatus: 1}));
+ verifyServerStatusChange(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUps',
+ 0);
+ // Check that the 'numCatchUpsAlreadyCaughtUp' field has been incremented in serverStatus, and
+ // that none of the other reasons for catchup concluding has been incremented.
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsAlreadyCaughtUp');
+
jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
- let initialNewPrimaryStatus =
+ initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
@@ -117,12 +132,17 @@
assert.eq(stepUpResults.newPrimary, rst.getPrimary());
// Check that the 'numCatchUps' field has been incremented in serverStatus.
- let newNewPrimaryStatus =
+ newNewPrimaryStatus =
assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
verifyServerStatusChange(initialNewPrimaryStatus.electionMetrics,
newNewPrimaryStatus.electionMetrics,
'numCatchUps',
1);
+ // Check that the 'numCatchUpsSucceeded' field has been incremented in serverStatus, and that
+ // none of the other reasons for catchup concluding has been incremented.
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsSucceeded');
// Wait for all secondaries to catch up
rst.awaitReplication();
@@ -160,6 +180,9 @@
rst.awaitReplication();
jsTest.log("Case 4: The primary needs to catch up, fails due to timeout.");
+ initialNewPrimaryStatus =
+ assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
+
// Reconfig to make the catchup timeout shorter.
reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, 10 * 1000);
@@ -169,6 +192,14 @@
restartServerReplication(stepUpResults.newPrimary);
assert.eq(stepUpResults.newPrimary, rst.getPrimary());
+ // Check that the 'numCatchUpsTimedOut' field has been incremented in serverStatus, and that
+ // none of the other reasons for catchup concluding has been incremented.
+ newNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsTimedOut');
+
// Wait for the no-op "new primary" after winning an election, so that we know it has
// finished transition to primary.
assert.soon(function() {
@@ -185,9 +216,21 @@
reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, -1);
stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+ initialNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+
// Abort catchup.
assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetAbortPrimaryCatchUp: 1}));
+ // Check that the 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd' field has been
+ // incremented in serverStatus, and that none of the other reasons for catchup concluding has
+ // been incremented.
+ newNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd');
+
// Wait for the no-op "new primary" after winning an election, so that we know it has
// finished transition to primary.
assert.soon(function() {
@@ -201,6 +244,9 @@
checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 0);
jsTest.log("Case 6: The primary needs to catch up with no timeout, but steps down.");
+ initialNewPrimaryStatus =
+ assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
+
var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
// Step-down command should abort catchup.
@@ -209,6 +255,15 @@
} catch (e) {
print(e);
}
+
+ // Check that the 'numCatchUpsFailedWithError' field has been incremented in serverStatus, and
+ // that none of the other reasons for catchup concluding has been incremented.
+ newNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsFailedWithError');
+
// Rename the primary.
var steppedDownPrimary = stepUpResults.newPrimary;
var newPrimary = rst.getPrimary();
diff --git a/jstests/replsets/catchup_takeover_two_nodes_ahead.js b/jstests/replsets/catchup_takeover_two_nodes_ahead.js
index 69dddd079a1..1efe741da51 100644
--- a/jstests/replsets/catchup_takeover_two_nodes_ahead.js
+++ b/jstests/replsets/catchup_takeover_two_nodes_ahead.js
@@ -14,6 +14,7 @@
'use strict';
load('jstests/replsets/rslib.js');
+ load('jstests/replsets/libs/election_metrics.js');
var name = 'catchup_takeover_two_nodes_ahead';
var replSet = new ReplSetTest({name: name, nodes: 5});
@@ -35,6 +36,9 @@
writeConcern = {writeConcern: {w: 1, wtimeout: replSet.kDefaultTimeoutMS}};
assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern));
+ const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ const initialNode2Status = assert.commandWorked(nodes[2].adminCommand({serverStatus: 1}));
+
// Step up one of the lagged nodes.
assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1}));
replSet.awaitNodesAgreeOnPrimary();
@@ -52,6 +56,21 @@
// after the default catchup delay.
replSet.waitForState(0, ReplSetTest.State.PRIMARY, 60 * 1000);
+ // Check that both the 'called' and 'successful' fields of the 'catchUpTakeover' election reason
+ // counter have been incremented in serverStatus.
+ const newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(initialPrimaryStatus.electionMetrics,
+ newPrimaryStatus.electionMetrics,
+ "catchUpTakeover",
+ 1);
+
+ // Check that the 'numCatchUpsFailedWithNewTerm' field has been incremented in serverStatus, and
+ // that none of the other reasons for catchup concluding has been incremented.
+ const newNode2Status = assert.commandWorked(nodes[2].adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNode2Status.electionMetrics,
+ newNode2Status.electionMetrics,
+ 'numCatchUpsFailedWithNewTerm');
+
// Wait until the old primary steps down so the connections won't be closed.
replSet.waitForState(2, ReplSetTest.State.SECONDARY, replSet.kDefaultTimeoutMS);
// Let the nodes catchup.
diff --git a/jstests/replsets/freeze_timeout.js b/jstests/replsets/freeze_timeout.js
new file mode 100644
index 00000000000..e61a3ea65f3
--- /dev/null
+++ b/jstests/replsets/freeze_timeout.js
@@ -0,0 +1,56 @@
+/**
+ * Tests that the freezeTimeout election reason counter in serverStatus is incremented in a single
+ * node replica set both after a freeze timeout and after a stepdown timeout expires.
+ */
+(function() {
+ "use strict";
+ load('jstests/replsets/libs/election_metrics.js');
+
+ jsTestLog('1: initialize single node replica set');
+ const replSet = new ReplSetTest({name: 'freeze_timeout', nodes: 1});
+ const nodes = replSet.startSet();
+ const config = replSet.getReplSetConfig();
+ replSet.initiate(config);
+ replSet.awaitReplication();
+ let primary = replSet.getPrimary();
+ const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+
+ jsTestLog('2: step down primary');
+ assert.throws(function() {
+ primary.getDB("admin").runCommand({replSetStepDown: 10, force: 1});
+ });
+
+ jsTestLog('3: wait for stepped down node to become primary again');
+ primary = replSet.getPrimary();
+
+ // Check that both the 'called' and 'successful' fields of the 'freezeTimeout' election reason
+ // counter have been incremented in serverStatus. When a stepdown timeout expires in a single
+ // node replica set, an election is called for the same reason as is used when a freeze timeout
+ // expires.
+ let newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(
+ initialPrimaryStatus.electionMetrics, newPrimaryStatus.electionMetrics, "freezeTimeout", 1);
+
+ jsTestLog('4: step down primary again');
+ assert.throws(function() {
+ primary.getDB("admin").runCommand({replSetStepDown: 10, force: 1});
+ });
+
+ jsTestLog('5: freeze stepped down primary for 30 seconds');
+ primary.getDB("admin").runCommand({replSetFreeze: 30});
+ sleep(1000);
+
+ jsTestLog('6: unfreeze stepped down primary after waiting for 1 second');
+ primary.getDB("admin").runCommand({replSetFreeze: 0});
+
+ jsTestLog('7: wait for unfrozen node to become primary again');
+ primary = replSet.getPrimary();
+
+ // Check that both the 'called' and 'successful' fields of the 'freezeTimeout' election reason
+ // counter have been incremented again in serverStatus.
+ newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(
+ initialPrimaryStatus.electionMetrics, newPrimaryStatus.electionMetrics, "freezeTimeout", 2);
+
+ replSet.stopSet();
+})();
diff --git a/jstests/replsets/libs/election_metrics.js b/jstests/replsets/libs/election_metrics.js
index 91d6b4815e3..36307b2117e 100644
--- a/jstests/replsets/libs/election_metrics.js
+++ b/jstests/replsets/libs/election_metrics.js
@@ -41,3 +41,30 @@ function verifyServerStatusChange(initialStatus, newStatus, fieldName, expectedI
newStatus[fieldName],
`expected '${fieldName}' to increase by ${expectedIncrement}`);
}
+
+/**
+ * Verifies that the given reason for primary catchup concluding is incremented in serverStatus, and
+ * that none of the other reasons are.
+ */
+function verifyCatchUpConclusionReason(initialStatus, newStatus, fieldName) {
+ const catchUpConclusionMetrics = [
+ "numCatchUpsSucceeded",
+ "numCatchUpsAlreadyCaughtUp",
+ "numCatchUpsSkipped",
+ "numCatchUpsTimedOut",
+ "numCatchUpsFailedWithError",
+ "numCatchUpsFailedWithNewTerm",
+ "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd"
+ ];
+
+ catchUpConclusionMetrics.forEach(function(metric) {
+ if (metric === fieldName) {
+ assert.eq(initialStatus[metric] + 1,
+ newStatus[metric],
+ `expected '${metric}' to increase by 1`);
+ } else {
+ assert.eq(
+ initialStatus[metric], newStatus[metric], `did not expect '${metric}' to increase`);
+ }
+ });
+}
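
Note: the tests in this change also call verifyServerStatusElectionReasonCounterChange and verifyServerStatusElectionReasonCounterValue, which live in this library file but are outside the hunk shown above. A sketch of what the 'change' variant presumably asserts, given the 'called' and 'successful' subfields verified in server_election_metrics.js, is:

// Hypothetical sketch of the helper used throughout the tests in this change; the real
// implementation is earlier in election_metrics.js and is not shown in this hunk.
function verifyServerStatusElectionReasonCounterChange(
    initialStatus, newStatus, fieldName, expectedIncrement) {
    assert.eq(initialStatus[fieldName]["called"] + expectedIncrement,
              newStatus[fieldName]["called"],
              `expected the 'called' field of '${fieldName}' to increase by ${expectedIncrement}`);
    assert.eq(initialStatus[fieldName]["successful"] + expectedIncrement,
              newStatus[fieldName]["successful"],
              `expected the 'successful' field of '${fieldName}' to increase by ${expectedIncrement}`);
}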
diff --git a/jstests/replsets/priority_takeover_one_node_higher_priority.js b/jstests/replsets/priority_takeover_one_node_higher_priority.js
index b4385cc2376..b39bd47545f 100644
--- a/jstests/replsets/priority_takeover_one_node_higher_priority.js
+++ b/jstests/replsets/priority_takeover_one_node_higher_priority.js
@@ -6,6 +6,7 @@
(function() {
'use strict';
load('jstests/replsets/rslib.js');
+ load('jstests/replsets/libs/election_metrics.js');
var name = 'priority_takeover_one_node_higher_priority';
var replSet = new ReplSetTest({
@@ -22,6 +23,8 @@
replSet.waitForState(replSet.nodes[0], ReplSetTest.State.PRIMARY);
var primary = replSet.getPrimary();
+ const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+
replSet.awaitSecondaryNodes();
replSet.awaitReplication();
@@ -42,5 +45,14 @@
// Eventually node 0 will stand for election again because it has a higher priority.
replSet.waitForState(replSet.nodes[0], ReplSetTest.State.PRIMARY);
+
+ // Check that both the 'called' and 'successful' fields of the 'priorityTakeover' election
+ // reason counter have been incremented in serverStatus.
+ const newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(initialPrimaryStatus.electionMetrics,
+ newPrimaryStatus.electionMetrics,
+ "priorityTakeover",
+ 1);
+
replSet.stopSet();
})();
diff --git a/jstests/replsets/replset1.js b/jstests/replsets/replset1.js
index 05aee648f21..8b1e72d6e46 100644
--- a/jstests/replsets/replset1.js
+++ b/jstests/replsets/replset1.js
@@ -2,6 +2,7 @@ var ssl_options1;
var ssl_options2;
var ssl_name;
load("jstests/replsets/rslib.js");
+load('jstests/replsets/libs/election_metrics.js');
var doTest = function(signal) {
// Test basic replica set functionality.
@@ -29,6 +30,12 @@ var doTest = function(signal) {
// elected master.
var master = replTest.getPrimary();
+ // Check that both the 'called' and 'successful' fields of the 'electionTimeout' election reason
+ // counter have been incremented in serverStatus.
+ const primaryStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterValue(
+ primaryStatus.electionMetrics, "electionTimeout", 1);
+
// Ensure the primary logs an n-op to the oplog upon transitioning to primary.
assert.gt(master.getDB("local").oplog.rs.count({op: 'n', o: {msg: 'new primary'}}), 0);
diff --git a/jstests/replsets/server_election_metrics.js b/jstests/replsets/server_election_metrics.js
index ceae6a26ee2..23d018287ef 100644
--- a/jstests/replsets/server_election_metrics.js
+++ b/jstests/replsets/server_election_metrics.js
@@ -1,10 +1,10 @@
/**
- * Tests the format of electionMetrics serverStatus section.
+ * Tests the format of 'electionMetrics' serverStatus section.
*/
(function() {
"use strict";
- // Verifies that the electionMetrics serverStatus section has the given field.
+ // Verifies that the 'electionMetrics' serverStatus section has the given field.
function verifyElectionMetricsField(serverStatusResponse, fieldName) {
assert(serverStatusResponse.electionMetrics.hasOwnProperty(fieldName),
() => (`The 'electionMetrics' serverStatus section did not have the '${fieldName}' \
@@ -12,7 +12,7 @@ field: \n${tojson(serverStatusResponse.electionMetrics)}`));
return serverStatusResponse.electionMetrics[fieldName];
}
- // Verifies that the electionMetrics serverStatus section has a field for the given election
+ // Verifies that the 'electionMetrics' serverStatus section has a field for the given election
// reason counter and that it has the subfields 'called' and 'successful'.
function verifyElectionReasonCounterFields(serverStatusResponse, fieldName) {
const field = verifyElectionMetricsField(serverStatusResponse, fieldName);
@@ -24,7 +24,7 @@ not have the 'called' field: \n${tojson(field)}`));
not have the 'successful' field: \n${tojson(field)}`));
}
- // Verifies the format of the electionMetrics serverStatus section.
+ // Verifies the format of the 'electionMetrics' serverStatus section.
function verifyElectionMetricsSSS(serverStatusResponse) {
assert(serverStatusResponse.hasOwnProperty("electionMetrics"),
() => (`Expected the serverStatus response to have an 'electionMetrics' field:
@@ -36,6 +36,14 @@ ${tojson(serverStatusResponse)}`));
verifyElectionReasonCounterFields(serverStatusResponse, "freezeTimeout");
verifyElectionMetricsField(serverStatusResponse, "numStepDownsCausedByHigherTerm");
verifyElectionMetricsField(serverStatusResponse, "numCatchUps");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsSucceeded");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsAlreadyCaughtUp");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsSkipped");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsTimedOut");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsFailedWithError");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsFailedWithNewTerm");
+ verifyElectionMetricsField(serverStatusResponse,
+ "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd");
}
// Set up the replica set.
diff --git a/jstests/replsets/stepdown.js b/jstests/replsets/stepdown.js
index acfbbb0f6ee..78703f0ce98 100644
--- a/jstests/replsets/stepdown.js
+++ b/jstests/replsets/stepdown.js
@@ -2,6 +2,9 @@
* Check that on a loss of primary, another node doesn't assume primary if it is stale. We force a
* stepDown to test this.
*
+ * This test also checks that the serverStatus command metrics replSetStepDown and
+ * replSetStepDownWithForce are incremented correctly.
+ *
* This test requires the fsync command to force a secondary to be stale.
* @tags: [requires_fsync]
*/
@@ -61,7 +64,7 @@ try {
assert.writeOK(master.getDB("foo").bar.insert({x: i}));
}
- const intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ let intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
jsTestLog('Do stepdown of primary ' + master + ' that should not work');
@@ -70,17 +73,57 @@ try {
'Step down ' + master + ' expected error: ' +
tojson(assert.commandFailed(master.getDB("admin").runCommand({replSetStepDown: 10}))));
- // Check that the 'total' field of 'replSetStepDown' has been incremented in serverStatus and
- // that it has not been incremented for 'replSetStepDownWithForce'.
+ // Check that the 'total' and 'failed' fields of 'replSetStepDown' have been incremented in
+ // serverStatus and that they have not been incremented for 'replSetStepDownWithForce'.
let newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
newServerStatus.metrics.commands.replSetStepDown,
"total",
1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 1);
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
newServerStatus.metrics.commands.replSetStepDownWithForce,
"total",
0);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 0);
+
+ // This section checks that the metrics are incremented accurately when the command fails due to
+ // an error occurring before stepDown is called in the replication coordinator, such as due to
+ // bad values or type mismatches in the arguments, or checkReplEnabledForCommand returning a bad
+ // status. The stepdown period being negative is one example of such an error, but success in
+ // this case gives us confidence that the behavior in the other cases is the same.
+
+ // Stepdown should fail because the stepdown period is negative
+ jsTestLog('Do stepdown of primary ' + master + ' that should not work');
+ assert.commandFailedWithCode(
+ master.getDB("admin").runCommand({replSetStepDown: -1, force: true}), ErrorCodes.BadValue);
+
+ // Check that the 'total' and 'failed' fields of 'replSetStepDown' and
+ // 'replSetStepDownWithForce' have been incremented in serverStatus.
+ intitialServerStatus = newServerStatus;
+ newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 1);
// The server will disconnect the client on a successful forced stepdown so we use the
// presence of an exception to confirm the forced stepdown result.
@@ -93,22 +136,63 @@ try {
tojson(exceptionFromForcedStepDown));
// Check that the 'total' fields of 'replSetStepDown' and 'replSetStepDownWithForce' have been
- // incremented in serverStatus.
+ // incremented in serverStatus and that their 'failed' fields have not been incremented.
+ intitialServerStatus = newServerStatus;
newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
newServerStatus.metrics.commands.replSetStepDown,
"total",
- 2);
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 0);
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
newServerStatus.metrics.commands.replSetStepDownWithForce,
"total",
1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 0);
jsTestLog('Checking isMaster on ' + master);
var r2 = assert.commandWorked(master.getDB("admin").runCommand({ismaster: 1}));
jsTestLog('Result from running isMaster on ' + master + ': ' + tojson(r2));
assert.eq(r2.ismaster, false);
assert.eq(r2.secondary, true);
+
+ // This section checks that the metrics are incremented accurately when the command fails due to
+ // an error while stepping down. This is one reason the replSetStepDown command could fail once
+ // we call stepDown in the replication coordinator, but success in this case gives us confidence
+ // that the behavior in the other cases is the same.
+
+ // Stepdown should fail because the node is no longer primary
+ jsTestLog('Do stepdown of primary ' + master + ' that should not work');
+ assert.commandFailedWithCode(master.getDB("admin").runCommand(
+ {replSetStepDown: ReplSetTest.kDefaultTimeoutMS, force: true}),
+ ErrorCodes.NotMaster);
+
+ // Check that the 'total' and 'failed' fields of 'replSetStepDown' and
+ // 'replSetStepDownWithForce' have been incremented in serverStatus.
+ intitialServerStatus = newServerStatus;
+ newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 1);
} catch (e) {
throw e;
} finally {
diff --git a/jstests/replsets/stepup.js b/jstests/replsets/stepup.js
index 93586bfcae7..4c5d20d42c4 100644
--- a/jstests/replsets/stepup.js
+++ b/jstests/replsets/stepup.js
@@ -1,6 +1,7 @@
// Tests the replSetStepUp command.
load("jstests/replsets/rslib.js");
+load('jstests/replsets/libs/election_metrics.js');
(function() {
"use strict";
@@ -44,23 +45,29 @@ load("jstests/replsets/rslib.js");
// Make sure the step up succeeded.
assert.eq(secondary, rst.getPrimary());
- // Verifies that the given election reason counter is incremented in the way we expect.
- function verifyServerStatusChange(initialStatus, newStatus, fieldName, expectedIncrement) {
- assert.eq(
- initialStatus.electionMetrics[fieldName]["called"] + expectedIncrement,
- newStatus.electionMetrics[fieldName]["called"],
- "expected the 'called' field of " + fieldName + " to increase by " + expectedIncrement);
- }
-
const newSecondaryStatus = assert.commandWorked(secondary.adminCommand({serverStatus: 1}));
- // Check that the 'called' field of stepUpCmd has been incremented in serverStatus, and that it
- // has not been incremented in any of the other election reason counters.
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "stepUpCmd", 1);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "priorityTakeover", 0);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "catchUpTakeover", 0);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "electionTimeout", 0);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "freezeTimeout", 0);
+ // Check that both the 'called' and 'successful' fields of stepUpCmd have been incremented in
+ // serverStatus, and that they have not been incremented in any of the other election reason
+ // counters.
+ verifyServerStatusElectionReasonCounterChange(
+ initialSecondaryStatus.electionMetrics, newSecondaryStatus.electionMetrics, "stepUpCmd", 1);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "priorityTakeover",
+ 0);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "catchUpTakeover",
+ 0);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "electionTimeout",
+ 0);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "freezeTimeout",
+ 0);
rst.stopSet();
})();
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 08502a9b66b..1acac0bbca6 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -347,7 +347,11 @@ void BackgroundSync::_produce() {
log() << "Our newest OpTime : " << lastOpTimeFetched;
log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
<< " from " << syncSourceResp.getSyncSource();
- _replCoord->abortCatchupIfNeeded().transitional_ignore();
+ auto status = _replCoord->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError);
+ if (!status.isOK()) {
+ LOG(1) << "Aborting catch-up failed with status: " << status;
+ }
return;
}
@@ -596,7 +600,11 @@ void BackgroundSync::_runRollback(OperationContext* opCtx,
StorageInterface* storageInterface) {
if (_replCoord->getMemberState().primary()) {
warning() << "Rollback situation detected in catch-up mode. Aborting catch-up mode.";
- _replCoord->abortCatchupIfNeeded().transitional_ignore();
+ auto status = _replCoord->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError);
+ if (!status.isOK()) {
+ LOG(1) << "Aborting catch-up failed with status: " << status;
+ }
return;
}
diff --git a/src/mongo/db/repl/election_reason_counter.h b/src/mongo/db/repl/election_reason_counter.h
index 8dd068181b5..801d9855a45 100644
--- a/src/mongo/db/repl/election_reason_counter.h
+++ b/src/mongo/db/repl/election_reason_counter.h
@@ -41,11 +41,17 @@ class ElectionReasonCounter : public ElectionReasonCounterBase {
public:
using ElectionReasonCounterBase::getCalled;
using ElectionReasonCounterBase::setCalled;
+ using ElectionReasonCounterBase::getSuccessful;
+ using ElectionReasonCounterBase::setSuccessful;
void incrementCalled() {
setCalled(getCalled() + 1);
}
+ void incrementSuccessful() {
+ setSuccessful(getSuccessful() + 1);
+ }
+
ElectionReasonCounter parse(const IDLParserErrorContext& ctxt, const BSONObj& bsonObject);
};
diff --git a/src/mongo/db/repl/repl_set_commands.cpp b/src/mongo/db/repl/repl_set_commands.cpp
index d5624256df8..48f9c96f4e7 100644
--- a/src/mongo/db/repl/repl_set_commands.cpp
+++ b/src/mongo/db/repl/repl_set_commands.cpp
@@ -455,7 +455,9 @@ public:
CmdReplSetStepDown()
: ReplSetCommand("replSetStepDown"),
_stepDownCmdsWithForceExecutedMetric("commands.replSetStepDownWithForce.total",
- &_stepDownCmdsWithForceExecuted) {}
+ &_stepDownCmdsWithForceExecuted),
+ _stepDownCmdsWithForceFailedMetric("commands.replSetStepDownWithForce.failed",
+ &_stepDownCmdsWithForceFailed) {}
virtual bool run(OperationContext* opCtx,
const string&,
const BSONObj& cmdObj,
@@ -466,6 +468,12 @@ public:
_stepDownCmdsWithForceExecuted.increment();
}
+ auto onExitGuard = MakeGuard([&] {
+ if (force) {
+ _stepDownCmdsWithForceFailed.increment();
+ }
+ });
+
Status status = ReplicationCoordinator::get(opCtx)->checkReplEnabledForCommand(&result);
uassertStatusOK(status);
@@ -508,12 +516,16 @@ public:
status = ReplicationCoordinator::get(opCtx)->stepDown(
opCtx, force, Seconds(secondaryCatchUpPeriodSecs), Seconds(stepDownForSecs));
uassertStatusOK(status);
+
+ onExitGuard.Dismiss();
return true;
}
private:
mutable Counter64 _stepDownCmdsWithForceExecuted;
+ mutable Counter64 _stepDownCmdsWithForceFailed;
ServerStatusMetricField<Counter64> _stepDownCmdsWithForceExecutedMetric;
+ ServerStatusMetricField<Counter64> _stepDownCmdsWithForceFailedMetric;
ActionSet getAuthActionSet() const override {
return ActionSet{ActionType::replSetStateChange};
@@ -839,7 +851,9 @@ public:
uassertStatusOK(status);
log() << "Received replSetAbortPrimaryCatchUp request";
- status = ReplicationCoordinator::get(opCtx)->abortCatchupIfNeeded();
+ status = ReplicationCoordinator::get(opCtx)->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::
+ kFailedWithReplSetAbortPrimaryCatchUpCmd);
if (!status.isOK()) {
log() << "replSetAbortPrimaryCatchUp request failed" << causedBy(status);
}
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 78fed57782c..9dfdeb9fe90 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -826,10 +826,20 @@ public:
virtual ServiceContext* getServiceContext() = 0;
+ enum PrimaryCatchUpConclusionReason {
+ kSucceeded,
+ kAlreadyCaughtUp,
+ kSkipped,
+ kTimedOut,
+ kFailedWithError,
+ kFailedWithNewTerm,
+ kFailedWithReplSetAbortPrimaryCatchUpCmd
+ };
+
/**
* Abort catchup if the node is in catchup mode.
*/
- virtual Status abortCatchupIfNeeded() = 0;
+ virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) = 0;
/**
* Signals that drop pending collections have been removed from storage.
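
Note: the new PrimaryCatchUpConclusionReason values appear to map one-to-one onto the numCatchUps* counters exercised by the jstests above; this mapping is inferred from the tests and metrics code in this diff rather than stated in the header, and is sketched here only for orientation.

// Illustrative mapping (assumption inferred from the tests in this change): each
// PrimaryCatchUpConclusionReason value increments the matching electionMetrics counter.
const catchUpConclusionReasonToMetric = {
    kSucceeded: "numCatchUpsSucceeded",
    kAlreadyCaughtUp: "numCatchUpsAlreadyCaughtUp",
    kSkipped: "numCatchUpsSkipped",
    kTimedOut: "numCatchUpsTimedOut",
    kFailedWithError: "numCatchUpsFailedWithError",
    kFailedWithNewTerm: "numCatchUpsFailedWithNewTerm",
    kFailedWithReplSetAbortPrimaryCatchUpCmd: "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd",
};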
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 5b42bc5be0a..e1bb889c80a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2640,7 +2640,6 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock(
_canAcceptNonLocalWrites.store(canAcceptWrites);
}
-
const MemberState newState = _topCoord->getMemberState();
if (newState == _memberState) {
if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate) {
@@ -2674,7 +2673,16 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock(
// Exit catchup mode if we're in it and enable replication producer and applier on stepdown.
if (_memberState.primary()) {
if (_catchupState) {
- _catchupState->abort_inlock();
+ // _pendingTermUpdateDuringStepDown is set before stepping down due to hearing about a
+ // higher term, so that we can remember the term we heard and update our term as part of
+ // finishing stepdown. It is then unset toward the end of stepdown, after the function
+ // we are in is called. Thus we must be stepping down due to seeing a higher term if and
+ // only if _pendingTermUpdateDuringStepDown is set here.
+ if (_pendingTermUpdateDuringStepDown) {
+ _catchupState->abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithNewTerm);
+ } else {
+ _catchupState->abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithError);
+ }
}
_applierState = ApplierState::Running;
_externalState->startProducerIfStopped();
@@ -2825,7 +2833,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// No catchup in single node replica set.
if (_repl->_rsConfig.getNumMembers() == 1) {
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -2834,7 +2842,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// When catchUpTimeoutMillis is 0, we skip doing catchup entirely.
if (catchupTimeout == ReplSetConfig::kCatchUpDisabled) {
log() << "Skipping primary catchup since the catchup timeout is 0.";
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -2849,7 +2857,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
return;
}
log() << "Catchup timed out after becoming primary.";
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kTimedOut);
};
// Deal with infinity and overflow - no timeout.
@@ -2862,15 +2870,18 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
auto status = _repl->_replExecutor->scheduleWorkAt(timeoutDate, timeoutCB);
if (!status.isOK()) {
log() << "Failed to schedule catchup timeout work.";
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithError);
return;
}
_timeoutCbh = status.getValue();
}
-void ReplicationCoordinatorImpl::CatchupState::abort_inlock() {
+void ReplicationCoordinatorImpl::CatchupState::abort_inlock(PrimaryCatchUpConclusionReason reason) {
invariant(_repl->_getMemberState_inlock().primary());
+ ReplicationMetrics::get(getGlobalServiceContext())
+ .incrementNumCatchUpsConcludedForReason(reason);
+
log() << "Exited primary catch-up mode.";
// Clean up its own members.
if (_timeoutCbh) {
@@ -2898,7 +2909,7 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (*targetOpTime <= myLastApplied) {
log() << "Caught up to the latest optime known via heartbeats after becoming primary. "
<< "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied;
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kAlreadyCaughtUp);
return;
}
@@ -2923,14 +2934,14 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (*targetOpTime <= myLastApplied) {
log() << "Caught up to the latest known optime successfully after becoming primary. "
<< "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied;
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kSucceeded);
}
};
_waiter = stdx::make_unique<CallbackWaiter>(*targetOpTime, targetOpTimeCB);
_repl->_opTimeWaiterList.add_inlock(_waiter.get());
}
-Status ReplicationCoordinatorImpl::abortCatchupIfNeeded() {
+Status ReplicationCoordinatorImpl::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
if (!isV1ElectionProtocol()) {
return Status(ErrorCodes::CommandNotSupported,
"Primary catch-up is only supported by Protocol Version 1");
@@ -2938,7 +2949,7 @@ Status ReplicationCoordinatorImpl::abortCatchupIfNeeded() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
if (_catchupState) {
- _catchupState->abort_inlock();
+ _catchupState->abort_inlock(reason);
return Status::OK();
}
return Status(ErrorCodes::IllegalOperation, "The node is not in catch-up mode.");
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 8dfd864e3e6..31cc38b801e 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -312,7 +312,7 @@ public:
virtual Status stepUpIfEligible(bool skipDryRun) override;
- virtual Status abortCatchupIfNeeded() override;
+ virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
void signalDropPendingCollectionsRemovedFromStorage() final;
@@ -553,7 +553,7 @@ private:
// start() can only be called once.
void start_inlock();
// Reset the state itself to destruct the state.
- void abort_inlock();
+ void abort_inlock(PrimaryCatchUpConclusionReason reason);
// Heartbeat calls this function to update the target optime.
void signalHeartbeatUpdate_inlock();
@@ -863,12 +863,14 @@ private:
* This job will be scheduled to run in DB worker threads.
*/
void _writeLastVoteForMyElection(LastVote lastVote,
- const executor::TaskExecutor::CallbackArgs& cbData);
+ const executor::TaskExecutor::CallbackArgs& cbData,
+ TopologyCoordinator::StartElectionReason reason);
/**
* Starts VoteRequester to run the real election when last vote write has completed.
*/
- void _startVoteRequester_inlock(long long newTerm);
+ void _startVoteRequester_inlock(long long newTerm,
+ TopologyCoordinator::StartElectionReason reason);
/**
* Callback called when the VoteRequester has completed; checks the results and
@@ -876,7 +878,8 @@ private:
* "originalTerm" was the term during which the election began, if the term has since
* changed, do not step up as primary.
*/
- void _onVoteRequestComplete(long long originalTerm);
+ void _onVoteRequestComplete(long long originalTerm,
+ TopologyCoordinator::StartElectionReason reason);
/**
* Callback called after a random delay, to prevent repeated election ties.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index 607e28fcc2a..c5c8c44df93 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -233,8 +233,8 @@ void ReplicationCoordinatorImpl::_startRealElection_inlock(
LastVote lastVote{newTerm, _selfIndex};
auto cbStatus = _replExecutor->scheduleWork(
- [this, lastVote](const executor::TaskExecutor::CallbackArgs& cbData) {
- _writeLastVoteForMyElection(lastVote, cbData);
+ [this, lastVote, reason](const executor::TaskExecutor::CallbackArgs& cbData) {
+ _writeLastVoteForMyElection(lastVote, cbData, reason);
});
if (cbStatus.getStatus() == ErrorCodes::ShutdownInProgress) {
return;
@@ -244,7 +244,9 @@ void ReplicationCoordinatorImpl::_startRealElection_inlock(
}
void ReplicationCoordinatorImpl::_writeLastVoteForMyElection(
- LastVote lastVote, const executor::TaskExecutor::CallbackArgs& cbData) {
+ LastVote lastVote,
+ const executor::TaskExecutor::CallbackArgs& cbData,
+ TopologyCoordinator::StartElectionReason reason) {
// storeLocalLastVoteDocument can call back in to the replication coordinator,
// so _mutex must be unlocked here. However, we cannot return until we
// lock it because we want to lose the election on cancel or error and
@@ -274,13 +276,14 @@ void ReplicationCoordinatorImpl::_writeLastVoteForMyElection(
<< lastVote.getTerm() << ", current term: " << _topCoord->getTerm();
return;
}
- _startVoteRequester_inlock(lastVote.getTerm());
+ _startVoteRequester_inlock(lastVote.getTerm(), reason);
_replExecutor->signalEvent(_electionDryRunFinishedEvent);
lossGuard.dismiss();
}
-void ReplicationCoordinatorImpl::_startVoteRequester_inlock(long long newTerm) {
+void ReplicationCoordinatorImpl::_startVoteRequester_inlock(
+ long long newTerm, TopologyCoordinator::StartElectionReason reason) {
const auto lastOpTime = _getMyLastAppliedOpTime_inlock();
_voteRequester.reset(new VoteRequester);
@@ -291,15 +294,17 @@ void ReplicationCoordinatorImpl::_startVoteRequester_inlock(long long newTerm) {
}
fassert(28643, nextPhaseEvh.getStatus());
_replExecutor
- ->onEvent(
- nextPhaseEvh.getValue(),
- [=](const executor::TaskExecutor::CallbackArgs&) { _onVoteRequestComplete(newTerm); })
+ ->onEvent(nextPhaseEvh.getValue(),
+ [=](const executor::TaskExecutor::CallbackArgs&) {
+ _onVoteRequestComplete(newTerm, reason);
+ })
.status_with_transitional_ignore();
}
MONGO_FAIL_POINT_DEFINE(electionHangsBeforeUpdateMemberState);
-void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) {
+void ReplicationCoordinatorImpl::_onVoteRequestComplete(
+ long long newTerm, TopologyCoordinator::StartElectionReason reason) {
stdx::unique_lock<stdx::mutex> lk(_mutex);
LoseElectionGuardV1 lossGuard(this);
@@ -323,6 +328,8 @@ void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) {
return;
case VoteRequester::Result::kSuccessfullyElected:
log() << "election succeeded, assuming primary role in term " << _topCoord->getTerm();
+ ReplicationMetrics::get(getServiceContext())
+ .incrementNumElectionsSuccessfulForReason(reason);
break;
case VoteRequester::Result::kPrimaryRespondedNo:
// This is impossible because we would only require the primary's
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 88fc5cd5208..4970940746c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -242,6 +242,20 @@ TEST_F(ReplCoordTest, ElectionSucceedsWhenNodeIsTheOnlyNode) {
getReplCoord()->fillIsMasterForReplSet(&imResponse, {});
ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString();
ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString();
+
+ // Check that only the 'numCatchUpsSkipped' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(ReplCoordTest, ElectionSucceedsWhenAllNodesVoteYea) {
@@ -276,15 +290,21 @@ TEST_F(ReplCoordTest, ElectionSucceedsWhenAllNodesVoteYea) {
stopCapturingLogMessages();
ASSERT_EQUALS(1, countLogLinesContaining("election succeeded"));
- // Check that the numElectionTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numElectionTimeoutsCalled and the numElectionTimeoutsSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(ReplCoordTest, ElectionSucceedsWhenMaxSevenNodesVoteYea) {
@@ -1481,15 +1501,21 @@ TEST_F(TakeoverTest, SuccessfulCatchupTakeover) {
TopologyCoordinator::StartElectionReason::kCatchupTakeover,
lastVoteExpected);
- // Check that the numCatchUpTakeoversCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numCatchUpTakeoversCalled and the numCatchUpTakeoversSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(TakeoverTest, CatchupTakeoverDryRunFailsPrimarySaysNo) {
@@ -1848,15 +1874,21 @@ TEST_F(TakeoverTest, SuccessfulPriorityTakeover) {
TopologyCoordinator::StartElectionReason::kPriorityTakeover,
lastVoteExpected);
- // Check that the numPriorityTakeoversCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numPriorityTakeoversCalled and the numPriorityTakeoversSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(1,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) {
@@ -2309,6 +2341,18 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
// Heartbeats set a future target OpTime and we reached that successfully.
@@ -2335,6 +2379,17 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, CatchupTimeout) {
@@ -2358,6 +2413,17 @@ TEST_F(PrimaryCatchUpTest, CatchupTimeout) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsTimedOut' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) {
@@ -2386,6 +2452,18 @@ TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) {
@@ -2414,6 +2492,18 @@ TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) {
@@ -2440,6 +2530,18 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Since the primary stepped down in catchup mode because it saw a higher term, check that only
+ // the 'numCatchUpsFailedWithNewTerm' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
@@ -2472,6 +2574,18 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Since the primary stepped down in catchup mode because it saw a higher term, check that only
+ // the 'numCatchUpsFailedWithNewTerm' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
@@ -2493,6 +2607,21 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
stopCapturingLogMessages();
ASSERT_EQUALS(1, countLogLinesContaining("Caught up to the latest"));
+ // Check that the number of elections requiring primary catchup was incremented.
+ auto opCtx = makeOperationContext();
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+
// Step down during drain mode.
TopologyCoordinator::UpdateTermResult updateTermResult;
auto evh = replCoord->updateTerm_forTest(2, &updateTermResult);
@@ -2511,7 +2640,6 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
net->scheduleResponse(noi, net->now(), makeHeartbeatResponse(time2));
});
ASSERT(replCoord->getApplierState() == ApplierState::Draining);
- auto opCtx = makeOperationContext();
{
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_FALSE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test"));
@@ -2521,8 +2649,20 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
ASSERT(replCoord->getApplierState() == ApplierState::Stopped);
ASSERT_TRUE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test"));
- // Check that the number of elections requiring primary catchup was incremented.
+ // Check that the number of elections requiring primary catchup was not incremented again.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented by this election; 'numCatchUpsSucceeded' still reflects the earlier election.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
@@ -2589,6 +2729,17 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
@@ -2623,7 +2774,9 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
// Simulate a user initiated abort.
- ASSERT_OK(getReplCoord()->abortCatchupIfNeeded());
+ ASSERT_OK(getReplCoord()->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::
+ kFailedWithReplSetAbortPrimaryCatchUpCmd));
ASSERT(getReplCoord()->getApplierState() == ApplierState::Draining);
stopCapturingLogMessages();
@@ -2637,6 +2790,18 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd' primary catchup
+ // conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(1,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, ZeroTimeout) {
@@ -2654,6 +2819,57 @@ TEST_F(PrimaryCatchUpTest, ZeroTimeout) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSkipped' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+}
+
+TEST_F(PrimaryCatchUpTest, CatchUpFailsDueToPrimaryStepDown) {
+ startCapturingLogMessages();
+
+ OpTime time1(Timestamp(100, 1), 0);
+ OpTime time2(Timestamp(100, 2), 0);
+ ReplSetConfig config = setUp3NodeReplSetAndRunForElection(time1);
+ // Step down in the middle of catchup.
+ auto abortTime = getNet()->now() + config.getCatchUpTimeoutPeriod() / 2;
+ replyHeartbeatsAndRunUntil(abortTime, [this, time2](const NetworkOpIter noi) {
+ // Other nodes are ahead of me.
+ getNet()->scheduleResponse(noi, getNet()->now(), makeHeartbeatResponse(time2));
+ });
+ ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
+
+ auto opCtx = makeOperationContext();
+ ASSERT_OK(getReplCoord()->stepDown(opCtx.get(), true, Milliseconds(0), Milliseconds(1000)));
+ ASSERT_TRUE(getReplCoord()->getMemberState().secondary());
+ ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
+
+ stopCapturingLogMessages();
+ ASSERT_EQUALS(1, countLogLinesContaining("Exited primary catch-up mode"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Caught up to the latest"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Catchup timed out"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsFailedWithError' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
} // namespace
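The same seven-counter assertion block recurs in each catch-up test above. A possible follow-up cleanup, not part of this patch, would be a shared helper in the PrimaryCatchUpTest fixture; a minimal sketch, assuming only the _forTesting accessors shown in these hunks:

// Hypothetical helper (illustrative only, not in this patch): verify that the catch-up
// conclusion counters have exactly the expected values after a catchup attempt.
void assertCatchUpConclusionCounts(OperationContext* opCtx,
                                   int succeeded,
                                   int alreadyCaughtUp,
                                   int skipped,
                                   int timedOut,
                                   int failedWithError,
                                   int failedWithNewTerm,
                                   int failedWithAbortCmd) {
    // Each counter tracks one PrimaryCatchUpConclusionReason; exactly one should move per catchup.
    auto& metrics = ReplicationMetrics::get(opCtx);
    ASSERT_EQ(succeeded, metrics.getNumCatchUpsSucceeded_forTesting());
    ASSERT_EQ(alreadyCaughtUp, metrics.getNumCatchUpsAlreadyCaughtUp_forTesting());
    ASSERT_EQ(skipped, metrics.getNumCatchUpsSkipped_forTesting());
    ASSERT_EQ(timedOut, metrics.getNumCatchUpsTimedOut_forTesting());
    ASSERT_EQ(failedWithError, metrics.getNumCatchUpsFailedWithError_forTesting());
    ASSERT_EQ(failedWithNewTerm, metrics.getNumCatchUpsFailedWithNewTerm_forTesting());
    ASSERT_EQ(failedWithAbortCmd,
              metrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}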
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index 55bbe9d249f..e4f6bca7fed 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -2216,15 +2216,21 @@ TEST_F(ReplCoordTest, SingleNodeReplSetUnfreeze) {
getNet()->exitNetwork();
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
- // Check that the numFreezeTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numFreezeTimeoutsCalled and the numFreezeTimeoutsSuccessful election metrics
+ // have been incremented, and that none of the metrics that track the number of elections called
+ // or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingleNodeSet) {
@@ -2241,15 +2247,21 @@ TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingle
auto opCtx = makeOperationContext();
runSingleNodeElection(opCtx.get());
- // Check that the numElectionTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numElectionTimeoutsCalled and the numElectionTimeoutsSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
ASSERT_OK(getReplCoord()->stepDown(opCtx.get(), true, Milliseconds(0), Milliseconds(1000)));
getNet()->enterNetwork(); // Must do this before inspecting the topocoord
@@ -2266,15 +2278,22 @@ TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingle
getNet()->exitNetwork();
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
- // Check that the numFreezeTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented. When a stepdown timeout expires in a single node replica set, an election is
- // called for the same reason as is used when a freeze timeout expires.
+ // Check that the numFreezeTimeoutsCalled and the numFreezeTimeoutsSuccessful election metrics
+ // have been incremented, and that none of the metrics that track the number of elections called
+ // or successful for other reasons has been incremented. When a stepdown timeout expires in a
+ // single node replica set, an election is called for the same reason as is used when a freeze
+ // timeout expires.
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index 6bc921b4628..5e06cf589f4 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -489,7 +489,7 @@ void ReplicationCoordinatorMock::alwaysAllowWrites(bool allowWrites) {
_alwaysAllowWrites = allowWrites;
}
-Status ReplicationCoordinatorMock::abortCatchupIfNeeded() {
+Status ReplicationCoordinatorMock::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
return Status::OK();
}
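With the new signature, every caller of abortCatchupIfNeeded() must state why catchup is being abandoned. As the InfiniteTimeoutAndAbort test above shows, the replSetAbortPrimaryCatchUp path passes kFailedWithReplSetAbortPrimaryCatchUpCmd; a minimal sketch of that call shape (illustrative only, the real handler change is in repl_set_commands.cpp):

// Illustrative helper, not part of this patch: abort catchup on behalf of the
// replSetAbortPrimaryCatchUp command and record the matching conclusion reason.
Status abortPrimaryCatchUpForCommand(ReplicationCoordinator* replCoord) {
    return replCoord->abortCatchupIfNeeded(
        ReplicationCoordinator::PrimaryCatchUpConclusionReason::
            kFailedWithReplSetAbortPrimaryCatchUpCmd);
}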
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index 7decacbdc96..7cc450ed939 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -285,7 +285,7 @@ public:
return _service;
}
- virtual Status abortCatchupIfNeeded() override;
+ virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
void signalDropPendingCollectionsRemovedFromStorage() final;
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index bcc04ea0f53..0f9a3491019 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -59,31 +59,59 @@ void ReplicationMetrics::incrementNumElectionsCalledForReason(
case TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun: {
ElectionReasonCounter& stepUpCmd = _electionMetrics.getStepUpCmd();
stepUpCmd.incrementCalled();
- _electionMetrics.setStepUpCmd(stepUpCmd);
break;
}
case TopologyCoordinator::StartElectionReason::kPriorityTakeover: {
ElectionReasonCounter& priorityTakeover = _electionMetrics.getPriorityTakeover();
priorityTakeover.incrementCalled();
- _electionMetrics.setPriorityTakeover(priorityTakeover);
break;
}
case TopologyCoordinator::StartElectionReason::kCatchupTakeover: {
ElectionReasonCounter& catchUpTakeover = _electionMetrics.getCatchUpTakeover();
catchUpTakeover.incrementCalled();
- _electionMetrics.setCatchUpTakeover(catchUpTakeover);
break;
}
case TopologyCoordinator::StartElectionReason::kElectionTimeout: {
ElectionReasonCounter& electionTimeout = _electionMetrics.getElectionTimeout();
electionTimeout.incrementCalled();
- _electionMetrics.setElectionTimeout(electionTimeout);
break;
}
case TopologyCoordinator::StartElectionReason::kSingleNodePromptElection: {
ElectionReasonCounter& freezeTimeout = _electionMetrics.getFreezeTimeout();
freezeTimeout.incrementCalled();
- _electionMetrics.setFreezeTimeout(freezeTimeout);
+ break;
+ }
+ }
+}
+
+void ReplicationMetrics::incrementNumElectionsSuccessfulForReason(
+ TopologyCoordinator::StartElectionReason reason) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ switch (reason) {
+ case TopologyCoordinator::StartElectionReason::kStepUpRequest:
+ case TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun: {
+ ElectionReasonCounter& stepUpCmd = _electionMetrics.getStepUpCmd();
+ stepUpCmd.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kPriorityTakeover: {
+ ElectionReasonCounter& priorityTakeover = _electionMetrics.getPriorityTakeover();
+ priorityTakeover.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kCatchupTakeover: {
+ ElectionReasonCounter& catchUpTakeover = _electionMetrics.getCatchUpTakeover();
+ catchUpTakeover.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kElectionTimeout: {
+ ElectionReasonCounter& electionTimeout = _electionMetrics.getElectionTimeout();
+ electionTimeout.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kSingleNodePromptElection: {
+ ElectionReasonCounter& freezeTimeout = _electionMetrics.getFreezeTimeout();
+ freezeTimeout.incrementSuccessful();
break;
}
}
@@ -100,6 +128,40 @@ void ReplicationMetrics::incrementNumCatchUps() {
_electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1);
}
+void ReplicationMetrics::incrementNumCatchUpsConcludedForReason(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason reason) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ switch (reason) {
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kSucceeded:
+ _electionMetrics.setNumCatchUpsSucceeded(_electionMetrics.getNumCatchUpsSucceeded() +
+ 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kAlreadyCaughtUp:
+ _electionMetrics.setNumCatchUpsAlreadyCaughtUp(
+ _electionMetrics.getNumCatchUpsAlreadyCaughtUp() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kSkipped:
+ _electionMetrics.setNumCatchUpsSkipped(_electionMetrics.getNumCatchUpsSkipped() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kTimedOut:
+ _electionMetrics.setNumCatchUpsTimedOut(_electionMetrics.getNumCatchUpsTimedOut() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError:
+ _electionMetrics.setNumCatchUpsFailedWithError(
+ _electionMetrics.getNumCatchUpsFailedWithError() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithNewTerm:
+ _electionMetrics.setNumCatchUpsFailedWithNewTerm(
+ _electionMetrics.getNumCatchUpsFailedWithNewTerm() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::
+ kFailedWithReplSetAbortPrimaryCatchUpCmd:
+ _electionMetrics.setNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd(
+ _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd() + 1);
+ break;
+ }
+}
+
int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getStepUpCmd().getCalled();
@@ -130,11 +192,71 @@ int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() {
return _electionMetrics.getNumStepDownsCausedByHigherTerm();
}
+int ReplicationMetrics::getNumStepUpCmdsSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getStepUpCmd().getSuccessful();
+}
+
+int ReplicationMetrics::getNumPriorityTakeoversSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getPriorityTakeover().getSuccessful();
+}
+
+int ReplicationMetrics::getNumCatchUpTakeoversSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getCatchUpTakeover().getSuccessful();
+}
+
+int ReplicationMetrics::getNumElectionTimeoutsSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getElectionTimeout().getSuccessful();
+}
+
+int ReplicationMetrics::getNumFreezeTimeoutsSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getFreezeTimeout().getSuccessful();
+}
+
int ReplicationMetrics::getNumCatchUps_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUps();
}
+int ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsSucceeded();
+}
+
+int ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsAlreadyCaughtUp();
+}
+
+int ReplicationMetrics::getNumCatchUpsSkipped_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsSkipped();
+}
+
+int ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsTimedOut();
+}
+
+int ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsFailedWithError();
+}
+
+int ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsFailedWithNewTerm();
+}
+
+int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd();
+}
+
BSONObj ReplicationMetrics::getElectionMetricsBSON() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.toBSON();
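The hunks above also drop the setStepUpCmd()/setPriorityTakeover()/setCatchUpTakeover()/setElectionTimeout()/setFreezeTimeout() calls that used to follow each incrementCalled(): the getters return a reference into _electionMetrics, so bumping the counter through that reference already updates the stored metrics. A simplified stand-in for the counter type, shown only to illustrate that pattern (the real ElectionReasonCounter is IDL-backed and lives in election_reason_counter.h, which this patch also touches):

// Simplified illustration only; not the actual class definition.
class ElectionReasonCounterSketch {
public:
    void incrementCalled() { ++_called; }
    void incrementSuccessful() { ++_successful; }
    long long getCalled() const { return _called; }
    long long getSuccessful() const { return _successful; }

private:
    long long _called = 0;      // elections started for this reason
    long long _successful = 0;  // elections won for this reason
};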
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index 51dcd66ec77..de2c9876f68 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -49,16 +49,31 @@ public:
~ReplicationMetrics();
void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason);
+ void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason);
void incrementNumStepDownsCausedByHigherTerm();
void incrementNumCatchUps();
+ void incrementNumCatchUpsConcludedForReason(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason reason);
int getNumStepUpCmdsCalled_forTesting();
int getNumPriorityTakeoversCalled_forTesting();
int getNumCatchUpTakeoversCalled_forTesting();
int getNumElectionTimeoutsCalled_forTesting();
int getNumFreezeTimeoutsCalled_forTesting();
+ int getNumStepUpCmdsSuccessful_forTesting();
+ int getNumPriorityTakeoversSuccessful_forTesting();
+ int getNumCatchUpTakeoversSuccessful_forTesting();
+ int getNumElectionTimeoutsSuccessful_forTesting();
+ int getNumFreezeTimeoutsSuccessful_forTesting();
int getNumStepDownsCausedByHigherTerm_forTesting();
int getNumCatchUps_forTesting();
+ int getNumCatchUpsSucceeded_forTesting();
+ int getNumCatchUpsAlreadyCaughtUp_forTesting();
+ int getNumCatchUpsSkipped_forTesting();
+ int getNumCatchUpsTimedOut_forTesting();
+ int getNumCatchUpsFailedWithError_forTesting();
+ int getNumCatchUpsFailedWithNewTerm_forTesting();
+ int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting();
BSONObj getElectionMetricsBSON();
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index e500814e44e..c9ddd9f2027 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -86,6 +86,39 @@ structs:
was behind"
type: long
default: 0
+ numCatchUpsSucceeded:
+ description: "Number of times primary catchup concluded because the primary caught
+ up to the latest known optime successfully"
+ type: long
+ default: 0
+ numCatchUpsAlreadyCaughtUp:
+ description: "Number of times primary catchup concluded because the primary was
+ already caught up"
+ type: long
+ default: 0
+ numCatchUpsSkipped:
+ description: "Number of times primary catchup concluded because it was skipped"
+ type: long
+ default: 0
+ numCatchUpsTimedOut:
+ description: "Number of times primary catchup concluded because it timed out"
+ type: long
+ default: 0
+ numCatchUpsFailedWithError:
+ description: "Number of times primary catchup concluded because it failed with an
+ error"
+ type: long
+ default: 0
+ numCatchUpsFailedWithNewTerm:
+ description: "Number of times primary catchup concluded because the primary stepped
+ down on seeing a higher term"
+ type: long
+ default: 0
+ numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd:
+ description: "Number of times primary catchup concluded because of the
+ replSetAbortPrimaryCatchUp command"
+ type: long
+ default: 0
ElectionCandidateMetrics:
description: "Stores metrics that are specific to the last election in which the node was a
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index 185c0b276ef..2b36011e3b1 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -321,7 +321,7 @@ Status ReplicationCoordinatorEmbedded::processReplSetInitiate(OperationContext*,
UASSERT_NOT_IMPLEMENTED;
}
-Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded() {
+Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
UASSERT_NOT_IMPLEMENTED;
}
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index afedb2b2e2e..b591d352f43 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -235,7 +235,7 @@ public:
Status stepUpIfEligible(bool skipDryRun) override;
- Status abortCatchupIfNeeded() override;
+ Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
void signalDropPendingCollectionsRemovedFromStorage() final;