author    Samyukta Lanka <samy.lanka@mongodb.com>  2019-09-26 17:19:01 +0000
committer evergreen <evergreen@mongodb.com>  2019-09-26 17:19:01 +0000
commit    1467632ac13b8f55f3ec53299619db44e8b2037a (patch)
tree      33b018f35124a52c2e0b9e3e5fd50ae51310404d
parent    39bc70a2df250696b983bb26c7aba4e0c8f2d3ac (diff)
download  mongo-1467632ac13b8f55f3ec53299619db44e8b2037a.tar.gz
SERVER-41500 Track number of successful elections for each reason in serverStatus
(cherry picked from commit bf4f91a6087227295007535bf143a8dd20e6a6d5)

SERVER-41510 Track the number of failed stepDowns in serverStatus
(cherry picked from commit 4e8fbd52ca8cda811dc53bb160f3ca5074355dab)

SERVER-41502 Track the number of times primary catchup concludes for each reason in serverStatus
(cherry picked from commit 796214e8426b3036f4eb4bdcc4af9d218589778f)
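Note: all of the counters touched by this change are reported under the 'electionMetrics' section of serverStatus. A minimal shell check of the new fields might look like the following; the field names are the ones exercised by the tests in this diff, and the output shape shown in the comments is an assumption.

// Hypothetical shell snippet: inspect the election metrics added by this change.
const status = db.adminCommand({serverStatus: 1});
const metrics = status.electionMetrics;

// Election reason counters now carry both 'called' and 'successful' subfields,
// e.g. { called: NumberLong(1), successful: NumberLong(1) } (shape is an assumption).
printjson(metrics.stepUpCmd);
printjson(metrics.freezeTimeout);

// New counters for how primary catchup concluded.
print(metrics.numCatchUpsSucceeded);
print(metrics.numCatchUpsAlreadyCaughtUp);
print(metrics.numCatchUpsSkipped);
print(metrics.numCatchUpsTimedOut);
print(metrics.numCatchUpsFailedWithError);
print(metrics.numCatchUpsFailedWithNewTerm);
print(metrics.numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd);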
-rw-r--r--  jstests/replsets/catchup.js  59
-rw-r--r--  jstests/replsets/catchup_takeover_two_nodes_ahead.js  19
-rw-r--r--  jstests/replsets/freeze_timeout.js  56
-rw-r--r--  jstests/replsets/libs/election_metrics.js  27
-rw-r--r--  jstests/replsets/priority_takeover_one_node_higher_priority.js  12
-rw-r--r--  jstests/replsets/replset1.js  7
-rw-r--r--  jstests/replsets/server_election_metrics.js  16
-rw-r--r--  jstests/replsets/stepdown.js  94
-rw-r--r--  jstests/replsets/stepup.js  37
-rw-r--r--  src/mongo/db/repl/bgsync.cpp  12
-rw-r--r--  src/mongo/db/repl/election_reason_counter.h  6
-rw-r--r--  src/mongo/db/repl/repl_set_commands.cpp  18
-rw-r--r--  src/mongo/db/repl/replication_coordinator.h  12
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp  33
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h  13
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp  25
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp  240
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_test.cpp  39
-rw-r--r--  src/mongo/db/repl/replication_coordinator_mock.cpp  2
-rw-r--r--  src/mongo/db/repl/replication_coordinator_mock.h  2
-rw-r--r--  src/mongo/db/repl/replication_metrics.cpp  132
-rw-r--r--  src/mongo/db/repl/replication_metrics.h  15
-rw-r--r--  src/mongo/db/repl/replication_metrics.idl  33
-rw-r--r--  src/mongo/embedded/replication_coordinator_embedded.cpp  2
-rw-r--r--  src/mongo/embedded/replication_coordinator_embedded.h  2
25 files changed, 826 insertions(+), 87 deletions(-)
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 5b5dabf57ae..d8a8b3a5bf6 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -99,14 +99,29 @@
rst.awaitReplication();
jsTest.log("Case 1: The primary is up-to-date after refreshing heartbeats.");
+ let initialNewPrimaryStatus =
+ assert.commandWorked(rst.getSecondary().adminCommand({serverStatus: 1}));
+
// Should complete transition to primary immediately.
var newPrimary = stepUpNode(rst.getSecondary());
// Should win an election and finish the transition very quickly.
assert.eq(newPrimary, rst.getPrimary());
rst.awaitReplication();
+ // Check that the 'numCatchUps' field has not been incremented in serverStatus.
+ let newNewPrimaryStatus = assert.commandWorked(newPrimary.adminCommand({serverStatus: 1}));
+ verifyServerStatusChange(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUps',
+ 0);
+ // Check that the 'numCatchUpsAlreadyCaughtUp' field has been incremented in serverStatus, and
+ // that none of the other reasons for catchup concluding has been incremented.
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsAlreadyCaughtUp');
+
jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
- let initialNewPrimaryStatus =
+ initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
@@ -117,12 +132,17 @@
assert.eq(stepUpResults.newPrimary, rst.getPrimary());
// Check that the 'numCatchUps' field has been incremented in serverStatus.
- let newNewPrimaryStatus =
+ newNewPrimaryStatus =
assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
verifyServerStatusChange(initialNewPrimaryStatus.electionMetrics,
newNewPrimaryStatus.electionMetrics,
'numCatchUps',
1);
+ // Check that the 'numCatchUpsSucceeded' field has been incremented in serverStatus, and that
+ // none of the other reasons for catchup concluding has been incremented.
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsSucceeded');
// Wait for all secondaries to catch up
rst.awaitReplication();
@@ -160,6 +180,9 @@
rst.awaitReplication();
jsTest.log("Case 4: The primary needs to catch up, fails due to timeout.");
+ initialNewPrimaryStatus =
+ assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
+
// Reconfig to make the catchup timeout shorter.
reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, 10 * 1000);
@@ -169,6 +192,14 @@
restartServerReplication(stepUpResults.newPrimary);
assert.eq(stepUpResults.newPrimary, rst.getPrimary());
+ // Check that the 'numCatchUpsTimedOut' field has been incremented in serverStatus, and that
+ // none of the other reasons for catchup concluding has been incremented.
+ newNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsTimedOut');
+
// Wait for the no-op "new primary" after winning an election, so that we know it has
// finished transition to primary.
assert.soon(function() {
@@ -185,9 +216,21 @@
reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, -1);
stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+ initialNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+
// Abort catchup.
assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetAbortPrimaryCatchUp: 1}));
+ // Check that the 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd' field has been
+ // incremented in serverStatus, and that none of the other reasons for catchup concluding has
+ // been incremented.
+ newNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd');
+
// Wait for the no-op "new primary" after winning an election, so that we know it has
// finished transition to primary.
assert.soon(function() {
@@ -201,6 +244,9 @@
checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 0);
jsTest.log("Case 6: The primary needs to catch up with no timeout, but steps down.");
+ initialNewPrimaryStatus =
+ assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
+
var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
// Step-down command should abort catchup.
@@ -209,6 +255,15 @@
} catch (e) {
print(e);
}
+
+ // Check that the 'numCatchUpsFailedWithError' field has been incremented in serverStatus, and
+ // that none of the other reasons for catchup concluding has been incremented.
+ newNewPrimaryStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
+ newNewPrimaryStatus.electionMetrics,
+ 'numCatchUpsFailedWithError');
+
// Rename the primary.
var steppedDownPrimary = stepUpResults.newPrimary;
var newPrimary = rst.getPrimary();
diff --git a/jstests/replsets/catchup_takeover_two_nodes_ahead.js b/jstests/replsets/catchup_takeover_two_nodes_ahead.js
index 69dddd079a1..1efe741da51 100644
--- a/jstests/replsets/catchup_takeover_two_nodes_ahead.js
+++ b/jstests/replsets/catchup_takeover_two_nodes_ahead.js
@@ -14,6 +14,7 @@
'use strict';
load('jstests/replsets/rslib.js');
+ load('jstests/replsets/libs/election_metrics.js');
var name = 'catchup_takeover_two_nodes_ahead';
var replSet = new ReplSetTest({name: name, nodes: 5});
@@ -35,6 +36,9 @@
writeConcern = {writeConcern: {w: 1, wtimeout: replSet.kDefaultTimeoutMS}};
assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern));
+ const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ const initialNode2Status = assert.commandWorked(nodes[2].adminCommand({serverStatus: 1}));
+
// Step up one of the lagged nodes.
assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1}));
replSet.awaitNodesAgreeOnPrimary();
@@ -52,6 +56,21 @@
// after the default catchup delay.
replSet.waitForState(0, ReplSetTest.State.PRIMARY, 60 * 1000);
+ // Check that both the 'called' and 'successful' fields of the 'catchUpTakeover' election reason
+ // counter have been incremented in serverStatus.
+ const newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(initialPrimaryStatus.electionMetrics,
+ newPrimaryStatus.electionMetrics,
+ "catchUpTakeover",
+ 1);
+
+ // Check that the 'numCatchUpsFailedWithNewTerm' field has been incremented in serverStatus, and
+ // that none of the other reasons for catchup concluding has been incremented.
+ const newNode2Status = assert.commandWorked(nodes[2].adminCommand({serverStatus: 1}));
+ verifyCatchUpConclusionReason(initialNode2Status.electionMetrics,
+ newNode2Status.electionMetrics,
+ 'numCatchUpsFailedWithNewTerm');
+
// Wait until the old primary steps down so the connections won't be closed.
replSet.waitForState(2, ReplSetTest.State.SECONDARY, replSet.kDefaultTimeoutMS);
// Let the nodes catchup.
diff --git a/jstests/replsets/freeze_timeout.js b/jstests/replsets/freeze_timeout.js
new file mode 100644
index 00000000000..e61a3ea65f3
--- /dev/null
+++ b/jstests/replsets/freeze_timeout.js
@@ -0,0 +1,56 @@
+/**
+ * Tests that the freezeTimeout election reason counter in serverStatus is incremented in a single
+ * node replica set both after a freeze timeout and after a stepdown timeout expires.
+ */
+(function() {
+ "use strict";
+ load('jstests/replsets/libs/election_metrics.js');
+
+ jsTestLog('1: initialize single node replica set');
+ const replSet = new ReplSetTest({name: 'freeze_timeout', nodes: 1});
+ const nodes = replSet.startSet();
+ const config = replSet.getReplSetConfig();
+ replSet.initiate(config);
+ replSet.awaitReplication();
+ let primary = replSet.getPrimary();
+ const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+
+ jsTestLog('2: step down primary');
+ assert.throws(function() {
+ primary.getDB("admin").runCommand({replSetStepDown: 10, force: 1});
+ });
+
+ jsTestLog('3: wait for stepped down node to become primary again');
+ primary = replSet.getPrimary();
+
+ // Check that both the 'called' and 'successful' fields of the 'freezeTimeout' election reason
+ // counter have been incremented in serverStatus. When a stepdown timeout expires in a single
+ // node replica set, an election is called for the same reason as is used when a freeze timeout
+ // expires.
+ let newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(
+ initialPrimaryStatus.electionMetrics, newPrimaryStatus.electionMetrics, "freezeTimeout", 1);
+
+ jsTestLog('4: step down primary again');
+ assert.throws(function() {
+ primary.getDB("admin").runCommand({replSetStepDown: 10, force: 1});
+ });
+
+ jsTestLog('5: freeze stepped down primary for 30 seconds');
+ primary.getDB("admin").runCommand({replSetFreeze: 30});
+ sleep(1000);
+
+ jsTestLog('6: unfreeze stepped down primary after waiting for 1 second');
+ primary.getDB("admin").runCommand({replSetFreeze: 0});
+
+ jsTestLog('7: wait for unfrozen node to become primary again');
+ primary = replSet.getPrimary();
+
+ // Check that both the 'called' and 'successful' fields of the 'freezeTimeout' election reason
+ // counter have been incremented again in serverStatus.
+ newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(
+ initialPrimaryStatus.electionMetrics, newPrimaryStatus.electionMetrics, "freezeTimeout", 2);
+
+ replSet.stopSet();
+})();
diff --git a/jstests/replsets/libs/election_metrics.js b/jstests/replsets/libs/election_metrics.js
index 91d6b4815e3..36307b2117e 100644
--- a/jstests/replsets/libs/election_metrics.js
+++ b/jstests/replsets/libs/election_metrics.js
@@ -41,3 +41,30 @@ function verifyServerStatusChange(initialStatus, newStatus, fieldName, expectedI
newStatus[fieldName],
`expected '${fieldName}' to increase by ${expectedIncrement}`);
}
+
+/**
+ * Verifies that the given reason for primary catchup concluding is incremented in serverStatus, and
+ * that none of the other reasons are.
+ */
+function verifyCatchUpConclusionReason(initialStatus, newStatus, fieldName) {
+ const catchUpConclusionMetrics = [
+ "numCatchUpsSucceeded",
+ "numCatchUpsAlreadyCaughtUp",
+ "numCatchUpsSkipped",
+ "numCatchUpsTimedOut",
+ "numCatchUpsFailedWithError",
+ "numCatchUpsFailedWithNewTerm",
+ "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd"
+ ];
+
+ catchUpConclusionMetrics.forEach(function(metric) {
+ if (metric === fieldName) {
+ assert.eq(initialStatus[metric] + 1,
+ newStatus[metric],
+ `expected '${metric}' to increase by 1`);
+ } else {
+ assert.eq(
+ initialStatus[metric], newStatus[metric], `did not expect '${metric}' to increase`);
+ }
+ });
+}
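
Note: the tests in this change also call verifyServerStatusElectionReasonCounterChange and verifyServerStatusElectionReasonCounterValue, which live in this library file but are outside the hunk shown above. A sketch of what the 'change' variant presumably asserts, given the 'called' and 'successful' subfields verified in server_election_metrics.js, is:

// Hypothetical sketch of the helper used throughout the tests in this change; the real
// implementation is earlier in election_metrics.js and is not shown in this hunk.
function verifyServerStatusElectionReasonCounterChange(
    initialStatus, newStatus, fieldName, expectedIncrement) {
    assert.eq(initialStatus[fieldName]["called"] + expectedIncrement,
              newStatus[fieldName]["called"],
              `expected the 'called' field of '${fieldName}' to increase by ${expectedIncrement}`);
    assert.eq(initialStatus[fieldName]["successful"] + expectedIncrement,
              newStatus[fieldName]["successful"],
              `expected the 'successful' field of '${fieldName}' to increase by ${expectedIncrement}`);
}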
diff --git a/jstests/replsets/priority_takeover_one_node_higher_priority.js b/jstests/replsets/priority_takeover_one_node_higher_priority.js
index b4385cc2376..b39bd47545f 100644
--- a/jstests/replsets/priority_takeover_one_node_higher_priority.js
+++ b/jstests/replsets/priority_takeover_one_node_higher_priority.js
@@ -6,6 +6,7 @@
(function() {
'use strict';
load('jstests/replsets/rslib.js');
+ load('jstests/replsets/libs/election_metrics.js');
var name = 'priority_takeover_one_node_higher_priority';
var replSet = new ReplSetTest({
@@ -22,6 +23,8 @@
replSet.waitForState(replSet.nodes[0], ReplSetTest.State.PRIMARY);
var primary = replSet.getPrimary();
+ const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+
replSet.awaitSecondaryNodes();
replSet.awaitReplication();
@@ -42,5 +45,14 @@
// Eventually node 0 will stand for election again because it has a higher priority.
replSet.waitForState(replSet.nodes[0], ReplSetTest.State.PRIMARY);
+
+ // Check that both the 'called' and 'successful' fields of the 'priorityTakeover' election
+ // reason counter have been incremented in serverStatus.
+ const newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterChange(initialPrimaryStatus.electionMetrics,
+ newPrimaryStatus.electionMetrics,
+ "priorityTakeover",
+ 1);
+
replSet.stopSet();
})();
diff --git a/jstests/replsets/replset1.js b/jstests/replsets/replset1.js
index 05aee648f21..8b1e72d6e46 100644
--- a/jstests/replsets/replset1.js
+++ b/jstests/replsets/replset1.js
@@ -2,6 +2,7 @@ var ssl_options1;
var ssl_options2;
var ssl_name;
load("jstests/replsets/rslib.js");
+load('jstests/replsets/libs/election_metrics.js');
var doTest = function(signal) {
// Test basic replica set functionality.
@@ -29,6 +30,12 @@ var doTest = function(signal) {
// elected master.
var master = replTest.getPrimary();
+ // Check that both the 'called' and 'successful' fields of the 'electionTimeout' election reason
+ // counter have been incremented in serverStatus.
+ const primaryStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ verifyServerStatusElectionReasonCounterValue(
+ primaryStatus.electionMetrics, "electionTimeout", 1);
+
// Ensure the primary logs an n-op to the oplog upon transitioning to primary.
assert.gt(master.getDB("local").oplog.rs.count({op: 'n', o: {msg: 'new primary'}}), 0);
diff --git a/jstests/replsets/server_election_metrics.js b/jstests/replsets/server_election_metrics.js
index ceae6a26ee2..23d018287ef 100644
--- a/jstests/replsets/server_election_metrics.js
+++ b/jstests/replsets/server_election_metrics.js
@@ -1,10 +1,10 @@
/**
- * Tests the format of electionMetrics serverStatus section.
+ * Tests the format of 'electionMetrics' serverStatus section.
*/
(function() {
"use strict";
- // Verifies that the electionMetrics serverStatus section has the given field.
+ // Verifies that the 'electionMetrics' serverStatus section has the given field.
function verifyElectionMetricsField(serverStatusResponse, fieldName) {
assert(serverStatusResponse.electionMetrics.hasOwnProperty(fieldName),
() => (`The 'electionMetrics' serverStatus section did not have the '${fieldName}' \
@@ -12,7 +12,7 @@ field: \n${tojson(serverStatusResponse.electionMetrics)}`));
return serverStatusResponse.electionMetrics[fieldName];
}
- // Verifies that the electionMetrics serverStatus section has a field for the given election
+ // Verifies that the 'electionMetrics' serverStatus section has a field for the given election
// reason counter and that it has the subfields 'called' and 'successful'.
function verifyElectionReasonCounterFields(serverStatusResponse, fieldName) {
const field = verifyElectionMetricsField(serverStatusResponse, fieldName);
@@ -24,7 +24,7 @@ not have the 'called' field: \n${tojson(field)}`));
not have the 'successful' field: \n${tojson(field)}`));
}
- // Verifies the format of the electionMetrics serverStatus section.
+ // Verifies the format of the 'electionMetrics' serverStatus section.
function verifyElectionMetricsSSS(serverStatusResponse) {
assert(serverStatusResponse.hasOwnProperty("electionMetrics"),
() => (`Expected the serverStatus response to have an 'electionMetrics' field:
@@ -36,6 +36,14 @@ ${tojson(serverStatusResponse)}`));
verifyElectionReasonCounterFields(serverStatusResponse, "freezeTimeout");
verifyElectionMetricsField(serverStatusResponse, "numStepDownsCausedByHigherTerm");
verifyElectionMetricsField(serverStatusResponse, "numCatchUps");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsSucceeded");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsAlreadyCaughtUp");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsSkipped");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsTimedOut");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsFailedWithError");
+ verifyElectionMetricsField(serverStatusResponse, "numCatchUpsFailedWithNewTerm");
+ verifyElectionMetricsField(serverStatusResponse,
+ "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd");
}
// Set up the replica set.
diff --git a/jstests/replsets/stepdown.js b/jstests/replsets/stepdown.js
index acfbbb0f6ee..78703f0ce98 100644
--- a/jstests/replsets/stepdown.js
+++ b/jstests/replsets/stepdown.js
@@ -2,6 +2,9 @@
* Check that on a loss of primary, another node doesn't assume primary if it is stale. We force a
* stepDown to test this.
*
+ * This test also checks that the serverStatus command metrics replSetStepDown and
+ * replSetStepDownWithForce are incremented correctly.
+ *
* This test requires the fsync command to force a secondary to be stale.
* @tags: [requires_fsync]
*/
@@ -61,7 +64,7 @@ try {
assert.writeOK(master.getDB("foo").bar.insert({x: i}));
}
- const intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ let intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
jsTestLog('Do stepdown of primary ' + master + ' that should not work');
@@ -70,17 +73,57 @@ try {
'Step down ' + master + ' expected error: ' +
tojson(assert.commandFailed(master.getDB("admin").runCommand({replSetStepDown: 10}))));
- // Check that the 'total' field of 'replSetStepDown' has been incremented in serverStatus and
- // that it has not been incremented for 'replSetStepDownWithForce'.
+ // Check that the 'total' and 'failed' fields of 'replSetStepDown' have been incremented in
+ // serverStatus and that they have not been incremented for 'replSetStepDownWithForce'.
let newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
newServerStatus.metrics.commands.replSetStepDown,
"total",
1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 1);
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
newServerStatus.metrics.commands.replSetStepDownWithForce,
"total",
0);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 0);
+
+ // This section checks that the metrics are incremented accurately when the command fails due to
+ // an error occurring before stepDown is called in the replication coordinator, such as due to
+ // bad values or type mismatches in the arguments, or checkReplEnabledForCommand returning a bad
+ // status. The stepdown period being negative is one example of such an error, but success in
+ // this case gives us confidence that the behavior in the other cases is the same.
+
+ // Stepdown should fail because the stepdown period is negative
+ jsTestLog('Do stepdown of primary ' + master + ' that should not work');
+ assert.commandFailedWithCode(
+ master.getDB("admin").runCommand({replSetStepDown: -1, force: true}), ErrorCodes.BadValue);
+
+ // Check that the 'total' and 'failed' fields of 'replSetStepDown' and
+ // 'replSetStepDownWithForce' have been incremented in serverStatus.
+ intitialServerStatus = newServerStatus;
+ newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 1);
// The server will disconnect the client on a successful forced stepdown so we use the
// presence of an exception to confirm the forced stepdown result.
@@ -93,22 +136,63 @@ try {
tojson(exceptionFromForcedStepDown));
// Check that the 'total' fields of 'replSetStepDown' and 'replSetStepDownWithForce' have been
- // incremented in serverStatus.
+ // incremented in serverStatus and that their 'failed' fields have not been incremented.
+ intitialServerStatus = newServerStatus;
newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
newServerStatus.metrics.commands.replSetStepDown,
"total",
- 2);
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 0);
verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
newServerStatus.metrics.commands.replSetStepDownWithForce,
"total",
1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 0);
jsTestLog('Checking isMaster on ' + master);
var r2 = assert.commandWorked(master.getDB("admin").runCommand({ismaster: 1}));
jsTestLog('Result from running isMaster on ' + master + ': ' + tojson(r2));
assert.eq(r2.ismaster, false);
assert.eq(r2.secondary, true);
+
+ // This section checks that the metrics are incremented accurately when the command fails due to
+ // an error while stepping down. This is one reason the replSetStepDown command could fail once
+ // we call stepDown in the replication coordinator, but success in this case gives us confidence
+ // that the behavior in the other cases is the same.
+
+ // Stepdown should fail because the node is no longer primary
+ jsTestLog('Do stepdown of primary ' + master + ' that should not work');
+ assert.commandFailedWithCode(master.getDB("admin").runCommand(
+ {replSetStepDown: ReplSetTest.kDefaultTimeoutMS, force: true}),
+ ErrorCodes.NotMaster);
+
+ // Check that the 'total' and 'failed' fields of 'replSetStepDown' and
+ // 'replSetStepDownWithForce' have been incremented in serverStatus.
+ intitialServerStatus = newServerStatus;
+ newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown,
+ newServerStatus.metrics.commands.replSetStepDown,
+ "failed",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "total",
+ 1);
+ verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce,
+ newServerStatus.metrics.commands.replSetStepDownWithForce,
+ "failed",
+ 1);
} catch (e) {
throw e;
} finally {
diff --git a/jstests/replsets/stepup.js b/jstests/replsets/stepup.js
index 93586bfcae7..4c5d20d42c4 100644
--- a/jstests/replsets/stepup.js
+++ b/jstests/replsets/stepup.js
@@ -1,6 +1,7 @@
// Tests the replSetStepUp command.
load("jstests/replsets/rslib.js");
+load('jstests/replsets/libs/election_metrics.js');
(function() {
"use strict";
@@ -44,23 +45,29 @@ load("jstests/replsets/rslib.js");
// Make sure the step up succeeded.
assert.eq(secondary, rst.getPrimary());
- // Verifies that the given election reason counter is incremented in the way we expect.
- function verifyServerStatusChange(initialStatus, newStatus, fieldName, expectedIncrement) {
- assert.eq(
- initialStatus.electionMetrics[fieldName]["called"] + expectedIncrement,
- newStatus.electionMetrics[fieldName]["called"],
- "expected the 'called' field of " + fieldName + " to increase by " + expectedIncrement);
- }
-
const newSecondaryStatus = assert.commandWorked(secondary.adminCommand({serverStatus: 1}));
- // Check that the 'called' field of stepUpCmd has been incremented in serverStatus, and that it
- // has not been incremented in any of the other election reason counters.
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "stepUpCmd", 1);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "priorityTakeover", 0);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "catchUpTakeover", 0);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "electionTimeout", 0);
- verifyServerStatusChange(initialSecondaryStatus, newSecondaryStatus, "freezeTimeout", 0);
+ // Check that both the 'called' and 'successful' fields of stepUpCmd have been incremented in
+ // serverStatus, and that they have not been incremented in any of the other election reason
+ // counters.
+ verifyServerStatusElectionReasonCounterChange(
+ initialSecondaryStatus.electionMetrics, newSecondaryStatus.electionMetrics, "stepUpCmd", 1);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "priorityTakeover",
+ 0);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "catchUpTakeover",
+ 0);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "electionTimeout",
+ 0);
+ verifyServerStatusElectionReasonCounterChange(initialSecondaryStatus.electionMetrics,
+ newSecondaryStatus.electionMetrics,
+ "freezeTimeout",
+ 0);
rst.stopSet();
})();
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 08502a9b66b..1acac0bbca6 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -347,7 +347,11 @@ void BackgroundSync::_produce() {
log() << "Our newest OpTime : " << lastOpTimeFetched;
log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
<< " from " << syncSourceResp.getSyncSource();
- _replCoord->abortCatchupIfNeeded().transitional_ignore();
+ auto status = _replCoord->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError);
+ if (!status.isOK()) {
+ LOG(1) << "Aborting catch-up failed with status: " << status;
+ }
return;
}
@@ -596,7 +600,11 @@ void BackgroundSync::_runRollback(OperationContext* opCtx,
StorageInterface* storageInterface) {
if (_replCoord->getMemberState().primary()) {
warning() << "Rollback situation detected in catch-up mode. Aborting catch-up mode.";
- _replCoord->abortCatchupIfNeeded().transitional_ignore();
+ auto status = _replCoord->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError);
+ if (!status.isOK()) {
+ LOG(1) << "Aborting catch-up failed with status: " << status;
+ }
return;
}
diff --git a/src/mongo/db/repl/election_reason_counter.h b/src/mongo/db/repl/election_reason_counter.h
index 8dd068181b5..801d9855a45 100644
--- a/src/mongo/db/repl/election_reason_counter.h
+++ b/src/mongo/db/repl/election_reason_counter.h
@@ -41,11 +41,17 @@ class ElectionReasonCounter : public ElectionReasonCounterBase {
public:
using ElectionReasonCounterBase::getCalled;
using ElectionReasonCounterBase::setCalled;
+ using ElectionReasonCounterBase::getSuccessful;
+ using ElectionReasonCounterBase::setSuccessful;
void incrementCalled() {
setCalled(getCalled() + 1);
}
+ void incrementSuccessful() {
+ setSuccessful(getSuccessful() + 1);
+ }
+
ElectionReasonCounter parse(const IDLParserErrorContext& ctxt, const BSONObj& bsonObject);
};
diff --git a/src/mongo/db/repl/repl_set_commands.cpp b/src/mongo/db/repl/repl_set_commands.cpp
index d5624256df8..48f9c96f4e7 100644
--- a/src/mongo/db/repl/repl_set_commands.cpp
+++ b/src/mongo/db/repl/repl_set_commands.cpp
@@ -455,7 +455,9 @@ public:
CmdReplSetStepDown()
: ReplSetCommand("replSetStepDown"),
_stepDownCmdsWithForceExecutedMetric("commands.replSetStepDownWithForce.total",
- &_stepDownCmdsWithForceExecuted) {}
+ &_stepDownCmdsWithForceExecuted),
+ _stepDownCmdsWithForceFailedMetric("commands.replSetStepDownWithForce.failed",
+ &_stepDownCmdsWithForceFailed) {}
virtual bool run(OperationContext* opCtx,
const string&,
const BSONObj& cmdObj,
@@ -466,6 +468,12 @@ public:
_stepDownCmdsWithForceExecuted.increment();
}
+ auto onExitGuard = MakeGuard([&] {
+ if (force) {
+ _stepDownCmdsWithForceFailed.increment();
+ }
+ });
+
Status status = ReplicationCoordinator::get(opCtx)->checkReplEnabledForCommand(&result);
uassertStatusOK(status);
@@ -508,12 +516,16 @@ public:
status = ReplicationCoordinator::get(opCtx)->stepDown(
opCtx, force, Seconds(secondaryCatchUpPeriodSecs), Seconds(stepDownForSecs));
uassertStatusOK(status);
+
+ onExitGuard.Dismiss();
return true;
}
private:
mutable Counter64 _stepDownCmdsWithForceExecuted;
+ mutable Counter64 _stepDownCmdsWithForceFailed;
ServerStatusMetricField<Counter64> _stepDownCmdsWithForceExecutedMetric;
+ ServerStatusMetricField<Counter64> _stepDownCmdsWithForceFailedMetric;
ActionSet getAuthActionSet() const override {
return ActionSet{ActionType::replSetStateChange};
@@ -839,7 +851,9 @@ public:
uassertStatusOK(status);
log() << "Received replSetAbortPrimaryCatchUp request";
- status = ReplicationCoordinator::get(opCtx)->abortCatchupIfNeeded();
+ status = ReplicationCoordinator::get(opCtx)->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::
+ kFailedWithReplSetAbortPrimaryCatchUpCmd);
if (!status.isOK()) {
log() << "replSetAbortPrimaryCatchUp request failed" << causedBy(status);
}
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 78fed57782c..9dfdeb9fe90 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -826,10 +826,20 @@ public:
virtual ServiceContext* getServiceContext() = 0;
+ enum PrimaryCatchUpConclusionReason {
+ kSucceeded,
+ kAlreadyCaughtUp,
+ kSkipped,
+ kTimedOut,
+ kFailedWithError,
+ kFailedWithNewTerm,
+ kFailedWithReplSetAbortPrimaryCatchUpCmd
+ };
+
/**
* Abort catchup if the node is in catchup mode.
*/
- virtual Status abortCatchupIfNeeded() = 0;
+ virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) = 0;
/**
* Signals that drop pending collections have been removed from storage.
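
Note: the new PrimaryCatchUpConclusionReason values appear to map one-to-one onto the numCatchUps* counters exercised by the jstests above; this mapping is inferred from the tests and metrics code in this diff rather than stated in the header, and is sketched here only for orientation.

// Illustrative mapping (assumption inferred from the tests in this change): each
// PrimaryCatchUpConclusionReason value increments the matching electionMetrics counter.
const catchUpConclusionReasonToMetric = {
    kSucceeded: "numCatchUpsSucceeded",
    kAlreadyCaughtUp: "numCatchUpsAlreadyCaughtUp",
    kSkipped: "numCatchUpsSkipped",
    kTimedOut: "numCatchUpsTimedOut",
    kFailedWithError: "numCatchUpsFailedWithError",
    kFailedWithNewTerm: "numCatchUpsFailedWithNewTerm",
    kFailedWithReplSetAbortPrimaryCatchUpCmd: "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd",
};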
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 5b42bc5be0a..e1bb889c80a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2640,7 +2640,6 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock(
_canAcceptNonLocalWrites.store(canAcceptWrites);
}
-
const MemberState newState = _topCoord->getMemberState();
if (newState == _memberState) {
if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate) {
@@ -2674,7 +2673,16 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock(
// Exit catchup mode if we're in it and enable replication producer and applier on stepdown.
if (_memberState.primary()) {
if (_catchupState) {
- _catchupState->abort_inlock();
+ // _pendingTermUpdateDuringStepDown is set before stepping down due to hearing about a
+ // higher term, so that we can remember the term we heard and update our term as part of
+ // finishing stepdown. It is then unset toward the end of stepdown, after the function
+ // we are in is called. Thus we must be stepping down due to seeing a higher term if and
+ // only if _pendingTermUpdateDuringStepDown is set here.
+ if (_pendingTermUpdateDuringStepDown) {
+ _catchupState->abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithNewTerm);
+ } else {
+ _catchupState->abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithError);
+ }
}
_applierState = ApplierState::Running;
_externalState->startProducerIfStopped();
@@ -2825,7 +2833,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// No catchup in single node replica set.
if (_repl->_rsConfig.getNumMembers() == 1) {
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -2834,7 +2842,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// When catchUpTimeoutMillis is 0, we skip doing catchup entirely.
if (catchupTimeout == ReplSetConfig::kCatchUpDisabled) {
log() << "Skipping primary catchup since the catchup timeout is 0.";
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -2849,7 +2857,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
return;
}
log() << "Catchup timed out after becoming primary.";
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kTimedOut);
};
// Deal with infinity and overflow - no timeout.
@@ -2862,15 +2870,18 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
auto status = _repl->_replExecutor->scheduleWorkAt(timeoutDate, timeoutCB);
if (!status.isOK()) {
log() << "Failed to schedule catchup timeout work.";
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithError);
return;
}
_timeoutCbh = status.getValue();
}
-void ReplicationCoordinatorImpl::CatchupState::abort_inlock() {
+void ReplicationCoordinatorImpl::CatchupState::abort_inlock(PrimaryCatchUpConclusionReason reason) {
invariant(_repl->_getMemberState_inlock().primary());
+ ReplicationMetrics::get(getGlobalServiceContext())
+ .incrementNumCatchUpsConcludedForReason(reason);
+
log() << "Exited primary catch-up mode.";
// Clean up its own members.
if (_timeoutCbh) {
@@ -2898,7 +2909,7 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (*targetOpTime <= myLastApplied) {
log() << "Caught up to the latest optime known via heartbeats after becoming primary. "
<< "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied;
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kAlreadyCaughtUp);
return;
}
@@ -2923,14 +2934,14 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (*targetOpTime <= myLastApplied) {
log() << "Caught up to the latest known optime successfully after becoming primary. "
<< "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied;
- abort_inlock();
+ abort_inlock(PrimaryCatchUpConclusionReason::kSucceeded);
}
};
_waiter = stdx::make_unique<CallbackWaiter>(*targetOpTime, targetOpTimeCB);
_repl->_opTimeWaiterList.add_inlock(_waiter.get());
}
-Status ReplicationCoordinatorImpl::abortCatchupIfNeeded() {
+Status ReplicationCoordinatorImpl::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
if (!isV1ElectionProtocol()) {
return Status(ErrorCodes::CommandNotSupported,
"Primary catch-up is only supported by Protocol Version 1");
@@ -2938,7 +2949,7 @@ Status ReplicationCoordinatorImpl::abortCatchupIfNeeded() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
if (_catchupState) {
- _catchupState->abort_inlock();
+ _catchupState->abort_inlock(reason);
return Status::OK();
}
return Status(ErrorCodes::IllegalOperation, "The node is not in catch-up mode.");
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 8dfd864e3e6..31cc38b801e 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -312,7 +312,7 @@ public:
virtual Status stepUpIfEligible(bool skipDryRun) override;
- virtual Status abortCatchupIfNeeded() override;
+ virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
void signalDropPendingCollectionsRemovedFromStorage() final;
@@ -553,7 +553,7 @@ private:
// start() can only be called once.
void start_inlock();
// Reset the state itself to destruct the state.
- void abort_inlock();
+ void abort_inlock(PrimaryCatchUpConclusionReason reason);
// Heartbeat calls this function to update the target optime.
void signalHeartbeatUpdate_inlock();
@@ -863,12 +863,14 @@ private:
* This job will be scheduled to run in DB worker threads.
*/
void _writeLastVoteForMyElection(LastVote lastVote,
- const executor::TaskExecutor::CallbackArgs& cbData);
+ const executor::TaskExecutor::CallbackArgs& cbData,
+ TopologyCoordinator::StartElectionReason reason);
/**
* Starts VoteRequester to run the real election when last vote write has completed.
*/
- void _startVoteRequester_inlock(long long newTerm);
+ void _startVoteRequester_inlock(long long newTerm,
+ TopologyCoordinator::StartElectionReason reason);
/**
* Callback called when the VoteRequester has completed; checks the results and
@@ -876,7 +878,8 @@ private:
* "originalTerm" was the term during which the election began, if the term has since
* changed, do not step up as primary.
*/
- void _onVoteRequestComplete(long long originalTerm);
+ void _onVoteRequestComplete(long long originalTerm,
+ TopologyCoordinator::StartElectionReason reason);
/**
* Callback called after a random delay, to prevent repeated election ties.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index 607e28fcc2a..c5c8c44df93 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -233,8 +233,8 @@ void ReplicationCoordinatorImpl::_startRealElection_inlock(
LastVote lastVote{newTerm, _selfIndex};
auto cbStatus = _replExecutor->scheduleWork(
- [this, lastVote](const executor::TaskExecutor::CallbackArgs& cbData) {
- _writeLastVoteForMyElection(lastVote, cbData);
+ [this, lastVote, reason](const executor::TaskExecutor::CallbackArgs& cbData) {
+ _writeLastVoteForMyElection(lastVote, cbData, reason);
});
if (cbStatus.getStatus() == ErrorCodes::ShutdownInProgress) {
return;
@@ -244,7 +244,9 @@ void ReplicationCoordinatorImpl::_startRealElection_inlock(
}
void ReplicationCoordinatorImpl::_writeLastVoteForMyElection(
- LastVote lastVote, const executor::TaskExecutor::CallbackArgs& cbData) {
+ LastVote lastVote,
+ const executor::TaskExecutor::CallbackArgs& cbData,
+ TopologyCoordinator::StartElectionReason reason) {
// storeLocalLastVoteDocument can call back in to the replication coordinator,
// so _mutex must be unlocked here. However, we cannot return until we
// lock it because we want to lose the election on cancel or error and
@@ -274,13 +276,14 @@ void ReplicationCoordinatorImpl::_writeLastVoteForMyElection(
<< lastVote.getTerm() << ", current term: " << _topCoord->getTerm();
return;
}
- _startVoteRequester_inlock(lastVote.getTerm());
+ _startVoteRequester_inlock(lastVote.getTerm(), reason);
_replExecutor->signalEvent(_electionDryRunFinishedEvent);
lossGuard.dismiss();
}
-void ReplicationCoordinatorImpl::_startVoteRequester_inlock(long long newTerm) {
+void ReplicationCoordinatorImpl::_startVoteRequester_inlock(
+ long long newTerm, TopologyCoordinator::StartElectionReason reason) {
const auto lastOpTime = _getMyLastAppliedOpTime_inlock();
_voteRequester.reset(new VoteRequester);
@@ -291,15 +294,17 @@ void ReplicationCoordinatorImpl::_startVoteRequester_inlock(long long newTerm) {
}
fassert(28643, nextPhaseEvh.getStatus());
_replExecutor
- ->onEvent(
- nextPhaseEvh.getValue(),
- [=](const executor::TaskExecutor::CallbackArgs&) { _onVoteRequestComplete(newTerm); })
+ ->onEvent(nextPhaseEvh.getValue(),
+ [=](const executor::TaskExecutor::CallbackArgs&) {
+ _onVoteRequestComplete(newTerm, reason);
+ })
.status_with_transitional_ignore();
}
MONGO_FAIL_POINT_DEFINE(electionHangsBeforeUpdateMemberState);
-void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) {
+void ReplicationCoordinatorImpl::_onVoteRequestComplete(
+ long long newTerm, TopologyCoordinator::StartElectionReason reason) {
stdx::unique_lock<stdx::mutex> lk(_mutex);
LoseElectionGuardV1 lossGuard(this);
@@ -323,6 +328,8 @@ void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) {
return;
case VoteRequester::Result::kSuccessfullyElected:
log() << "election succeeded, assuming primary role in term " << _topCoord->getTerm();
+ ReplicationMetrics::get(getServiceContext())
+ .incrementNumElectionsSuccessfulForReason(reason);
break;
case VoteRequester::Result::kPrimaryRespondedNo:
// This is impossible because we would only require the primary's
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 88fc5cd5208..4970940746c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -242,6 +242,20 @@ TEST_F(ReplCoordTest, ElectionSucceedsWhenNodeIsTheOnlyNode) {
getReplCoord()->fillIsMasterForReplSet(&imResponse, {});
ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString();
ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString();
+
+ // Check that only the 'numCatchUpsSkipped' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtxPtr.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(ReplCoordTest, ElectionSucceedsWhenAllNodesVoteYea) {
@@ -276,15 +290,21 @@ TEST_F(ReplCoordTest, ElectionSucceedsWhenAllNodesVoteYea) {
stopCapturingLogMessages();
ASSERT_EQUALS(1, countLogLinesContaining("election succeeded"));
- // Check that the numElectionTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numElectionTimeoutsCalled and the numElectionTimeoutsSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(ReplCoordTest, ElectionSucceedsWhenMaxSevenNodesVoteYea) {
@@ -1481,15 +1501,21 @@ TEST_F(TakeoverTest, SuccessfulCatchupTakeover) {
TopologyCoordinator::StartElectionReason::kCatchupTakeover,
lastVoteExpected);
- // Check that the numCatchUpTakeoversCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numCatchUpTakeoversCalled and the numCatchUpTakeoversSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(TakeoverTest, CatchupTakeoverDryRunFailsPrimarySaysNo) {
@@ -1848,15 +1874,21 @@ TEST_F(TakeoverTest, SuccessfulPriorityTakeover) {
TopologyCoordinator::StartElectionReason::kPriorityTakeover,
lastVoteExpected);
- // Check that the numPriorityTakeoversCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numPriorityTakeoversCalled and the numPriorityTakeoversSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(1,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) {
@@ -2309,6 +2341,18 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
// Heartbeats set a future target OpTime and we reached that successfully.
@@ -2335,6 +2379,17 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, CatchupTimeout) {
@@ -2358,6 +2413,17 @@ TEST_F(PrimaryCatchUpTest, CatchupTimeout) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsTimedOut' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) {
@@ -2386,6 +2452,18 @@ TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) {
@@ -2414,6 +2492,18 @@ TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) {
@@ -2440,6 +2530,18 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Since the primary stepped down in catchup mode because it saw a higher term, check that only
+ // the 'numCatchUpsFailedWithNewTerm' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
@@ -2472,6 +2574,18 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Since the primary stepped down in catchup mode because it saw a higher term, check that only
+ // the 'numCatchUpsFailedWithNewTerm' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
@@ -2493,6 +2607,21 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
stopCapturingLogMessages();
ASSERT_EQUALS(1, countLogLinesContaining("Caught up to the latest"));
+ // Check that the number of elections requiring primary catchup was incremented.
+ auto opCtx = makeOperationContext();
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+
// Step down during drain mode.
TopologyCoordinator::UpdateTermResult updateTermResult;
auto evh = replCoord->updateTerm_forTest(2, &updateTermResult);
@@ -2511,7 +2640,6 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
net->scheduleResponse(noi, net->now(), makeHeartbeatResponse(time2));
});
ASSERT(replCoord->getApplierState() == ApplierState::Draining);
- auto opCtx = makeOperationContext();
{
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_FALSE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test"));
@@ -2521,8 +2649,20 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
ASSERT(replCoord->getApplierState() == ApplierState::Stopped);
ASSERT_TRUE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test"));
- // Check that the number of elections requiring primary catchup was incremented.
+ // Check that the number of elections requiring primary catchup was not incremented again.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was
+ // incremented by this election; 'numCatchUpsSucceeded' still reflects the earlier election.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
@@ -2589,6 +2729,17 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
@@ -2623,7 +2774,9 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
// Simulate a user initiated abort.
- ASSERT_OK(getReplCoord()->abortCatchupIfNeeded());
+ ASSERT_OK(getReplCoord()->abortCatchupIfNeeded(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason::
+ kFailedWithReplSetAbortPrimaryCatchUpCmd));
ASSERT(getReplCoord()->getApplierState() == ApplierState::Draining);
stopCapturingLogMessages();
@@ -2637,6 +2790,18 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
// Check that the number of elections requiring primary catchup was incremented.
ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd' primary catchup
+ // conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(1,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
TEST_F(PrimaryCatchUpTest, ZeroTimeout) {
@@ -2654,6 +2819,57 @@ TEST_F(PrimaryCatchUpTest, ZeroTimeout) {
// Check that the number of elections requiring primary catchup was not incremented.
ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsSkipped' primary catchup conclusion reason was incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+}
+
+TEST_F(PrimaryCatchUpTest, CatchUpFailsDueToPrimaryStepDown) {
+ startCapturingLogMessages();
+
+ OpTime time1(Timestamp(100, 1), 0);
+ OpTime time2(Timestamp(100, 2), 0);
+ ReplSetConfig config = setUp3NodeReplSetAndRunForElection(time1);
+ // Step down in the middle of catchup.
+ auto abortTime = getNet()->now() + config.getCatchUpTimeoutPeriod() / 2;
+ replyHeartbeatsAndRunUntil(abortTime, [this, time2](const NetworkOpIter noi) {
+ // Other nodes are ahead of me.
+ getNet()->scheduleResponse(noi, getNet()->now(), makeHeartbeatResponse(time2));
+ });
+ ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
+
+ auto opCtx = makeOperationContext();
+ ASSERT_OK(getReplCoord()->stepDown(opCtx.get(), true, Milliseconds(0), Milliseconds(1000)));
+ ASSERT_TRUE(getReplCoord()->getMemberState().secondary());
+ ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
+
+ stopCapturingLogMessages();
+ ASSERT_EQUALS(1, countLogLinesContaining("Exited primary catch-up mode"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Caught up to the latest"));
+ ASSERT_EQUALS(0, countLogLinesContaining("Catchup timed out"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
+
+ // Check that only the 'numCatchUpsFailedWithError' primary catchup conclusion reason was
+ // incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting());
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting());
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting());
+ ASSERT_EQ(0,
+ ReplicationMetrics::get(opCtx.get())
+ .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}
} // namespace
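The same seven-counter assertion block recurs in each catch-up test above. A possible follow-up cleanup, not part of this patch, would be a shared helper in the PrimaryCatchUpTest fixture; a minimal sketch, assuming only the _forTesting accessors shown in these hunks:

// Hypothetical helper (illustrative only, not in this patch): verify that the catch-up
// conclusion counters have exactly the expected values after a catchup attempt.
void assertCatchUpConclusionCounts(OperationContext* opCtx,
                                   int succeeded,
                                   int alreadyCaughtUp,
                                   int skipped,
                                   int timedOut,
                                   int failedWithError,
                                   int failedWithNewTerm,
                                   int failedWithAbortCmd) {
    // Each counter tracks one PrimaryCatchUpConclusionReason; exactly one should move per catchup.
    auto& metrics = ReplicationMetrics::get(opCtx);
    ASSERT_EQ(succeeded, metrics.getNumCatchUpsSucceeded_forTesting());
    ASSERT_EQ(alreadyCaughtUp, metrics.getNumCatchUpsAlreadyCaughtUp_forTesting());
    ASSERT_EQ(skipped, metrics.getNumCatchUpsSkipped_forTesting());
    ASSERT_EQ(timedOut, metrics.getNumCatchUpsTimedOut_forTesting());
    ASSERT_EQ(failedWithError, metrics.getNumCatchUpsFailedWithError_forTesting());
    ASSERT_EQ(failedWithNewTerm, metrics.getNumCatchUpsFailedWithNewTerm_forTesting());
    ASSERT_EQ(failedWithAbortCmd,
              metrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
}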
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index 55bbe9d249f..e4f6bca7fed 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -2216,15 +2216,21 @@ TEST_F(ReplCoordTest, SingleNodeReplSetUnfreeze) {
getNet()->exitNetwork();
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
- // Check that the numFreezeTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numFreezeTimeoutsCalled and the numFreezeTimeoutsSuccessful election metrics
+ // have been incremented, and that none of the metrics that track the number of elections called
+ // or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingleNodeSet) {
@@ -2241,15 +2247,21 @@ TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingle
auto opCtx = makeOperationContext();
runSingleNodeElection(opCtx.get());
- // Check that the numElectionTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented.
+ // Check that the numElectionTimeoutsCalled and the numElectionTimeoutsSuccessful election
+ // metrics have been incremented, and that none of the metrics that track the number of
+ // elections called or successful for other reasons has been incremented.
ServiceContext* svcCtx = getServiceContext();
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
ASSERT_OK(getReplCoord()->stepDown(opCtx.get(), true, Milliseconds(0), Milliseconds(1000)));
getNet()->enterNetwork(); // Must do this before inspecting the topocoord
@@ -2266,15 +2278,22 @@ TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingle
getNet()->exitNetwork();
ASSERT_TRUE(getReplCoord()->getMemberState().primary());
- // Check that the numFreezeTimeoutsCalled election metric has been incremented, and that none
- // of the metrics that track the number of elections called for other reasons has been
- // incremented. When a stepdown timeout expires in a single node replica set, an election is
- // called for the same reason as is used when a freeze timeout expires.
+ // Check that the numFreezeTimeoutsCalled and the numFreezeTimeoutsSuccessful election metrics
+ // have been incremented, and that none of the metrics that track the number of elections called
+ // or successful for other reasons has been incremented. When a stepdown timeout expires in a
+ // single node replica set, an election is called for the same reason as is used when a freeze
+ // timeout expires.
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversCalled_forTesting());
ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsCalled_forTesting());
ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsCalled_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumStepUpCmdsSuccessful_forTesting());
+ ASSERT_EQUALS(0,
+ ReplicationMetrics::get(svcCtx).getNumPriorityTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(0, ReplicationMetrics::get(svcCtx).getNumCatchUpTakeoversSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumElectionTimeoutsSuccessful_forTesting());
+ ASSERT_EQUALS(1, ReplicationMetrics::get(svcCtx).getNumFreezeTimeoutsSuccessful_forTesting());
}
TEST_F(
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index 6bc921b4628..5e06cf589f4 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -489,7 +489,7 @@ void ReplicationCoordinatorMock::alwaysAllowWrites(bool allowWrites) {
_alwaysAllowWrites = allowWrites;
}
-Status ReplicationCoordinatorMock::abortCatchupIfNeeded() {
+Status ReplicationCoordinatorMock::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
return Status::OK();
}
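With the new signature, every caller of abortCatchupIfNeeded() must state why catchup is being abandoned. As the InfiniteTimeoutAndAbort test above shows, the replSetAbortPrimaryCatchUp path passes kFailedWithReplSetAbortPrimaryCatchUpCmd; a minimal sketch of that call shape (illustrative only, the real handler change is in repl_set_commands.cpp):

// Illustrative helper, not part of this patch: abort catchup on behalf of the
// replSetAbortPrimaryCatchUp command and record the matching conclusion reason.
Status abortPrimaryCatchUpForCommand(ReplicationCoordinator* replCoord) {
    return replCoord->abortCatchupIfNeeded(
        ReplicationCoordinator::PrimaryCatchUpConclusionReason::
            kFailedWithReplSetAbortPrimaryCatchUpCmd);
}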
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index 7decacbdc96..7cc450ed939 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -285,7 +285,7 @@ public:
return _service;
}
- virtual Status abortCatchupIfNeeded() override;
+ virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
void signalDropPendingCollectionsRemovedFromStorage() final;
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index bcc04ea0f53..0f9a3491019 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -59,31 +59,59 @@ void ReplicationMetrics::incrementNumElectionsCalledForReason(
case TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun: {
ElectionReasonCounter& stepUpCmd = _electionMetrics.getStepUpCmd();
stepUpCmd.incrementCalled();
- _electionMetrics.setStepUpCmd(stepUpCmd);
break;
}
case TopologyCoordinator::StartElectionReason::kPriorityTakeover: {
ElectionReasonCounter& priorityTakeover = _electionMetrics.getPriorityTakeover();
priorityTakeover.incrementCalled();
- _electionMetrics.setPriorityTakeover(priorityTakeover);
break;
}
case TopologyCoordinator::StartElectionReason::kCatchupTakeover: {
ElectionReasonCounter& catchUpTakeover = _electionMetrics.getCatchUpTakeover();
catchUpTakeover.incrementCalled();
- _electionMetrics.setCatchUpTakeover(catchUpTakeover);
break;
}
case TopologyCoordinator::StartElectionReason::kElectionTimeout: {
ElectionReasonCounter& electionTimeout = _electionMetrics.getElectionTimeout();
electionTimeout.incrementCalled();
- _electionMetrics.setElectionTimeout(electionTimeout);
break;
}
case TopologyCoordinator::StartElectionReason::kSingleNodePromptElection: {
ElectionReasonCounter& freezeTimeout = _electionMetrics.getFreezeTimeout();
freezeTimeout.incrementCalled();
- _electionMetrics.setFreezeTimeout(freezeTimeout);
+ break;
+ }
+ }
+}
+
+void ReplicationMetrics::incrementNumElectionsSuccessfulForReason(
+ TopologyCoordinator::StartElectionReason reason) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ switch (reason) {
+ case TopologyCoordinator::StartElectionReason::kStepUpRequest:
+ case TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun: {
+ ElectionReasonCounter& stepUpCmd = _electionMetrics.getStepUpCmd();
+ stepUpCmd.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kPriorityTakeover: {
+ ElectionReasonCounter& priorityTakeover = _electionMetrics.getPriorityTakeover();
+ priorityTakeover.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kCatchupTakeover: {
+ ElectionReasonCounter& catchUpTakeover = _electionMetrics.getCatchUpTakeover();
+ catchUpTakeover.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kElectionTimeout: {
+ ElectionReasonCounter& electionTimeout = _electionMetrics.getElectionTimeout();
+ electionTimeout.incrementSuccessful();
+ break;
+ }
+ case TopologyCoordinator::StartElectionReason::kSingleNodePromptElection: {
+ ElectionReasonCounter& freezeTimeout = _electionMetrics.getFreezeTimeout();
+ freezeTimeout.incrementSuccessful();
break;
}
}
@@ -100,6 +128,40 @@ void ReplicationMetrics::incrementNumCatchUps() {
_electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1);
}
+void ReplicationMetrics::incrementNumCatchUpsConcludedForReason(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason reason) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ switch (reason) {
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kSucceeded:
+ _electionMetrics.setNumCatchUpsSucceeded(_electionMetrics.getNumCatchUpsSucceeded() +
+ 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kAlreadyCaughtUp:
+ _electionMetrics.setNumCatchUpsAlreadyCaughtUp(
+ _electionMetrics.getNumCatchUpsAlreadyCaughtUp() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kSkipped:
+ _electionMetrics.setNumCatchUpsSkipped(_electionMetrics.getNumCatchUpsSkipped() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kTimedOut:
+ _electionMetrics.setNumCatchUpsTimedOut(_electionMetrics.getNumCatchUpsTimedOut() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError:
+ _electionMetrics.setNumCatchUpsFailedWithError(
+ _electionMetrics.getNumCatchUpsFailedWithError() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithNewTerm:
+ _electionMetrics.setNumCatchUpsFailedWithNewTerm(
+ _electionMetrics.getNumCatchUpsFailedWithNewTerm() + 1);
+ break;
+ case ReplicationCoordinator::PrimaryCatchUpConclusionReason::
+ kFailedWithReplSetAbortPrimaryCatchUpCmd:
+ _electionMetrics.setNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd(
+ _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd() + 1);
+ break;
+ }
+}
+
int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getStepUpCmd().getCalled();
@@ -130,11 +192,71 @@ int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() {
return _electionMetrics.getNumStepDownsCausedByHigherTerm();
}
+int ReplicationMetrics::getNumStepUpCmdsSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getStepUpCmd().getSuccessful();
+}
+
+int ReplicationMetrics::getNumPriorityTakeoversSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getPriorityTakeover().getSuccessful();
+}
+
+int ReplicationMetrics::getNumCatchUpTakeoversSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getCatchUpTakeover().getSuccessful();
+}
+
+int ReplicationMetrics::getNumElectionTimeoutsSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getElectionTimeout().getSuccessful();
+}
+
+int ReplicationMetrics::getNumFreezeTimeoutsSuccessful_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getFreezeTimeout().getSuccessful();
+}
+
int ReplicationMetrics::getNumCatchUps_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUps();
}
+int ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsSucceeded();
+}
+
+int ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsAlreadyCaughtUp();
+}
+
+int ReplicationMetrics::getNumCatchUpsSkipped_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsSkipped();
+}
+
+int ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsTimedOut();
+}
+
+int ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsFailedWithError();
+}
+
+int ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsFailedWithNewTerm();
+}
+
+int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd();
+}
+
BSONObj ReplicationMetrics::getElectionMetricsBSON() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.toBSON();
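The hunks above also drop the setStepUpCmd()/setPriorityTakeover()/setCatchUpTakeover()/setElectionTimeout()/setFreezeTimeout() calls that used to follow each incrementCalled(): the getters return a reference into _electionMetrics, so bumping the counter through that reference already updates the stored metrics. A simplified stand-in for the counter type, shown only to illustrate that pattern (the real ElectionReasonCounter is IDL-backed and lives in election_reason_counter.h, which this patch also touches):

// Simplified illustration only; not the actual class definition.
class ElectionReasonCounterSketch {
public:
    void incrementCalled() { ++_called; }
    void incrementSuccessful() { ++_successful; }
    long long getCalled() const { return _called; }
    long long getSuccessful() const { return _successful; }

private:
    long long _called = 0;      // elections started for this reason
    long long _successful = 0;  // elections won for this reason
};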
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index 51dcd66ec77..de2c9876f68 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -49,16 +49,31 @@ public:
~ReplicationMetrics();
void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason);
+ void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason);
void incrementNumStepDownsCausedByHigherTerm();
void incrementNumCatchUps();
+ void incrementNumCatchUpsConcludedForReason(
+ ReplicationCoordinator::PrimaryCatchUpConclusionReason reason);
int getNumStepUpCmdsCalled_forTesting();
int getNumPriorityTakeoversCalled_forTesting();
int getNumCatchUpTakeoversCalled_forTesting();
int getNumElectionTimeoutsCalled_forTesting();
int getNumFreezeTimeoutsCalled_forTesting();
+ int getNumStepUpCmdsSuccessful_forTesting();
+ int getNumPriorityTakeoversSuccessful_forTesting();
+ int getNumCatchUpTakeoversSuccessful_forTesting();
+ int getNumElectionTimeoutsSuccessful_forTesting();
+ int getNumFreezeTimeoutsSuccessful_forTesting();
int getNumStepDownsCausedByHigherTerm_forTesting();
int getNumCatchUps_forTesting();
+ int getNumCatchUpsSucceeded_forTesting();
+ int getNumCatchUpsAlreadyCaughtUp_forTesting();
+ int getNumCatchUpsSkipped_forTesting();
+ int getNumCatchUpsTimedOut_forTesting();
+ int getNumCatchUpsFailedWithError_forTesting();
+ int getNumCatchUpsFailedWithNewTerm_forTesting();
+ int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting();
BSONObj getElectionMetricsBSON();
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index e500814e44e..c9ddd9f2027 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -86,6 +86,39 @@ structs:
was behind"
type: long
default: 0
+ numCatchUpsSucceeded:
+ description: "Number of times primary catchup concluded because the primary caught
+ up to the latest known optime successfully"
+ type: long
+ default: 0
+ numCatchUpsAlreadyCaughtUp:
+ description: "Number of times primary catchup concluded because the primary was
+ already caught up"
+ type: long
+ default: 0
+ numCatchUpsSkipped:
+ description: "Number of times primary catchup concluded because it was skipped"
+ type: long
+ default: 0
+ numCatchUpsTimedOut:
+ description: "Number of times primary catchup concluded because it timed out"
+ type: long
+ default: 0
+ numCatchUpsFailedWithError:
+ description: "Number of times primary catchup concluded because it failed with an
+ error"
+ type: long
+ default: 0
+ numCatchUpsFailedWithNewTerm:
+ description: "Number of times primary catchup concluded because the primary stepped
+ down on seeing a higher term"
+ type: long
+ default: 0
+ numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd:
+ description: "Number of times primary catchup concluded because of the
+ replSetAbortPrimaryCatchUp command"
+ type: long
+ default: 0
ElectionCandidateMetrics:
description: "Stores metrics that are specific to the last election in which the node was a
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index 185c0b276ef..2b36011e3b1 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -321,7 +321,7 @@ Status ReplicationCoordinatorEmbedded::processReplSetInitiate(OperationContext*,
UASSERT_NOT_IMPLEMENTED;
}
-Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded() {
+Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
UASSERT_NOT_IMPLEMENTED;
}
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index afedb2b2e2e..b591d352f43 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -235,7 +235,7 @@ public:
Status stepUpIfEligible(bool skipDryRun) override;
- Status abortCatchupIfNeeded() override;
+ Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
void signalDropPendingCollectionsRemovedFromStorage() final;