27 files changed, 987 insertions(+), 38 deletions(-)
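Before the per-file diff, a brief illustration of what this change exposes. The sketch below is not part of the commit; it is a mongo-shell snippet (assuming a connection to a replica-set primary running a build that includes this change) showing where the new primary-catchup counters, the replSetStepDown/replSetStepDownWithForce command metrics, and the electionCandidateMetrics section surface:

    // The new catchup counters live under serverStatus().electionMetrics.
    const status = db.adminCommand({serverStatus: 1});
    printjson({
        numCatchUps: status.electionMetrics.numCatchUps,
        numCatchUpsSucceeded: status.electionMetrics.numCatchUpsSucceeded,
        numCatchUpsAlreadyCaughtUp: status.electionMetrics.numCatchUpsAlreadyCaughtUp,
        numCatchUpsSkipped: status.electionMetrics.numCatchUpsSkipped,
        numCatchUpsTimedOut: status.electionMetrics.numCatchUpsTimedOut,
        numCatchUpsFailedWithError: status.electionMetrics.numCatchUpsFailedWithError,
        numCatchUpsFailedWithNewTerm: status.electionMetrics.numCatchUpsFailedWithNewTerm,
        numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd:
            status.electionMetrics.numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd
    });

    // Stepdown command counters are reported with the other command metrics,
    // including the new commands.replSetStepDownWithForce total/failed pair.
    printjson(status.metrics.commands.replSetStepDown);
    printjson(status.metrics.commands.replSetStepDownWithForce);

    // While the node is primary, replSetGetStatus reports an electionCandidateMetrics
    // section (newTermStartDate, targetCatchupOpTime, numCatchUpOps); it is cleared
    // when the node steps down.
    const rsStatus = db.adminCommand({replSetGetStatus: 1});
    printjson(rsStatus.electionCandidateMetrics);

The tests in the diff below assert exactly these fields: catchup.js and catchup_takeover_two_nodes_ahead.js cover the electionMetrics counters, while stepdown.js and unconditional_step_down.js cover the command metrics and the clearing of electionCandidateMetrics on stepdown.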
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js index c28f5f59a31..9d90c51099c 100644 --- a/jstests/replsets/catchup.js +++ b/jstests/replsets/catchup.js @@ -5,6 +5,7 @@ load("jstests/libs/check_log.js"); load("jstests/libs/write_concern_util.js"); +load("jstests/replsets/libs/election_metrics.js"); load("jstests/replsets/rslib.js"); var name = "catch_up"; @@ -65,6 +66,7 @@ function stopReplicationAndEnforceNewPrimaryToCatchUp() { assert.writeOK(oldPrimary.getDB("test").foo.insert({x: i})); } var latestOpOnOldPrimary = getLatestOp(oldPrimary); + // New primary wins immediately, but needs to catch up. var newPrimary = stepUpNode(oldSecondaries[0]); var latestOpOnNewPrimary = getLatestOp(newPrimary); @@ -97,19 +99,102 @@ function reconfigElectionAndCatchUpTimeout(electionTimeout, catchupTimeout) { rst.awaitReplication(); jsTest.log("Case 1: The primary is up-to-date after refreshing heartbeats."); +let initialNewPrimaryStatus = + assert.commandWorked(rst.getSecondary().adminCommand({serverStatus: 1})); + // Should complete transition to primary immediately. var newPrimary = stepUpNode(rst.getSecondary()); // Should win an election and finish the transition very quickly. assert.eq(newPrimary, rst.getPrimary()); rst.awaitReplication(); +// Check that the 'numCatchUps' field has not been incremented in serverStatus. +let newNewPrimaryStatus = assert.commandWorked(newPrimary.adminCommand({serverStatus: 1})); +verifyServerStatusChange( + initialNewPrimaryStatus.electionMetrics, newNewPrimaryStatus.electionMetrics, 'numCatchUps', 0); +// Check that the 'numCatchUpsAlreadyCaughtUp' field has been incremented in serverStatus, and +// that none of the other reasons for catchup concluding has been incremented. +verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics, + newNewPrimaryStatus.electionMetrics, + 'numCatchUpsAlreadyCaughtUp'); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have +// a 'targetCatchupOpTime' field if the target opTime for catchup is not set. +let res = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1})); +assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); +assert(!res.electionCandidateMetrics.targetCatchupOpTime, + () => "Response should not have an 'electionCandidateMetrics.targetCatchupOpTime' field: " + + tojson(res.electionCandidateMetrics)); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has a +// 'numCatchUpOps' field once the primary is caught up, and that it has the correct value. +assert(res.electionCandidateMetrics.numCatchUpOps, + () => "Response should have an 'electionCandidateMetrics.numCatchUpOps' field: " + + tojson(res.electionCandidateMetrics)); +assert.eq(res.electionCandidateMetrics.numCatchUpOps, 0); + jsTest.log("Case 2: The primary needs to catch up, succeeds in time."); +initialNewPrimaryStatus = + assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1})); + var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have +// a 'newTermStartDate' field before the transition to primary is complete. 
+res = assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1})); +assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); +assert(!res.electionCandidateMetrics.newTermStartDate, + () => "Response should not have an 'electionCandidateMetrics.newTermStartDate' field: " + + tojson(res.electionCandidateMetrics)); + // Disable fail point to allow replication. restartServerReplication(stepUpResults.oldSecondaries); // getPrimary() blocks until the primary finishes drain mode. assert.eq(stepUpResults.newPrimary, rst.getPrimary()); + +// Wait until the new primary completes the transition to primary and writes a no-op. +checkLog.contains(stepUpResults.newPrimary, "transition to primary complete"); +// Check that the new primary's term has been updated because of the no-op. +assert.eq(getLatestOp(stepUpResults.newPrimary).t, stepUpResults.latestOpOnNewPrimary.t + 1); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has a +// 'newTermStartDate' field once the transition to primary is complete. +res = assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1})); +assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); +assert(res.electionCandidateMetrics.newTermStartDate, + () => "Response should have an 'electionCandidateMetrics.newTermStartDate' field: " + + tojson(res.electionCandidateMetrics)); + +// Check that the 'numCatchUps' field has been incremented in serverStatus. +newNewPrimaryStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); +verifyServerStatusChange( + initialNewPrimaryStatus.electionMetrics, newNewPrimaryStatus.electionMetrics, 'numCatchUps', 1); +// Check that the 'numCatchUpsSucceeded' field has been incremented in serverStatus, and that +// none of the other reasons for catchup concluding has been incremented. +verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics, + newNewPrimaryStatus.electionMetrics, + 'numCatchUpsSucceeded'); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has +// 'targetCatchupOpTime' and 'numCatchUpOps' fields once the primary is caught up, and that they +// have the correct values. +assert(res.electionCandidateMetrics.targetCatchupOpTime, + () => "Response should have an 'electionCandidateMetrics.targetCatchupOpTime' field: " + + tojson(res.electionCandidateMetrics)); +assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.ts, + stepUpResults.latestOpOnOldPrimary.ts); +assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.t, stepUpResults.latestOpOnOldPrimary.t); +assert(res.electionCandidateMetrics.numCatchUpOps, + () => "Response should have an 'electionCandidateMetrics.numCatchUpOps' field: " + + tojson(res.electionCandidateMetrics)); +// numCatchUpOps should be 4 because the 'foo' collection is implicitly created during the 3 +// inserts, and that's where the additional oplog entry comes from. +assert.eq(res.electionCandidateMetrics.numCatchUpOps, 4); + // Wait for all secondaries to catch up rst.awaitReplication(); // Check the latest op on old primary is preserved on the new one. 
@@ -146,6 +231,9 @@ stepUpResults.oldPrimary.reconnect(stepUpResults.newPrimary); rst.awaitReplication(); jsTest.log("Case 4: The primary needs to catch up, fails due to timeout."); +initialNewPrimaryStatus = + assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1})); + // Reconfig to make the catchup timeout shorter. reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, 10 * 1000); @@ -155,6 +243,14 @@ checkLog.contains(stepUpResults.newPrimary, "Catchup timed out after becoming pr restartServerReplication(stepUpResults.newPrimary); assert.eq(stepUpResults.newPrimary, rst.getPrimary()); +// Check that the 'numCatchUpsTimedOut' field has been incremented in serverStatus, and that +// none of the other reasons for catchup concluding has been incremented. +newNewPrimaryStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); +verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics, + newNewPrimaryStatus.electionMetrics, + 'numCatchUpsTimedOut'); + // Wait for the no-op "new primary" after winning an election, so that we know it has // finished transition to primary. assert.soon(function() { @@ -171,9 +267,21 @@ jsTest.log("Case 5: The primary needs to catch up with no timeout, then gets abo reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, -1); stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +initialNewPrimaryStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); + // Abort catchup. assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetAbortPrimaryCatchUp: 1})); +// Check that the 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd' field has been +// incremented in serverStatus, and that none of the other reasons for catchup concluding has +// been incremented. +newNewPrimaryStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); +verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics, + newNewPrimaryStatus.electionMetrics, + 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd'); + // Wait for the no-op "new primary" after winning an election, so that we know it has // finished transition to primary. assert.soon(function() { @@ -187,11 +295,22 @@ rst.awaitReplication(); checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 0); jsTest.log("Case 6: The primary needs to catch up with no timeout, but steps down."); +initialNewPrimaryStatus = + assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1})); + var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); // Step-down command should abort catchup. assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetStepDown: 60})); +// Check that the 'numCatchUpsFailedWithError' field has been incremented in serverStatus, and +// that none of the other reasons for catchup concluding has been incremented. +newNewPrimaryStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); +verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics, + newNewPrimaryStatus.electionMetrics, + 'numCatchUpsFailedWithError'); + // Rename the primary. 
var steppedDownPrimary = stepUpResults.newPrimary; var newPrimary = rst.getPrimary(); diff --git a/jstests/replsets/catchup_takeover_two_nodes_ahead.js b/jstests/replsets/catchup_takeover_two_nodes_ahead.js index 6203889af88..31b78302329 100644 --- a/jstests/replsets/catchup_takeover_two_nodes_ahead.js +++ b/jstests/replsets/catchup_takeover_two_nodes_ahead.js @@ -33,8 +33,6 @@ var primary = replSet.getPrimary(); var writeConcern = {writeConcern: {w: 2, wtimeout: replSet.kDefaultTimeoutMS}}; assert.writeOK(primary.getDB(name).bar.insert({x: 100}, writeConcern)); -const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1})); - // Write something so that node 0 is ahead of node 1. stopServerReplication(nodes[1]); writeConcern = { @@ -42,6 +40,9 @@ writeConcern = { }; assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern)); +const initialPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1})); +const initialNode2Status = assert.commandWorked(nodes[2].adminCommand({serverStatus: 1})); + // Step up one of the lagged nodes. assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1})); replSet.awaitNodesAgreeOnPrimary(); @@ -65,6 +66,13 @@ const newPrimaryStatus = assert.commandWorked(primary.adminCommand({serverStatus verifyServerStatusElectionReasonCounterChange( initialPrimaryStatus.electionMetrics, newPrimaryStatus.electionMetrics, "catchUpTakeover", 1); +// Check that the 'numCatchUpsFailedWithNewTerm' field has been incremented in serverStatus, and +// that none of the other reasons for catchup concluding has been incremented. +const newNode2Status = assert.commandWorked(nodes[2].adminCommand({serverStatus: 1})); +verifyCatchUpConclusionReason(initialNode2Status.electionMetrics, + newNode2Status.electionMetrics, + 'numCatchUpsFailedWithNewTerm'); + // Wait until the old primary steps down so the connections won't be closed. replSet.waitForState(2, ReplSetTest.State.SECONDARY, replSet.kDefaultTimeoutMS); // Let the nodes catchup. diff --git a/jstests/replsets/libs/election_metrics.js b/jstests/replsets/libs/election_metrics.js index 180123a5371..36307b2117e 100644 --- a/jstests/replsets/libs/election_metrics.js +++ b/jstests/replsets/libs/election_metrics.js @@ -9,10 +9,11 @@ */ function verifyServerStatusElectionReasonCounterValue(electionMetrics, fieldName, value) { const field = electionMetrics[fieldName]; - assert.eq(field["called"], value, `expected the 'called' field of ${fieldName} to be ${value}`); + assert.eq( + field["called"], value, `expected the 'called' field of '${fieldName}' to be ${value}`); assert.eq(field["successful"], value, - `expected the 'successful' field of ${fieldName} to be ${value}`); + `expected the 'successful' field of '${fieldName}' to be ${value}`); } /** @@ -25,9 +26,45 @@ function verifyServerStatusElectionReasonCounterChange( const newField = newElectionMetrics[fieldName]; assert.eq(initialField["called"] + expectedIncrement, newField["called"], - `expected the 'called' field of ${fieldName} to increase by ${expectedIncrement}`); + `expected the 'called' field of '${fieldName}' to increase by ${expectedIncrement}`); assert.eq( initialField["successful"] + expectedIncrement, newField["successful"], - `expected the 'successful' field of ${fieldName} to increase by ${expectedIncrement}`); + `expected the 'successful' field of '${fieldName}' to increase by ${expectedIncrement}`); +} + +/** + * Verifies that the given field in serverStatus is incremented in the way we expect. 
+ */ +function verifyServerStatusChange(initialStatus, newStatus, fieldName, expectedIncrement) { + assert.eq(initialStatus[fieldName] + expectedIncrement, + newStatus[fieldName], + `expected '${fieldName}' to increase by ${expectedIncrement}`); +} + +/** + * Verifies that the given reason for primary catchup concluding is incremented in serverStatus, and + * that none of the other reasons are. + */ +function verifyCatchUpConclusionReason(initialStatus, newStatus, fieldName) { + const catchUpConclusionMetrics = [ + "numCatchUpsSucceeded", + "numCatchUpsAlreadyCaughtUp", + "numCatchUpsSkipped", + "numCatchUpsTimedOut", + "numCatchUpsFailedWithError", + "numCatchUpsFailedWithNewTerm", + "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd" + ]; + + catchUpConclusionMetrics.forEach(function(metric) { + if (metric === fieldName) { + assert.eq(initialStatus[metric] + 1, + newStatus[metric], + `expected '${metric}' to increase by 1`); + } else { + assert.eq( + initialStatus[metric], newStatus[metric], `did not expect '${metric}' to increase`); + } + }); } diff --git a/jstests/replsets/server_election_metrics.js b/jstests/replsets/server_election_metrics.js index cb1ad5f1cf8..8e094492b6c 100644 --- a/jstests/replsets/server_election_metrics.js +++ b/jstests/replsets/server_election_metrics.js @@ -35,6 +35,15 @@ ${tojson(serverStatusResponse)}`)); verifyElectionReasonCounterFields(serverStatusResponse, "electionTimeout"); verifyElectionReasonCounterFields(serverStatusResponse, "freezeTimeout"); verifyElectionMetricsField(serverStatusResponse, "numStepDownsCausedByHigherTerm"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUps"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUpsSucceeded"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUpsAlreadyCaughtUp"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUpsSkipped"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUpsTimedOut"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUpsFailedWithError"); + verifyElectionMetricsField(serverStatusResponse, "numCatchUpsFailedWithNewTerm"); + verifyElectionMetricsField(serverStatusResponse, + "numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd"); } // Set up the replica set. diff --git a/jstests/replsets/stepdown.js b/jstests/replsets/stepdown.js index ecba409427f..6abdd335e92 100644 --- a/jstests/replsets/stepdown.js +++ b/jstests/replsets/stepdown.js @@ -2,10 +2,14 @@ * Check that on a loss of primary, another node doesn't assume primary if it is stale. We force a * stepDown to test this. * + * This test also checks that the serverStatus command metrics replSetStepDown and + * replSetStepDownWithForce are incremented correctly. + * * This test requires the fsync command to force a secondary to be stale. 
* @tags: [requires_fsync] */ +load("jstests/replsets/libs/election_metrics.js"); load("jstests/replsets/rslib.js"); // We are bypassing collection validation because this test runs "shutdown" command so the server is @@ -60,6 +64,11 @@ try { assert.writeOK(master.getDB("foo").bar.insert({x: i})); } + let res = assert.commandWorked(master.adminCommand({replSetGetStatus: 1})); + assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); + let intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1})); + jsTestLog('Do stepdown of primary ' + master + ' that should not work'); // this should fail, so we don't need to try/catch @@ -67,15 +76,126 @@ try { 'Step down ' + master + ' expected error: ' + tojson(assert.commandFailed(master.getDB("admin").runCommand({replSetStepDown: 10})))); + // Check that the 'total' and 'failed' fields of 'replSetStepDown' have been incremented in + // serverStatus and that they have not been incremented for 'replSetStepDownWithForce'. + let newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1})); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "failed", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "total", + 0); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "failed", + 0); + + // This section checks that the metrics are incremented accurately when the command fails due to + // an error occurring before stepDown is called in the replication coordinator, such as due to + // bad values or type mismatches in the arguments, or checkReplEnabledForCommand returning a bad + // status. The stepdown period being negative is one example of such an error, but success in + // this case gives us confidence that the behavior in the other cases is the same. + + // Stepdown should fail because the stepdown period is negative + jsTestLog('Do stepdown of primary ' + master + ' that should not work'); + assert.commandFailedWithCode( + master.getDB("admin").runCommand({replSetStepDown: -1, force: true}), ErrorCodes.BadValue); + + // Check that the 'total' and 'failed' fields of 'replSetStepDown' and + // 'replSetStepDownWithForce' have been incremented in serverStatus. 
+ intitialServerStatus = newServerStatus; + newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1})); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "failed", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "failed", + 1); + jsTestLog('Do stepdown of primary ' + master + ' that should work'); assert.commandWorked( master.adminCommand({replSetStepDown: ReplSetTest.kDefaultTimeoutMS, force: true})); + // Check that the 'total' fields of 'replSetStepDown' and 'replSetStepDownWithForce' have been + // incremented in serverStatus and that their 'failed' fields have not been incremented. + intitialServerStatus = newServerStatus; + newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1})); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "failed", + 0); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "failed", + 0); + jsTestLog('Checking isMaster on ' + master); var r2 = assert.commandWorked(master.getDB("admin").runCommand({ismaster: 1})); jsTestLog('Result from running isMaster on ' + master + ': ' + tojson(r2)); assert.eq(r2.ismaster, false); assert.eq(r2.secondary, true); + + // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has been + // cleared, since the node is no longer primary. + res = assert.commandWorked(master.adminCommand({replSetGetStatus: 1})); + assert(!res.electionCandidateMetrics, + () => "Response should not have an 'electionCandidateMetrics' field: " + tojson(res)); + + // This section checks that the metrics are incremented accurately when the command fails due to + // an error while stepping down. This is one reason the replSetStepDown command could fail once + // we call stepDown in the replication coordinator, but success in this case gives us confidence + // that the behavior in the other cases is the same. + + // Stepdown should fail because the node is no longer primary + jsTestLog('Do stepdown of primary ' + master + ' that should not work'); + assert.commandFailedWithCode(master.getDB("admin").runCommand( + {replSetStepDown: ReplSetTest.kDefaultTimeoutMS, force: true}), + ErrorCodes.NotMaster); + + // Check that the 'total' and 'failed' fields of 'replSetStepDown' and + // 'replSetStepDownWithForce' have been incremented in serverStatus. 
+ intitialServerStatus = newServerStatus; + newServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1})); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDown, + newServerStatus.metrics.commands.replSetStepDown, + "failed", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "total", + 1); + verifyServerStatusChange(intitialServerStatus.metrics.commands.replSetStepDownWithForce, + newServerStatus.metrics.commands.replSetStepDownWithForce, + "failed", + 1); } catch (e) { throw e; } finally { diff --git a/jstests/replsets/unconditional_step_down.js b/jstests/replsets/unconditional_step_down.js index c9f95bcb1ac..4d5e37821db 100644 --- a/jstests/replsets/unconditional_step_down.js +++ b/jstests/replsets/unconditional_step_down.js @@ -106,6 +106,10 @@ function runStepDownTest({testMsg, stepDownFn, toRemovedState}) { jsTestLog("Wait for write cmd to reach the fail point"); waitForCurOpByFailPoint(primaryDB, collNss, writeFailPoint); + let res = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1})); + assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); + jsTestLog("Trigger step down"); var oldConfig = stepDownFn(); @@ -119,6 +123,16 @@ function runStepDownTest({testMsg, stepDownFn, toRemovedState}) { (toRemovedState) ? ReplSetTest.State.REMOVED : ReplSetTest.State.SECONDARY); assert.commandWorked(primary.adminCommand({configureFailPoint: writeFailPoint, mode: "off"})); + + // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has been + // cleared, since the node is no longer primary. + if (!toRemovedState) { + res = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1})); + assert( + !res.electionCandidateMetrics, + () => "Response should not have an 'electionCandidateMetrics' field: " + tojson(res)); + } + // Get the new primary. 
refreshConnection(); } diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 779fb0a3275..ce402e8c0b4 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1711,6 +1711,7 @@ env.Library( '$BUILD_DIR/mongo/db/op_observer', '$BUILD_DIR/mongo/db/query_exec', '$BUILD_DIR/mongo/db/repl/oplog_buffer_proxy', + '$BUILD_DIR/mongo/db/repl/replication_metrics', '$BUILD_DIR/mongo/db/s/balancer', '$BUILD_DIR/mongo/db/s/sharding_runtime_d', '$BUILD_DIR/mongo/db/service_context', diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 5b5b85c6af5..53d3e6a3612 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -300,7 +300,11 @@ void BackgroundSync::_produce() { log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen << " from " << syncSourceResp.getSyncSource(); - _replCoord->abortCatchupIfNeeded().transitional_ignore(); + auto status = _replCoord->abortCatchupIfNeeded( + ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError); + if (!status.isOK()) { + LOG(1) << "Aborting catch-up failed with status: " << status; + } return; } @@ -534,7 +538,11 @@ void BackgroundSync::_runRollback(OperationContext* opCtx, StorageInterface* storageInterface) { if (_replCoord->getMemberState().primary()) { warning() << "Rollback situation detected in catch-up mode. Aborting catch-up mode."; - _replCoord->abortCatchupIfNeeded().transitional_ignore(); + auto status = _replCoord->abortCatchupIfNeeded( + ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError); + if (!status.isOK()) { + LOG(1) << "Aborting catch-up failed with status: " << status; + } return; } diff --git a/src/mongo/db/repl/repl_set_commands.cpp b/src/mongo/db/repl/repl_set_commands.cpp index a38d510fd5a..aa09a8a55d1 100644 --- a/src/mongo/db/repl/repl_set_commands.cpp +++ b/src/mongo/db/repl/repl_set_commands.cpp @@ -450,16 +450,31 @@ public: "primary.)\n" "http://dochub.mongodb.org/core/replicasetcommands"; } - CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") {} + CmdReplSetStepDown() + : ReplSetCommand("replSetStepDown"), + _stepDownCmdsWithForceExecutedMetric("commands.replSetStepDownWithForce.total", + &_stepDownCmdsWithForceExecuted), + _stepDownCmdsWithForceFailedMetric("commands.replSetStepDownWithForce.failed", + &_stepDownCmdsWithForceFailed) {} virtual bool run(OperationContext* opCtx, const string&, const BSONObj& cmdObj, BSONObjBuilder& result) { + const bool force = cmdObj["force"].trueValue(); + + if (force) { + _stepDownCmdsWithForceExecuted.increment(); + } + + auto onExitGuard = makeGuard([&] { + if (force) { + _stepDownCmdsWithForceFailed.increment(); + } + }); + Status status = ReplicationCoordinator::get(opCtx)->checkReplEnabledForCommand(&result); uassertStatusOK(status); - const bool force = cmdObj["force"].trueValue(); - long long stepDownForSecs = cmdObj.firstElement().numberLong(); if (stepDownForSecs == 0) { stepDownForSecs = 60; @@ -498,10 +513,17 @@ public: ReplicationCoordinator::get(opCtx)->stepDown( opCtx, force, Seconds(secondaryCatchUpPeriodSecs), Seconds(stepDownForSecs)); + + onExitGuard.dismiss(); return true; } private: + mutable Counter64 _stepDownCmdsWithForceExecuted; + mutable Counter64 _stepDownCmdsWithForceFailed; + ServerStatusMetricField<Counter64> _stepDownCmdsWithForceExecutedMetric; + ServerStatusMetricField<Counter64> _stepDownCmdsWithForceFailedMetric; + ActionSet getAuthActionSet() const 
override { return ActionSet{ActionType::replSetStateChange}; } @@ -743,7 +765,9 @@ public: uassertStatusOK(status); log() << "Received replSetAbortPrimaryCatchUp request"; - status = ReplicationCoordinator::get(opCtx)->abortCatchupIfNeeded(); + status = ReplicationCoordinator::get(opCtx)->abortCatchupIfNeeded( + ReplicationCoordinator::PrimaryCatchUpConclusionReason:: + kFailedWithReplSetAbortPrimaryCatchUpCmd); if (!status.isOK()) { log() << "replSetAbortPrimaryCatchUp request failed" << causedBy(status); } diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h index 8b767924eac..2f8bd118552 100644 --- a/src/mongo/db/repl/replication_coordinator.h +++ b/src/mongo/db/repl/replication_coordinator.h @@ -863,10 +863,26 @@ public: virtual ServiceContext* getServiceContext() = 0; + enum PrimaryCatchUpConclusionReason { + kSucceeded, + kAlreadyCaughtUp, + kSkipped, + kTimedOut, + kFailedWithError, + kFailedWithNewTerm, + kFailedWithReplSetAbortPrimaryCatchUpCmd + }; + /** * Abort catchup if the node is in catchup mode. */ - virtual Status abortCatchupIfNeeded() = 0; + virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) = 0; + + /** + * Increment the counter for the number of ops applied during catchup if the node is in catchup + * mode. + */ + virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) = 0; /** * Signals that drop pending collections have been removed from storage. diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index f87f0f0681a..ba98e84b2d0 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -69,6 +69,7 @@ #include "mongo/db/repl/repl_server_parameters_gen.h" #include "mongo/db/repl/repl_settings.h" #include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/repl/replication_metrics.h" #include "mongo/db/repl/replication_process.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/s/balancer/balancer.h" @@ -488,6 +489,9 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC fassert(28665, loadLastOpTimeAndWallTimeResult); auto opTimeToReturn = loadLastOpTimeAndWallTimeResult.getValue().opTime; + auto newTermStartDate = loadLastOpTimeAndWallTimeResult.getValue().wallTime; + ReplicationMetrics::get(opCtx).setNewTermStartDate(newTermStartDate); + _shardingOnTransitionToPrimaryHook(opCtx); _dropAllTempCollections(opCtx); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 4f57c4ccedf..945b9983ea2 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -2110,6 +2110,10 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, lk.lock(); _updateAndLogStatsOnStepDown(&arsd); + + // Clear the node's election candidate metrics since it is no longer primary. 
+ ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics(); + _topCoord->finishUnconditionalStepDown(); onExitGuard.dismiss(); @@ -2374,6 +2378,9 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus( } } + BSONObj electionCandidateMetrics = + ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON(); + stdx::lock_guard<stdx::mutex> lk(_mutex); Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse"); _topCoord->prepareStatusResponse( @@ -2382,6 +2389,7 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus( static_cast<unsigned>(time(0) - serverGlobalParams.started), _getCurrentCommittedSnapshotOpTimeAndWallTime_inlock(), initialSyncProgress, + electionCandidateMetrics, _storage->getLastStableCheckpointTimestampDeprecated(_service), _storage->getLastStableRecoveryTimestamp(_service)}, response, @@ -2687,6 +2695,9 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, lk.lock(); _updateAndLogStatsOnStepDown(&arsd.get()); + + // Clear the node's election candidate metrics since it is no longer primary. + ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics(); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & @@ -2835,7 +2846,6 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l _readWriteAbility->setCanAcceptNonLocalWrites(lk, opCtx, canAcceptWrites); } - const MemberState newState = _topCoord->getMemberState(); if (newState == _memberState) { if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate) { @@ -2867,7 +2877,16 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l // Exit catchup mode if we're in it and enable replication producer and applier on stepdown. if (_memberState.primary()) { if (_catchupState) { - _catchupState->abort_inlock(); + // _pendingTermUpdateDuringStepDown is set before stepping down due to hearing about a + // higher term, so that we can remember the term we heard and update our term as part of + // finishing stepdown. It is then unset toward the end of stepdown, after the function + // we are in is called. Thus we must be stepping down due to seeing a higher term if and + // only if _pendingTermUpdateDuringStepDown is set here. + if (_pendingTermUpdateDuringStepDown) { + _catchupState->abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithNewTerm); + } else { + _catchupState->abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithError); + } } _applierState = ApplierState::Running; _externalState->startProducerIfStopped(); @@ -3011,7 +3030,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() { // No catchup in single node replica set. if (_repl->_rsConfig.getNumMembers() == 1) { - abort_inlock(); + abort_inlock(PrimaryCatchUpConclusionReason::kSkipped); return; } @@ -3020,7 +3039,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() { // When catchUpTimeoutMillis is 0, we skip doing catchup entirely. 
if (catchupTimeout == ReplSetConfig::kCatchUpDisabled) { log() << "Skipping primary catchup since the catchup timeout is 0."; - abort_inlock(); + abort_inlock(PrimaryCatchUpConclusionReason::kSkipped); return; } @@ -3035,7 +3054,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() { return; } log() << "Catchup timed out after becoming primary."; - abort_inlock(); + abort_inlock(PrimaryCatchUpConclusionReason::kTimedOut); }; // Deal with infinity and overflow - no timeout. @@ -3048,15 +3067,20 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() { auto status = _repl->_replExecutor->scheduleWorkAt(timeoutDate, std::move(timeoutCB)); if (!status.isOK()) { log() << "Failed to schedule catchup timeout work."; - abort_inlock(); + abort_inlock(PrimaryCatchUpConclusionReason::kFailedWithError); return; } _timeoutCbh = status.getValue(); + + _numCatchUpOps = 0; } -void ReplicationCoordinatorImpl::CatchupState::abort_inlock() { +void ReplicationCoordinatorImpl::CatchupState::abort_inlock(PrimaryCatchUpConclusionReason reason) { invariant(_repl->_getMemberState_inlock().primary()); + ReplicationMetrics::get(getGlobalServiceContext()) + .incrementNumCatchUpsConcludedForReason(reason); + log() << "Exited primary catch-up mode."; // Clean up its own members. if (_timeoutCbh) { @@ -3084,7 +3108,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() { if (*targetOpTime <= myLastApplied) { log() << "Caught up to the latest optime known via heartbeats after becoming primary. " << "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied; - abort_inlock(); + // Report the number of ops applied during catchup in replSetGetStatus once the primary is + // caught up. + ReplicationMetrics::get(getGlobalServiceContext()).setNumCatchUpOps(_numCatchUpOps); + abort_inlock(PrimaryCatchUpConclusionReason::kAlreadyCaughtUp); return; } @@ -3093,6 +3120,8 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() { return; } + ReplicationMetrics::get(getGlobalServiceContext()).setTargetCatchupOpTime(targetOpTime.get()); + log() << "Heartbeats updated catchup target optime to " << *targetOpTime; log() << "Latest known optime per replica set member:"; auto opTimesPerMember = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestartPerMember(); @@ -3103,29 +3132,49 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() { if (_waiter) { _repl->_opTimeWaiterList.remove_inlock(_waiter.get()); + } else { + // Only increment the 'numCatchUps' election metric the first time we add a waiter, so that + // we only increment it once each time a primary has to catch up. If there is already an + // existing waiter, then the node is catching up and has already been counted. + ReplicationMetrics::get(getGlobalServiceContext()).incrementNumCatchUps(); } + auto targetOpTimeCB = [this, targetOpTime]() { // Double check the target time since stepdown may signal us too. const auto myLastApplied = _repl->_getMyLastAppliedOpTime_inlock(); if (*targetOpTime <= myLastApplied) { log() << "Caught up to the latest known optime successfully after becoming primary. " << "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied; - abort_inlock(); + // Report the number of ops applied during catchup in replSetGetStatus once the primary + // is caught up. 
+ ReplicationMetrics::get(getGlobalServiceContext()).setNumCatchUpOps(_numCatchUpOps); + abort_inlock(PrimaryCatchUpConclusionReason::kSucceeded); } }; _waiter = stdx::make_unique<CallbackWaiter>(*targetOpTime, targetOpTimeCB); _repl->_opTimeWaiterList.add_inlock(_waiter.get()); } -Status ReplicationCoordinatorImpl::abortCatchupIfNeeded() { +void ReplicationCoordinatorImpl::CatchupState::incrementNumCatchUpOps_inlock(int numOps) { + _numCatchUpOps += numOps; +} + +Status ReplicationCoordinatorImpl::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) { stdx::lock_guard<stdx::mutex> lk(_mutex); if (_catchupState) { - _catchupState->abort_inlock(); + _catchupState->abort_inlock(reason); return Status::OK(); } return Status(ErrorCodes::IllegalOperation, "The node is not in catch-up mode."); } +void ReplicationCoordinatorImpl::incrementNumCatchUpOpsIfCatchingUp(int numOps) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + if (_catchupState) { + _catchupState->incrementNumCatchUpOps_inlock(numOps); + } +} + void ReplicationCoordinatorImpl::signalDropPendingCollectionsRemovedFromStorage() { stdx::lock_guard<stdx::mutex> lock(_mutex); _wakeReadyWaiters_inlock(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 24d530fbf4e..57c31f707aa 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -321,7 +321,9 @@ public: virtual Status stepUpIfEligible(bool skipDryRun) override; - virtual Status abortCatchupIfNeeded() override; + virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override; + + virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) override; void signalDropPendingCollectionsRemovedFromStorage() final; @@ -669,9 +671,11 @@ private: // start() can only be called once. void start_inlock(); // Reset the state itself to destruct the state. - void abort_inlock(); + void abort_inlock(PrimaryCatchUpConclusionReason reason); // Heartbeat calls this function to update the target optime. void signalHeartbeatUpdate_inlock(); + // Increment the counter for the number of ops applied during catchup. + void incrementNumCatchUpOps_inlock(int numOps); private: ReplicationCoordinatorImpl* _repl; // Not owned. @@ -680,6 +684,8 @@ private: // Handle to a Waiter that contains the current target optime to reach after which // we can exit catchup mode. std::unique_ptr<CallbackWaiter> _waiter; + // Counter for the number of ops applied during catchup. + int _numCatchUpOps; }; // Inner class to manage the concurrency of _canAcceptNonLocalWrites and _canServeNonLocalReads. diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index 2b7d7bd62f9..ea12a516ba1 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -64,6 +64,10 @@ public: if (_replCoord->_electionFinishedEvent.isValid()) { _replCoord->_replExecutor->signalEvent(_replCoord->_electionFinishedEvent); } + + // Clear the node's election candidate metrics if it loses either the dry-run or actual + // election, since it will not become primary. 
+ ReplicationMetrics::get(getGlobalServiceContext()).clearElectionCandidateMetrics(); } void dismiss() { @@ -141,6 +145,9 @@ void ReplicationCoordinatorImpl::_startElectSelfV1_inlock( long long term = _topCoord->getTerm(); int primaryIndex = -1; + Date_t now = _replExecutor->now(); + ReplicationMetrics::get(getServiceContext()).setElectionCandidateMetrics(now); + if (reason == TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun) { long long newTerm = term + 1; log() << "skipping dry run and running for election in term " << newTerm; diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp index b1b4d4879b5..f54883200ea 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp @@ -48,6 +48,8 @@ #include "mongo/util/fail_point_service.h" #include "mongo/util/log.h" +#include <boost/optional/optional_io.hpp> + namespace mongo { namespace repl { namespace { @@ -221,6 +223,20 @@ TEST_F(ReplCoordTest, ElectionSucceedsWhenNodeIsTheOnlyNode) { getReplCoord()->fillIsMasterForReplSet(&imResponse, {}); ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); + + // Check that only the 'numCatchUpsSkipped' primary catchup conclusion reason was incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtxPtr.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtxPtr.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(ReplCoordTest, ElectionSucceedsWhenAllNodesVoteYea) { @@ -330,6 +346,10 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun) simulateEnoughHeartbeatsForAllNodesUp(); + // Check that the node's election candidate metrics are unset before it becomes primary. + ASSERT_BSONOBJ_EQ( + BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON()); + auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest(); ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen); log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)"; @@ -354,6 +374,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun) << false << "reason" << "don't like him much"))); voteRequests++; + // Check that the node's election candidate metrics are set once it has called an + // election. 
+ ASSERT_BSONOBJ_NE( + BSONObj(), + ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON()); } else { net->blackHole(noi); } @@ -363,6 +388,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun) stopCapturingLogMessages(); ASSERT_EQUALS( 1, countLogLinesContaining("not running for primary, we received insufficient votes")); + + // Check that the node's election candidate metrics have been cleared, since it lost the dry-run + // election and will not become primary. + ASSERT_BSONOBJ_EQ( + BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON()); } TEST_F(ReplCoordTest, ElectionFailsWhenDryRunResponseContainsANewerTerm) { @@ -653,9 +683,17 @@ TEST_F(ReplCoordTest, ElectionFailsWhenVoteRequestResponseContainsANewerTerm) { replCoordSetMyLastDurableOpTime(time1, Date_t() + Seconds(time1.getSecs())); ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + // Check that the node's election candidate metrics are unset before it becomes primary. + ASSERT_BSONOBJ_EQ( + BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON()); + simulateEnoughHeartbeatsForAllNodesUp(); simulateSuccessfulDryRun(); + // Check that the node's election candidate metrics are set once it has called an election. + ASSERT_BSONOBJ_NE( + BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON()); + NetworkInterfaceMock* net = getNet(); net->enterNetwork(); while (net->hasReadyRequests()) { @@ -680,6 +718,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenVoteRequestResponseContainsANewerTerm) { stopCapturingLogMessages(); ASSERT_EQUALS(1, countLogLinesContaining("not becoming primary, we have been superseded already")); + + // Check that the node's election candidate metrics have been cleared, since it lost the actual + // election and will not become primary. + ASSERT_BSONOBJ_EQ( + BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON()); } TEST_F(ReplCoordTest, ElectionFailsWhenTermChangesDuringDryRun) { @@ -2199,6 +2242,25 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was + // incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); + + // Check that the targetCatchupOpTime metric was not set. 
+ ASSERT_EQUALS(boost::none, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); } // Heartbeats set a future target OpTime and we reached that successfully. @@ -2208,11 +2270,23 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) { OpTime time1(Timestamp(100, 1), 0); OpTime time2(Timestamp(100, 2), 0); ReplSetConfig config = setUp3NodeReplSetAndRunForElection(time1); + + // Check that the targetCatchupOpTime metric is unset before the target opTime for catchup is + // set. + ASSERT_EQUALS(boost::none, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); + processHeartbeatRequests([this, time2](const NetworkOpIter noi) { auto net = getNet(); // The old primary accepted one more op and all nodes caught up after voting for me. net->scheduleResponse(noi, net->now(), makeHeartbeatResponse(time2)); }); + + // Check that the targetCatchupOpTime metric was set correctly when heartbeats updated the + // target opTime for catchup. + ASSERT_EQUALS(time2, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); + ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); advanceMyLastAppliedOpTime(time2, Date_t() + Seconds(time2.getSecs())); ASSERT(getReplCoord()->getApplierState() == ApplierState::Draining); @@ -2222,6 +2296,20 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, CatchupTimeout) { @@ -2242,6 +2330,20 @@ TEST_F(PrimaryCatchUpTest, CatchupTimeout) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsTimedOut' primary catchup conclusion reason was incremented. 
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) { @@ -2267,6 +2369,21 @@ TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was + // incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) { @@ -2292,6 +2409,21 @@ TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was + // incremented. 
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) { @@ -2315,6 +2447,21 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) { auto opCtx = makeOperationContext(); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_FALSE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Since the primary stepped down in catchup mode because it saw a higher term, check that only + // the 'numCatchUpsFailedWithNewTerm' primary catchup conclusion reason was incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) { @@ -2329,6 +2476,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) { // Other nodes are ahead of me. getNet()->scheduleResponse(noi, getNet()->now(), makeHeartbeatResponse(time2)); }); + + ASSERT_EQUALS(time2, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); + ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); TopologyCoordinator::UpdateTermResult updateTermResult; auto evh = getReplCoord()->updateTerm_forTest(2, &updateTermResult); @@ -2344,6 +2495,25 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) { auto opCtx = makeOperationContext(); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_FALSE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Since the primary stepped down in catchup mode because it saw a higher term, check that only + // the 'numCatchUpsFailedWithNewTerm' primary catchup conclusion reason was incremented. 
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); + + // Check that the targetCatchupOpTime metric was cleared when the node stepped down. + ASSERT_EQUALS(boost::none, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); } TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) { @@ -2365,6 +2535,21 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) { stopCapturingLogMessages(); ASSERT_EQUALS(1, countLogLinesContaining("Caught up to the latest")); + // Check that the number of elections requiring primary catchup was incremented. + auto opCtx = makeOperationContext(); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); + // Step down during drain mode. TopologyCoordinator::UpdateTermResult updateTermResult; auto evh = replCoord->updateTerm_forTest(2, &updateTermResult); @@ -2383,7 +2568,6 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) { net->scheduleResponse(noi, net->now(), makeHeartbeatResponse(time2)); }); ASSERT(replCoord->getApplierState() == ApplierState::Draining); - auto opCtx = makeOperationContext(); { Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_FALSE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test")); @@ -2392,6 +2576,21 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) { Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT(replCoord->getApplierState() == ApplierState::Stopped); ASSERT_TRUE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented again. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsAlreadyCaughtUp' primary catchup conclusion reason was + // incremented. 
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) { @@ -2420,6 +2619,8 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) { ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); stopCapturingLogMessages(); ASSERT_EQ(1, countLogLinesContaining("Heartbeats updated catchup target optime")); + ASSERT_EQUALS(time3, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); // 3) Advancing its applied optime to time 2 isn't enough. advanceMyLastAppliedOpTime(time2, Date_t() + Seconds(time2.getSecs())); @@ -2440,6 +2641,8 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) { ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); stopCapturingLogMessages(); ASSERT_EQ(1, countLogLinesContaining("Heartbeats updated catchup target optime")); + ASSERT_EQUALS(time4, + ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting()); // 5) Advancing to time 3 isn't enough now. advanceMyLastAppliedOpTime(time3, Date_t() + Seconds(time3.getSecs())); @@ -2455,6 +2658,20 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsSucceeded' primary catchup conclusion reason was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) { @@ -2489,7 +2706,9 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) { ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); // Simulate a user initiated abort. 
- ASSERT_OK(getReplCoord()->abortCatchupIfNeeded()); + ASSERT_OK(getReplCoord()->abortCatchupIfNeeded( + ReplicationCoordinator::PrimaryCatchUpConclusionReason:: + kFailedWithReplSetAbortPrimaryCatchUpCmd)); ASSERT(getReplCoord()->getApplierState() == ApplierState::Draining); stopCapturingLogMessages(); @@ -2500,6 +2719,21 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd' primary catchup + // conclusion reason was incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(1, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } TEST_F(PrimaryCatchUpTest, ZeroTimeout) { @@ -2514,6 +2748,60 @@ TEST_F(PrimaryCatchUpTest, ZeroTimeout) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsSkipped' primary catchup conclusion reason was incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); +} + +TEST_F(PrimaryCatchUpTest, CatchUpFailsDueToPrimaryStepDown) { + startCapturingLogMessages(); + + OpTime time1(Timestamp(100, 1), 0); + OpTime time2(Timestamp(100, 2), 0); + ReplSetConfig config = setUp3NodeReplSetAndRunForElection(time1); + // Step down in the middle of catchup. + auto abortTime = getNet()->now() + config.getCatchUpTimeoutPeriod() / 2; + replyHeartbeatsAndRunUntil(abortTime, [this, time2](const NetworkOpIter noi) { + // Other nodes are ahead of me. 
+ getNet()->scheduleResponse(noi, getNet()->now(), makeHeartbeatResponse(time2)); + }); + ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); + + auto opCtx = makeOperationContext(); + getReplCoord()->stepDown(opCtx.get(), true, Milliseconds(0), Milliseconds(1000)); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + ASSERT(getReplCoord()->getApplierState() == ApplierState::Running); + + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("Exited primary catch-up mode")); + ASSERT_EQUALS(0, countLogLinesContaining("Caught up to the latest")); + ASSERT_EQUALS(0, countLogLinesContaining("Catchup timed out")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); + + // Check that only the 'numCatchUpsFailedWithError' primary catchup conclusion reason was + // incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSucceeded_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsAlreadyCaughtUp_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsSkipped_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsTimedOut_forTesting()); + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithError_forTesting()); + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUpsFailedWithNewTerm_forTesting()); + ASSERT_EQ(0, + ReplicationMetrics::get(opCtx.get()) + .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting()); } } // namespace diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 59276c9093b..309c5e2e708 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -48,6 +48,7 @@ #include "mongo/db/repl/repl_set_heartbeat_args_v1.h" #include "mongo/db/repl/repl_set_heartbeat_response.h" #include "mongo/db/repl/replication_coordinator_impl.h" +#include "mongo/db/repl/replication_metrics.h" #include "mongo/db/repl/replication_process.h" #include "mongo/db/repl/topology_coordinator.h" #include "mongo/db/repl/vote_requester.h" @@ -424,6 +425,10 @@ void ReplicationCoordinatorImpl::_stepDownFinish( lk.lock(); _updateAndLogStatsOnStepDown(&arsd); + + // Clear the node's election candidate metrics since it is no longer primary. + ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); + _topCoord->finishUnconditionalStepDown(); const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get()); @@ -646,6 +651,9 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( lk.lock(); _updateAndLogStatsOnStepDown(&arsd.get()); + + // Clear the node's election candidate metrics since it is no longer primary. 
+ ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp index 32589b56952..cd8bfb9dc3a 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_mock.cpp @@ -529,10 +529,14 @@ void ReplicationCoordinatorMock::alwaysAllowWrites(bool allowWrites) { _alwaysAllowWrites = allowWrites; } -Status ReplicationCoordinatorMock::abortCatchupIfNeeded() { +Status ReplicationCoordinatorMock::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) { return Status::OK(); } +void ReplicationCoordinatorMock::incrementNumCatchUpOpsIfCatchingUp(int numOps) { + return; +} + void ReplicationCoordinatorMock::signalDropPendingCollectionsRemovedFromStorage() {} boost::optional<Timestamp> ReplicationCoordinatorMock::getRecoveryTimestamp() { diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h index 67f7d2cead8..cb3bab9e157 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.h +++ b/src/mongo/db/repl/replication_coordinator_mock.h @@ -302,7 +302,9 @@ public: return _service; } - virtual Status abortCatchupIfNeeded() override; + virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override; + + virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) override; void signalDropPendingCollectionsRemovedFromStorage() final; diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp index 8cdc1405827..b6fb3289f35 100644 --- a/src/mongo/db/repl/replication_metrics.cpp +++ b/src/mongo/db/repl/replication_metrics.cpp @@ -128,6 +128,45 @@ void ReplicationMetrics::incrementNumStepDownsCausedByHigherTerm() { _electionMetrics.getNumStepDownsCausedByHigherTerm() + 1); } +void ReplicationMetrics::incrementNumCatchUps() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1); +} + +void ReplicationMetrics::incrementNumCatchUpsConcludedForReason( + ReplicationCoordinator::PrimaryCatchUpConclusionReason reason) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + switch (reason) { + case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kSucceeded: + _electionMetrics.setNumCatchUpsSucceeded(_electionMetrics.getNumCatchUpsSucceeded() + + 1); + break; + case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kAlreadyCaughtUp: + _electionMetrics.setNumCatchUpsAlreadyCaughtUp( + _electionMetrics.getNumCatchUpsAlreadyCaughtUp() + 1); + break; + case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kSkipped: + _electionMetrics.setNumCatchUpsSkipped(_electionMetrics.getNumCatchUpsSkipped() + 1); + break; + case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kTimedOut: + _electionMetrics.setNumCatchUpsTimedOut(_electionMetrics.getNumCatchUpsTimedOut() + 1); + break; + case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithError: + _electionMetrics.setNumCatchUpsFailedWithError( + _electionMetrics.getNumCatchUpsFailedWithError() + 1); + break; + case ReplicationCoordinator::PrimaryCatchUpConclusionReason::kFailedWithNewTerm: + _electionMetrics.setNumCatchUpsFailedWithNewTerm( + _electionMetrics.getNumCatchUpsFailedWithNewTerm() + 1); + break; + case 
ReplicationCoordinator::PrimaryCatchUpConclusionReason:: + kFailedWithReplSetAbortPrimaryCatchUpCmd: + _electionMetrics.setNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd( + _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd() + 1); + break; + } +} + int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getStepUpCmd().getCalled(); @@ -183,11 +222,93 @@ int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() { return _electionMetrics.getNumStepDownsCausedByHigherTerm(); } +int ReplicationMetrics::getNumCatchUps_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUps(); +} + +int ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsSucceeded(); +} + +int ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsAlreadyCaughtUp(); +} + +int ReplicationMetrics::getNumCatchUpsSkipped_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsSkipped(); +} + +int ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsTimedOut(); +} + +int ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsFailedWithError(); +} + +int ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsFailedWithNewTerm(); +} + +int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd(); +} + +void ReplicationMetrics::setElectionCandidateMetrics(Date_t lastElectionDate) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionCandidateMetrics.setLastElectionDate(lastElectionDate); + _nodeIsCandidateOrPrimary = true; +} + +void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionCandidateMetrics.setTargetCatchupOpTime(opTime); +} + +void ReplicationMetrics::setNumCatchUpOps(int numCatchUpOps) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionCandidateMetrics.setNumCatchUpOps(numCatchUpOps); +} + +void ReplicationMetrics::setNewTermStartDate(Date_t newTermStartDate) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionCandidateMetrics.setNewTermStartDate(newTermStartDate); +} + +boost::optional<OpTime> ReplicationMetrics::getTargetCatchupOpTime_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return _electionCandidateMetrics.getTargetCatchupOpTime(); +} + BSONObj ReplicationMetrics::getElectionMetricsBSON() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.toBSON(); } +BSONObj ReplicationMetrics::getElectionCandidateMetricsBSON() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + if (_nodeIsCandidateOrPrimary) { + return _electionCandidateMetrics.toBSON(); + } + return BSONObj(); +} + +void ReplicationMetrics::clearElectionCandidateMetrics() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionCandidateMetrics.setTargetCatchupOpTime(boost::none); + _electionCandidateMetrics.setNumCatchUpOps(boost::none); + 
_electionCandidateMetrics.setNewTermStartDate(boost::none); + _nodeIsCandidateOrPrimary = false; +} + class ReplicationMetrics::ElectionMetricsSSS : public ServerStatusSection { public: ElectionMetricsSSS() : ServerStatusSection("electionMetrics") {} diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h index 564be3b08dd..a169e8a0da5 100644 --- a/src/mongo/db/repl/replication_metrics.h +++ b/src/mongo/db/repl/replication_metrics.h @@ -48,9 +48,13 @@ public: ReplicationMetrics(); ~ReplicationMetrics(); + // Election metrics void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason); void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason); void incrementNumStepDownsCausedByHigherTerm(); + void incrementNumCatchUps(); + void incrementNumCatchUpsConcludedForReason( + ReplicationCoordinator::PrimaryCatchUpConclusionReason reason); int getNumStepUpCmdsCalled_forTesting(); int getNumPriorityTakeoversCalled_forTesting(); @@ -63,8 +67,30 @@ public: int getNumElectionTimeoutsSuccessful_forTesting(); int getNumFreezeTimeoutsSuccessful_forTesting(); int getNumStepDownsCausedByHigherTerm_forTesting(); + int getNumCatchUps_forTesting(); + int getNumCatchUpsSucceeded_forTesting(); + int getNumCatchUpsAlreadyCaughtUp_forTesting(); + int getNumCatchUpsSkipped_forTesting(); + int getNumCatchUpsTimedOut_forTesting(); + int getNumCatchUpsFailedWithError_forTesting(); + int getNumCatchUpsFailedWithNewTerm_forTesting(); + int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting(); + + // Election candidate metrics + + // All the election candidate metrics that should be set when a node calls an election are set + // in this one function, so that the 'electionCandidateMetrics' section of replSetGetStatus + // shows a consistent state.
+ void setElectionCandidateMetrics(Date_t lastElectionDate); + void setTargetCatchupOpTime(OpTime opTime); + void setNumCatchUpOps(int numCatchUpOps); + void setNewTermStartDate(Date_t newTermStartDate); + + boost::optional<OpTime> getTargetCatchupOpTime_forTesting(); BSONObj getElectionMetricsBSON(); + BSONObj getElectionCandidateMetricsBSON(); + void clearElectionCandidateMetrics(); private: class ElectionMetricsSSS; @@ -73,6 +99,8 @@ private: ElectionMetrics _electionMetrics; ElectionCandidateMetrics _electionCandidateMetrics; ElectionParticipantMetrics _electionParticipantMetrics; + + bool _nodeIsCandidateOrPrimary = false; }; } // namespace repl diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl index a14042cbcf1..c17140b9125 100644 --- a/src/mongo/db/repl/replication_metrics.idl +++ b/src/mongo/db/repl/replication_metrics.idl @@ -34,9 +34,11 @@ global: cpp_namespace: "mongo::repl" cpp_includes: - "mongo/db/repl/election_reason_counter_parser.h" + - "mongo/db/repl/optime.h" imports: - "mongo/idl/basic_types.idl" + - "mongo/db/repl/replication_types.idl" types: ElectionReasonCounter: @@ -76,15 +78,67 @@ structs: description: "Number of times this node stepped down because it saw a higher term" type: long default: 0 - + numCatchUps: + description: "Number of elections that required the primary to catch up because it + was behind" + type: long + default: 0 + numCatchUpsSucceeded: + description: "Number of times primary catchup concluded because the primary caught + up to the latest known optime successfully" + type: long + default: 0 + numCatchUpsAlreadyCaughtUp: + description: "Number of times primary catchup concluded because the primary was + already caught up" + type: long + default: 0 + numCatchUpsSkipped: + description: "Number of times primary catchup concluded because it was skipped" + type: long + default: 0 + numCatchUpsTimedOut: + description: "Number of times primary catchup concluded because it timed out" + type: long + default: 0 + numCatchUpsFailedWithError: + description: "Number of times primary catchup concluded because it failed with an + error" + type: long + default: 0 + numCatchUpsFailedWithNewTerm: + description: "Number of times primary catchup concluded because the primary stepped + down on seeing a higher term" + type: long + default: 0 + numCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd: + description: "Number of times primary catchup concluded because of the + replSetAbortPrimaryCatchUp command" + type: long + default: 0 + ElectionCandidateMetrics: description: "Stores metrics that are specific to the last election in which the node was a candidate" strict: true fields: - priorityAtElection: - description: "The node's priority at the time of the election" - type: double + lastElectionDate: + description: "Time the node called the dry run election, or the actual election if + it skipped dry run" + type: date + targetCatchupOpTime: + description: "The node's target opTime for catchup" + type: optime + optional: true + numCatchUpOps: + description: "Number of ops applied during catchup when the primary successfully + catches up" + type: long + optional: true + newTermStartDate: + description: "Time the new term oplog entry was written" + type: date + optional: true ElectionParticipantMetrics: description: "Stores metrics that are specific to the last election in which the node voted" diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp index 6463e5954f9..7efe5ee10b4 100644 --- 
a/src/mongo/db/repl/sync_tail.cpp +++ b/src/mongo/db/repl/sync_tail.cpp @@ -1459,6 +1459,10 @@ StatusWith<OpTime> SyncTail::multiApply(OperationContext* opCtx, MultiApplier::O } } + // Increment the counter for the number of ops applied during catchup if the node is in catchup + // mode. + replCoord->incrementNumCatchUpOpsIfCatchingUp(ops.size()); + // We have now written all database writes and updated the oplog to match. return ops.back().getOpTime(); } diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index 81056086087..b57480c4836 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -1417,7 +1417,8 @@ const MemberConfig* TopologyCoordinator::_currentPrimaryMember() const { std::string TopologyCoordinator::_getReplSetStatusString() { // Construct a ReplSetStatusArgs using default parameters. Missing parameters will not be // included in the status string. - ReplSetStatusArgs rsStatusArgs{Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), boost::none}; + ReplSetStatusArgs rsStatusArgs{ + Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), BSONObj(), boost::none}; BSONObjBuilder builder; Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse"); prepareStatusResponse(rsStatusArgs, &builder, &result); @@ -1439,6 +1440,7 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu const OpTime lastOpDurable = getMyLastDurableOpTime(); const Date_t lastOpDurableWall = getMyLastDurableOpTimeAndWallTime().wallTime; const BSONObj& initialSyncStatus = rsStatusArgs.initialSyncStatus; + const BSONObj& electionCandidateMetrics = rsStatusArgs.electionCandidateMetrics; const boost::optional<Timestamp>& lastStableRecoveryTimestamp = rsStatusArgs.lastStableRecoveryTimestamp; const boost::optional<Timestamp>& lastStableCheckpointTimestampDeprecated = @@ -1638,6 +1640,10 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu response->append("initialSyncStatus", initialSyncStatus); } + if (!electionCandidateMetrics.isEmpty()) { + response->append("electionCandidateMetrics", electionCandidateMetrics); + } + response->append("members", membersOut); *result = Status::OK(); } diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index 6a6a6af4652..e29453292a4 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -301,6 +301,7 @@ public: const unsigned selfUptime; const OpTimeAndWallTime readConcernMajorityOpTime; const BSONObj initialSyncStatus; + const BSONObj electionCandidateMetrics; // boost::none if the storage engine does not support RTT, or if it does but does not // persist data to necessitate taking checkpoints. 
Timestamp::min() if a checkpoint is yet diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index 633c0220372..388b8e84dc1 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -1538,6 +1538,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { Timestamp lastStableRecoveryTimestamp(2, 2); Timestamp lastStableCheckpointTimestampDeprecated(2, 2); BSONObj initialSyncStatus = BSON("failedInitialSyncAttempts" << 1); + BSONObj electionCandidateMetrics = BSON("DummyElectionMetrics" << 1); std::string setName = "mySet"; ReplSetHeartbeatResponse hb; @@ -1593,6 +1594,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { static_cast<unsigned>(durationCount<Seconds>(uptimeSecs)), {readConcernMajorityOpTime, readConcernMajorityWallTime}, initialSyncStatus, + electionCandidateMetrics, lastStableCheckpointTimestampDeprecated, lastStableRecoveryTimestamp}, &statusBuilder, @@ -1703,6 +1705,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { ASSERT_EQUALS(2000, rsStatus["heartbeatIntervalMillis"].numberInt()); ASSERT_BSONOBJ_EQ(initialSyncStatus, rsStatus["initialSyncStatus"].Obj()); + ASSERT_BSONOBJ_EQ(electionCandidateMetrics, rsStatus["electionCandidateMetrics"].Obj()); // Test no lastStableRecoveryTimestamp field. BSONObjBuilder statusBuilder2; @@ -1711,7 +1714,8 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { curTime, static_cast<unsigned>(durationCount<Seconds>(uptimeSecs)), {readConcernMajorityOpTime, readConcernMajorityWallTime}, - initialSyncStatus}, + initialSyncStatus, + BSONObj()}, &statusBuilder2, &resultStatus); ASSERT_OK(resultStatus); @@ -1720,6 +1724,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { ASSERT_EQUALS(setName, rsStatus["set"].String()); ASSERT_FALSE(rsStatus.hasField("lastStableRecoveryTimestamp")); ASSERT_FALSE(rsStatus.hasField("lastStableCheckpointTimestamp")); + ASSERT_FALSE(rsStatus.hasField("electionCandidateMetrics")); } TEST_F(TopoCoordTest, ReplSetGetStatusIPs) { diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp index ccdf18229aa..b11d0106d77 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.cpp +++ b/src/mongo/embedded/replication_coordinator_embedded.cpp @@ -355,7 +355,11 @@ Status ReplicationCoordinatorEmbedded::processReplSetInitiate(OperationContext*, UASSERT_NOT_IMPLEMENTED; } -Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded() { +Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) { + UASSERT_NOT_IMPLEMENTED; +} + +void ReplicationCoordinatorEmbedded::incrementNumCatchUpOpsIfCatchingUp(int numOps) { UASSERT_NOT_IMPLEMENTED; } diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h index c68a7e8b7af..1246adf7e93 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.h +++ b/src/mongo/embedded/replication_coordinator_embedded.h @@ -249,7 +249,9 @@ public: Status stepUpIfEligible(bool skipDryRun) override; - Status abortCatchupIfNeeded() override; + Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override; + + void incrementNumCatchUpOpsIfCatchingUp(int numOps) override; void signalDropPendingCollectionsRemovedFromStorage() final; |
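Taken together, the server-side changes above follow a single pattern: a mutex-guarded metrics object that keeps one counter per primary catch-up conclusion reason, plus candidate-scoped fields (the target catch-up optime and the number of ops applied during catch-up) that are set while the new primary is catching up and cleared again when the node steps down. The sketch below is a minimal, self-contained illustration of that pattern only; it is not part of the patch, and CatchUpMetrics, OpTime, and the plain integer counters are hypothetical stand-ins for ReplicationMetrics and its IDL-generated ElectionMetrics/ElectionCandidateMetrics types.

// Illustrative sketch (C++17), not MongoDB code: plain ints and std::optional stand in
// for the IDL-generated metrics structs used by the patch.
#include <cstdint>
#include <iostream>
#include <mutex>
#include <optional>

// Stand-in for repl::OpTime: a (timestamp, term) pair.
struct OpTime {
    std::uint64_t ts = 0;
    long long term = -1;
};

enum class PrimaryCatchUpConclusionReason {
    kSucceeded,
    kAlreadyCaughtUp,
    kSkipped,
    kTimedOut,
    kFailedWithError,
    kFailedWithNewTerm,
    kFailedWithReplSetAbortPrimaryCatchUpCmd,
};

class CatchUpMetrics {
public:
    // Bumped once per election whose winner actually had to catch up.
    void incrementNumCatchUps() {
        std::lock_guard<std::mutex> lk(_mutex);
        ++_numCatchUps;
    }

    // Exactly one of these counters is bumped when catch-up concludes.
    void incrementNumCatchUpsConcludedForReason(PrimaryCatchUpConclusionReason reason) {
        using R = PrimaryCatchUpConclusionReason;
        std::lock_guard<std::mutex> lk(_mutex);
        switch (reason) {
            case R::kSucceeded:         ++_succeeded; break;
            case R::kAlreadyCaughtUp:   ++_alreadyCaughtUp; break;
            case R::kSkipped:           ++_skipped; break;
            case R::kTimedOut:          ++_timedOut; break;
            case R::kFailedWithError:   ++_failedWithError; break;
            case R::kFailedWithNewTerm: ++_failedWithNewTerm; break;
            case R::kFailedWithReplSetAbortPrimaryCatchUpCmd: ++_failedWithAbortCmd; break;
        }
    }

    // Candidate-scoped fields: set while the new primary is catching up,
    // cleared again when the node steps down.
    void setTargetCatchupOpTime(OpTime target) {
        std::lock_guard<std::mutex> lk(_mutex);
        _targetCatchupOpTime = target;
    }
    void clearElectionCandidateMetrics() {
        std::lock_guard<std::mutex> lk(_mutex);
        _targetCatchupOpTime.reset();
        _numCatchUpOps.reset();
    }

    std::optional<OpTime> getTargetCatchupOpTime() const {
        std::lock_guard<std::mutex> lk(_mutex);
        return _targetCatchupOpTime;
    }
    long long numCatchUpsSucceeded() const {
        std::lock_guard<std::mutex> lk(_mutex);
        return _succeeded;
    }

private:
    mutable std::mutex _mutex;
    long long _numCatchUps = 0;
    long long _succeeded = 0, _alreadyCaughtUp = 0, _skipped = 0, _timedOut = 0;
    long long _failedWithError = 0, _failedWithNewTerm = 0, _failedWithAbortCmd = 0;
    std::optional<OpTime> _targetCatchupOpTime;
    std::optional<long long> _numCatchUpOps;
};

int main() {
    CatchUpMetrics metrics;
    metrics.incrementNumCatchUps();
    metrics.setTargetCatchupOpTime({100, 1});
    metrics.incrementNumCatchUpsConcludedForReason(PrimaryCatchUpConclusionReason::kSucceeded);
    metrics.clearElectionCandidateMetrics();  // analogous to stepping down
    std::cout << "succeeded=" << metrics.numCatchUpsSucceeded()
              << " targetSet=" << metrics.getTargetCatchupOpTime().has_value() << "\n";
}

Keeping the per-reason counters and the candidate-scoped fields behind the same mutex mirrors the patch's layout in ReplicationMetrics: a serverStatus- or replSetGetStatus-style reader can take one lock and report a consistent snapshot.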