diff options
author | Xuerui Fa <xuerui.fa@mongodb.com> | 2019-09-30 21:44:33 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-09-30 21:44:33 +0000 |
commit | 065bc52eec452e5bedfd7d9d8df569ef4309616d (patch) | |
tree | 624e56ba0a5765a4a752637fabd8865764a37bc0 | |
parent | 28f301324e1809212f6f15ba641cea77f3c8719f (diff) | |
download | mongo-065bc52eec452e5bedfd7d9d8df569ef4309616d.tar.gz |
SERVER-41505 Track average number of catchup ops in serverStatus
(cherry picked from commit c1e1e12913fdcee54c059e023e8e7ecd36f7cb17)
-rw-r--r-- | jstests/replsets/avg_num_catchup_ops.js | 75 | ||||
-rw-r--r-- | jstests/replsets/catchup.js | 63 | ||||
-rw-r--r-- | jstests/replsets/rslib.js | 35 | ||||
-rw-r--r-- | jstests/replsets/step_down_during_draining.js | 18 | ||||
-rw-r--r-- | jstests/replsets/step_down_during_draining2.js | 17 | ||||
-rw-r--r-- | jstests/replsets/step_down_during_draining3.js | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_metrics.cpp | 48 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_metrics.h | 44 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_metrics.idl | 4 | ||||
-rw-r--r-- | src/mongo/shell/replsettest.js | 25 |
10 files changed, 204 insertions, 140 deletions
diff --git a/jstests/replsets/avg_num_catchup_ops.js b/jstests/replsets/avg_num_catchup_ops.js new file mode 100644 index 00000000000..4d3feae0d28 --- /dev/null +++ b/jstests/replsets/avg_num_catchup_ops.js @@ -0,0 +1,75 @@ +/** + * Tests that the election metric for average number of catchup ops is being set correctly. We test + * this by electing a node to be primary twice and forcing it to catch up each time. + */ +(function() { +"use strict"; + +load("jstests/libs/check_log.js"); +load("jstests/libs/write_concern_util.js"); +load("jstests/replsets/libs/election_metrics.js"); +load("jstests/replsets/rslib.js"); + +const name = jsTestName(); +const rst = new ReplSetTest({name: name, nodes: 3, useBridge: true}); + +rst.startSet(); +const confSettings = { + catchUpTimeoutMillis: 4 * 60 * 1000, +}; + +rst.initiateWithHighElectionTimeout(confSettings); +rst.awaitSecondaryNodes(); +rst.awaitReplication(); + +const testNode = rst.getSecondaries()[0]; + +let stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, testNode); +restartServerReplication(stepUpResults.oldSecondaries); +// Block until the primary finishes drain mode. +assert.eq(stepUpResults.newPrimary, rst.getPrimary()); +// Wait until the new primary completes the transition to primary and writes a no-op. +checkLog.contains(stepUpResults.newPrimary, "transition to primary complete"); + +let testNodeReplSetGetStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1})); +let testNodeServerStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); + +// Check that metrics associated with catchup have been set correctly in both replSetGetStatus and +// serverStatus. +assert(testNodeReplSetGetStatus.electionCandidateMetrics.numCatchUpOps, + () => "Response should have an 'numCatchUpOps' field: " + + tojson(testNodeReplSetGetStatus.electionCandidateMetrics)); +// numCatchUpOps should be 4 because the 'foo' collection is implicitly created during the 3 +// inserts, and that's where the additional oplog entry comes from. +assert.eq(testNodeReplSetGetStatus.electionCandidateMetrics.numCatchUpOps, 4); +assert(testNodeServerStatus.electionMetrics.averageCatchUpOps, + () => "Response should have an 'averageCatchUpOps' field: " + + tojson(testNodeServerStatus.electionMetrics)); +assert.eq(testNodeServerStatus.electionMetrics.averageCatchUpOps, 4); + +// Step up another node temporarily. +const tempPrimary = rst.stepUpNoAwaitReplication(rst.getSecondaries()[0]); +assert.eq(tempPrimary, rst.getPrimary()); +rst.awaitReplication(); + +// Step up the testNode and force it to catchup again. +stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, testNode); +restartServerReplication(stepUpResults.oldSecondaries); +assert.eq(stepUpResults.newPrimary, rst.getPrimary()); +checkLog.contains(stepUpResults.newPrimary, "transition to primary complete"); +rst.awaitReplication(); + +testNodeServerStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); +testNodeReplSetGetStatus = + assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1})); + +// numCatchUpOps is now 3 due to the 'foo' collection already being created. +assert.eq(testNodeReplSetGetStatus.electionCandidateMetrics.numCatchUpOps, 3); +assert.eq(testNodeServerStatus.electionMetrics.numCatchUps, 2); +assert.eq(testNodeServerStatus.electionMetrics.averageCatchUpOps, 3.5); + +rst.stopSet(); +})();
\ No newline at end of file diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js index 9d90c51099c..2ae5e09e732 100644 --- a/jstests/replsets/catchup.js +++ b/jstests/replsets/catchup.js @@ -36,16 +36,6 @@ rst.nodes.forEach(function(node) { node.adminCommand(verbosity); }); -function stepUpNode(node) { - assert.soonNoExcept(function() { - assert.commandWorked(node.adminCommand({replSetStepUp: 1})); - rst.awaitNodesAgreeOnPrimary(rst.kDefaultTimeoutMS, rst.nodes, rst.getNodeId(node)); - return node.adminCommand('replSetGetStatus').myState == ReplSetTest.State.PRIMARY; - }, 'failed to step up node ' + node.host, rst.kDefaultTimeoutMS); - - return node; -} - function checkOpInOplog(node, op, count) { node.getDB("admin").getMongo().setSlaveOk(); var oplog = node.getDB("local")['oplog.rs']; @@ -53,36 +43,6 @@ function checkOpInOplog(node, op, count) { assert.eq(oplog.count(op), count, "op: " + tojson(op) + ", oplog: " + tojson(oplogArray)); } -// Stop replication on secondaries, do writes and step up one of the secondaries. -// -// The old primary has extra writes that are not replicated to the other nodes yet, -// but the new primary steps up, getting the vote from the the third node "voter". -function stopReplicationAndEnforceNewPrimaryToCatchUp() { - // Write documents that cannot be replicated to secondaries in time. - var oldSecondaries = rst.getSecondaries(); - var oldPrimary = rst.getPrimary(); - stopServerReplication(oldSecondaries); - for (var i = 0; i < 3; i++) { - assert.writeOK(oldPrimary.getDB("test").foo.insert({x: i})); - } - var latestOpOnOldPrimary = getLatestOp(oldPrimary); - - // New primary wins immediately, but needs to catch up. - var newPrimary = stepUpNode(oldSecondaries[0]); - var latestOpOnNewPrimary = getLatestOp(newPrimary); - // Check this node is not writable. - assert.eq(newPrimary.getDB("test").isMaster().ismaster, false); - - return { - oldSecondaries: oldSecondaries, - oldPrimary: oldPrimary, - newPrimary: newPrimary, - voter: oldSecondaries[1], - latestOpOnOldPrimary: latestOpOnOldPrimary, - latestOpOnNewPrimary: latestOpOnNewPrimary - }; -} - function reconfigElectionAndCatchUpTimeout(electionTimeout, catchupTimeout) { // Reconnect all nodes to make sure reconfig succeeds. rst.nodes.forEach(reconnect); @@ -103,7 +63,7 @@ let initialNewPrimaryStatus = assert.commandWorked(rst.getSecondary().adminCommand({serverStatus: 1})); // Should complete transition to primary immediately. -var newPrimary = stepUpNode(rst.getSecondary()); +var newPrimary = rst.stepUpNoAwaitReplication(rst.getSecondary()); // Should win an election and finish the transition very quickly. assert.eq(newPrimary, rst.getPrimary()); rst.awaitReplication(); @@ -138,7 +98,7 @@ jsTest.log("Case 2: The primary needs to catch up, succeeds in time."); initialNewPrimaryStatus = assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1})); -var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]); // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have // a 'newTermStartDate' field before the transition to primary is complete. @@ -179,21 +139,14 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics, newNewPrimaryStatus.electionMetrics, 'numCatchUpsSucceeded'); -// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has -// 'targetCatchupOpTime' and 'numCatchUpOps' fields once the primary is caught up, and that they -// have the correct values. +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has the +// 'targetCatchupOpTime' field once the primary is caught up, and that it has the correct values. assert(res.electionCandidateMetrics.targetCatchupOpTime, () => "Response should have an 'electionCandidateMetrics.targetCatchupOpTime' field: " + tojson(res.electionCandidateMetrics)); assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.ts, stepUpResults.latestOpOnOldPrimary.ts); assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.t, stepUpResults.latestOpOnOldPrimary.t); -assert(res.electionCandidateMetrics.numCatchUpOps, - () => "Response should have an 'electionCandidateMetrics.numCatchUpOps' field: " + - tojson(res.electionCandidateMetrics)); -// numCatchUpOps should be 4 because the 'foo' collection is implicitly created during the 3 -// inserts, and that's where the additional oplog entry comes from. -assert.eq(res.electionCandidateMetrics.numCatchUpOps, 4); // Wait for all secondaries to catch up rst.awaitReplication(); @@ -206,7 +159,7 @@ jsTest.log("Case 3: The primary needs to catch up, but has to change sync source // longer be blacklisted by the new primary if it happened to be at the beginning of the test. reconfigElectionAndCatchUpTimeout(3 * 60 * 1000, conf.settings.catchUpTimeoutMillis); -stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]); // Disable fail point on the voter. Wait until it catches up with the old primary. restartServerReplication(stepUpResults.voter); @@ -237,7 +190,7 @@ initialNewPrimaryStatus = // Reconfig to make the catchup timeout shorter. reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, 10 * 1000); -stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]); // Wait until the new primary completes the transition to primary and writes a no-op. checkLog.contains(stepUpResults.newPrimary, "Catchup timed out after becoming primary"); restartServerReplication(stepUpResults.newPrimary); @@ -265,7 +218,7 @@ rst.awaitReplication(); jsTest.log("Case 5: The primary needs to catch up with no timeout, then gets aborted."); // Reconfig to make the catchup timeout infinite. reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, -1); -stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]); initialNewPrimaryStatus = assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1})); @@ -298,7 +251,7 @@ jsTest.log("Case 6: The primary needs to catch up with no timeout, but steps dow initialNewPrimaryStatus = assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1})); -var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(); +var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]); // Step-down command should abort catchup. assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetStepDown: 60})); diff --git a/jstests/replsets/rslib.js b/jstests/replsets/rslib.js index 209ed8172e1..4af2e49bbf0 100644 --- a/jstests/replsets/rslib.js +++ b/jstests/replsets/rslib.js @@ -14,6 +14,7 @@ var reInitiateWithoutThrowingOnAbortedMember; var awaitRSClientHosts; var getLastOpTime; var setLogVerbosity; +var stopReplicationAndEnforceNewPrimaryToCatchUp; (function() { "use strict"; @@ -453,4 +454,38 @@ setLogVerbosity = function(nodes, logVerbosity) { assert.commandWorked(node.adminCommand(verbosity)); }); }; + +/** + * Stop replication on secondaries, do writes and step up the node that was passed in. + * + * The old primary has extra writes that are not replicated to the other nodes yet, + * but the new primary steps up, getting the vote from the the third node "voter". + */ +stopReplicationAndEnforceNewPrimaryToCatchUp = function(rst, node) { + // Write documents that cannot be replicated to secondaries in time. + const oldSecondaries = rst.getSecondaries(); + const oldPrimary = rst.getPrimary(); + + stopServerReplication(oldSecondaries); + for (let i = 0; i < 3; i++) { + assert.commandWorked(oldPrimary.getDB("test").foo.insert({x: i})); + } + + const latestOpOnOldPrimary = getLatestOp(oldPrimary); + + // New primary wins immediately, but needs to catch up. + const newPrimary = rst.stepUpNoAwaitReplication(node); + const latestOpOnNewPrimary = getLatestOp(newPrimary); + // Check this node is not writable. + assert.eq(newPrimary.getDB("test").isMaster().ismaster, false); + + return { + oldSecondaries: oldSecondaries, + oldPrimary: oldPrimary, + newPrimary: newPrimary, + voter: oldSecondaries[1], + latestOpOnOldPrimary: latestOpOnOldPrimary, + latestOpOnNewPrimary: latestOpOnNewPrimary + }; +}; }()); diff --git a/jstests/replsets/step_down_during_draining.js b/jstests/replsets/step_down_during_draining.js index 47c8ee2651a..53009c66f36 100644 --- a/jstests/replsets/step_down_during_draining.js +++ b/jstests/replsets/step_down_during_draining.js @@ -48,18 +48,6 @@ function disableFailPoint(node) { assert.commandWorked(node.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'})); } -// Since this test blocks a node in drain mode, we cannot use the ReplSetTest stepUp helper -// that waits for a node to leave drain mode. -function stepUpNode(node) { - assert.soonNoExcept(function() { - assert.commandWorked(node.adminCommand({replSetStepUp: 1})); - // We do not specify a specific primary so that if a different primary gets elected - // due to unfortunate timing we can try again. - replSet.awaitNodesAgreeOnPrimary(); - return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY; - }, 'failed to step up node ' + node.host, replSet.kDefaultTimeoutMS); -} - // Do an initial insert to prevent the secondary from going into recovery var numDocuments = 20; var coll = primary.getDB("foo").foo; @@ -91,7 +79,7 @@ assert.soon( 1000); reconnect(secondary); -stepUpNode(secondary); +replSet.stepUpNoAwaitReplication(secondary); // Secondary doesn't allow writes yet. var res = secondary.getDB("admin").runCommand({"isMaster": 1}); @@ -107,10 +95,10 @@ assert.commandFailedWithCode( // Original primary steps up. reconnect(primary); -stepUpNode(primary); +replSet.stepUpNoAwaitReplication(primary); reconnect(secondary); -stepUpNode(secondary); +replSet.stepUpNoAwaitReplication(secondary); // Disable fail point to allow replication. secondaries.forEach(disableFailPoint); diff --git a/jstests/replsets/step_down_during_draining2.js b/jstests/replsets/step_down_during_draining2.js index 1e97f93865a..25c1597d5c4 100644 --- a/jstests/replsets/step_down_during_draining2.js +++ b/jstests/replsets/step_down_during_draining2.js @@ -51,19 +51,6 @@ function disableFailPoint(node) { assert.commandWorked(node.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'})); } -// Since this test blocks a node in drain mode, we cannot use the ReplSetTest stepUp helper -// that waits for a node to leave drain mode. -function stepUpNode(node) { - jsTest.log("Stepping up: " + node.host); - assert.soonNoExcept(function() { - assert.commandWorked(node.adminCommand({replSetStepUp: 1})); - // We do not specify a specific primary so that if a different primary gets elected - // due to unfortunate timing we can try again. - replSet.awaitNodesAgreeOnPrimary(); - return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY; - }, 'failed to step up node ' + node.host, replSet.kDefaultTimeoutMS); -} - // Do an initial insert to prevent the secondary from going into recovery var numDocuments = 20; var coll = primary.getDB("foo").foo; @@ -95,7 +82,7 @@ assert.soon( 1000); reconnect(secondary); -stepUpNode(secondary); +replSet.stepUpNoAwaitReplication(secondary); // Secondary doesn't allow writes yet. var res = secondary.getDB("admin").runCommand({"isMaster": 1}); @@ -158,7 +145,7 @@ assert(!secondary.adminCommand('ismaster').ismaster); // Now ensure that the node can successfully become primary again. replSet.restart(0); replSet.restart(2); -stepUpNode(secondary); +replSet.stepUpNoAwaitReplication(secondary); assert.soon(function() { return secondary.adminCommand('ismaster').ismaster; diff --git a/jstests/replsets/step_down_during_draining3.js b/jstests/replsets/step_down_during_draining3.js index 98c42955fc6..ac1f2239498 100644 --- a/jstests/replsets/step_down_during_draining3.js +++ b/jstests/replsets/step_down_during_draining3.js @@ -40,19 +40,6 @@ function disableFailPoint(node) { assert.commandWorked(node.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'})); } -// Since this test blocks a node in drain mode, we cannot use the ReplSetTest stepUp helper -// that waits for a node to leave drain mode. -function stepUpNode(node) { - jsTest.log("Stepping up: " + node.host); - assert.soonNoExcept(function() { - assert.commandWorked(node.adminCommand({replSetStepUp: 1})); - // We do not specify a specific primary so that if a different primary gets elected - // due to unfortunate timing we can try again. - replSet.awaitNodesAgreeOnPrimary(); - return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY; - }, 'failed to step up node ' + node.host, replSet.kDefaultTimeoutMS); -} - // Do an initial insert to prevent the secondary from going into recovery var numDocuments = 20; var coll = primary.getDB("foo").foo; @@ -84,7 +71,7 @@ assert.soon( 1000); reconnect(secondary); -stepUpNode(secondary); +replSet.stepUpNoAwaitReplication(secondary); // Secondary doesn't allow writes yet. var res = secondary.getDB("admin").runCommand({"isMaster": 1}); diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp index eae67e976fd..55508674562 100644 --- a/src/mongo/db/repl/replication_metrics.cpp +++ b/src/mongo/db/repl/replication_metrics.cpp @@ -131,6 +131,7 @@ void ReplicationMetrics::incrementNumStepDownsCausedByHigherTerm() { void ReplicationMetrics::incrementNumCatchUps() { stdx::lock_guard<stdx::mutex> lk(_mutex); _electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1); + _updateAverageCatchUpOps(lk); } void ReplicationMetrics::incrementNumCatchUpsConcludedForReason( @@ -167,97 +168,97 @@ void ReplicationMetrics::incrementNumCatchUpsConcludedForReason( } } -int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() { +long ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getStepUpCmd().getCalled(); } -int ReplicationMetrics::getNumPriorityTakeoversCalled_forTesting() { +long ReplicationMetrics::getNumPriorityTakeoversCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getPriorityTakeover().getCalled(); } -int ReplicationMetrics::getNumCatchUpTakeoversCalled_forTesting() { +long ReplicationMetrics::getNumCatchUpTakeoversCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getCatchUpTakeover().getCalled(); } -int ReplicationMetrics::getNumElectionTimeoutsCalled_forTesting() { +long ReplicationMetrics::getNumElectionTimeoutsCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getElectionTimeout().getCalled(); } -int ReplicationMetrics::getNumFreezeTimeoutsCalled_forTesting() { +long ReplicationMetrics::getNumFreezeTimeoutsCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getFreezeTimeout().getCalled(); } -int ReplicationMetrics::getNumStepUpCmdsSuccessful_forTesting() { +long ReplicationMetrics::getNumStepUpCmdsSuccessful_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getStepUpCmd().getSuccessful(); } -int ReplicationMetrics::getNumPriorityTakeoversSuccessful_forTesting() { +long ReplicationMetrics::getNumPriorityTakeoversSuccessful_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getPriorityTakeover().getSuccessful(); } -int ReplicationMetrics::getNumCatchUpTakeoversSuccessful_forTesting() { +long ReplicationMetrics::getNumCatchUpTakeoversSuccessful_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getCatchUpTakeover().getSuccessful(); } -int ReplicationMetrics::getNumElectionTimeoutsSuccessful_forTesting() { +long ReplicationMetrics::getNumElectionTimeoutsSuccessful_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getElectionTimeout().getSuccessful(); } -int ReplicationMetrics::getNumFreezeTimeoutsSuccessful_forTesting() { +long ReplicationMetrics::getNumFreezeTimeoutsSuccessful_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getFreezeTimeout().getSuccessful(); } -int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() { +long ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumStepDownsCausedByHigherTerm(); } -int ReplicationMetrics::getNumCatchUps_forTesting() { +long ReplicationMetrics::getNumCatchUps_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUps(); } -int ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() { +long ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsSucceeded(); } -int ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() { +long ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsAlreadyCaughtUp(); } -int ReplicationMetrics::getNumCatchUpsSkipped_forTesting() { +long ReplicationMetrics::getNumCatchUpsSkipped_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsSkipped(); } -int ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() { +long ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsTimedOut(); } -int ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() { +long ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsFailedWithError(); } -int ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() { +long ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsFailedWithNewTerm(); } -int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() { +long ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd(); } @@ -276,6 +277,8 @@ void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) { void ReplicationMetrics::setNumCatchUpOps(int numCatchUpOps) { stdx::lock_guard<stdx::mutex> lk(_mutex); _electionCandidateMetrics.setNumCatchUpOps(numCatchUpOps); + _totalNumCatchUpOps += numCatchUpOps; + _updateAverageCatchUpOps(lk); } void ReplicationMetrics::setNewTermStartDate(Date_t newTermStartDate) { @@ -315,6 +318,13 @@ void ReplicationMetrics::clearElectionCandidateMetrics() { _nodeIsCandidateOrPrimary = false; } +void ReplicationMetrics::_updateAverageCatchUpOps(WithLock lk) { + long numCatchUps = _electionMetrics.getNumCatchUps(); + if (numCatchUps > 0) { + _electionMetrics.setAverageCatchUpOps(_totalNumCatchUpOps / numCatchUps); + } +} + class ReplicationMetrics::ElectionMetricsSSS : public ServerStatusSection { public: ElectionMetricsSSS() : ServerStatusSection("electionMetrics") {} diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h index 9d868533cfc..816b1fc39bc 100644 --- a/src/mongo/db/repl/replication_metrics.h +++ b/src/mongo/db/repl/replication_metrics.h @@ -56,25 +56,25 @@ public: void incrementNumCatchUpsConcludedForReason( ReplicationCoordinator::PrimaryCatchUpConclusionReason reason); - int getNumStepUpCmdsCalled_forTesting(); - int getNumPriorityTakeoversCalled_forTesting(); - int getNumCatchUpTakeoversCalled_forTesting(); - int getNumElectionTimeoutsCalled_forTesting(); - int getNumFreezeTimeoutsCalled_forTesting(); - int getNumStepUpCmdsSuccessful_forTesting(); - int getNumPriorityTakeoversSuccessful_forTesting(); - int getNumCatchUpTakeoversSuccessful_forTesting(); - int getNumElectionTimeoutsSuccessful_forTesting(); - int getNumFreezeTimeoutsSuccessful_forTesting(); - int getNumStepDownsCausedByHigherTerm_forTesting(); - int getNumCatchUps_forTesting(); - int getNumCatchUpsSucceeded_forTesting(); - int getNumCatchUpsAlreadyCaughtUp_forTesting(); - int getNumCatchUpsSkipped_forTesting(); - int getNumCatchUpsTimedOut_forTesting(); - int getNumCatchUpsFailedWithError_forTesting(); - int getNumCatchUpsFailedWithNewTerm_forTesting(); - int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting(); + long getNumStepUpCmdsCalled_forTesting(); + long getNumPriorityTakeoversCalled_forTesting(); + long getNumCatchUpTakeoversCalled_forTesting(); + long getNumElectionTimeoutsCalled_forTesting(); + long getNumFreezeTimeoutsCalled_forTesting(); + long getNumStepUpCmdsSuccessful_forTesting(); + long getNumPriorityTakeoversSuccessful_forTesting(); + long getNumCatchUpTakeoversSuccessful_forTesting(); + long getNumElectionTimeoutsSuccessful_forTesting(); + long getNumFreezeTimeoutsSuccessful_forTesting(); + long getNumStepDownsCausedByHigherTerm_forTesting(); + long getNumCatchUps_forTesting(); + long getNumCatchUpsSucceeded_forTesting(); + long getNumCatchUpsAlreadyCaughtUp_forTesting(); + long getNumCatchUpsSkipped_forTesting(); + long getNumCatchUpsTimedOut_forTesting(); + long getNumCatchUpsFailedWithError_forTesting(); + long getNumCatchUpsFailedWithNewTerm_forTesting(); + long getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting(); // Election candidate metrics @@ -96,12 +96,18 @@ public: private: class ElectionMetricsSSS; + void _updateAverageCatchUpOps(WithLock lk); + mutable stdx::mutex _mutex; ElectionMetrics _electionMetrics; ElectionCandidateMetrics _electionCandidateMetrics; ElectionParticipantMetrics _electionParticipantMetrics; bool _nodeIsCandidateOrPrimary = false; + + // This field is a double so that the division result in _updateAverageCatchUpOps will be a + // double without any casting. + double _totalNumCatchUpOps = 0.0; }; } // namespace repl diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl index 2613eac59b9..6c5746cef92 100644 --- a/src/mongo/db/repl/replication_metrics.idl +++ b/src/mongo/db/repl/replication_metrics.idl @@ -116,6 +116,10 @@ structs: replSetAbortPrimaryCatchUp command" type: long default: 0 + averageCatchUpOps: + description: "Average number of ops applied during catchup" + type: double + default: 0.0 ElectionCandidateMetrics: description: "Stores metrics that are specific to the last election in which the node was a diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index 0c625f90c99..093a1a62e9f 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -1117,9 +1117,9 @@ var ReplSetTest = function(opts) { * Modifies the election timeout to be 24 hours so that no unplanned elections happen. Then * runs replSetInitiate on the replica set with the new config. */ - this.initiateWithHighElectionTimeout = function(timeout) { + this.initiateWithHighElectionTimeout = function(opts = {}) { let cfg = this.getReplSetConfig(); - cfg.settings = {"electionTimeoutMillis": 24 * 60 * 60 * 1000}; + cfg.settings = Object.assign(opts, {"electionTimeoutMillis": 24 * 60 * 60 * 1000}); this.initiate(cfg); }; @@ -1137,9 +1137,28 @@ var ReplSetTest = function(opts) { return; } + jsTest.log("Stepping up: " + node.host + " in stepUp"); assert.commandWorked(node.adminCommand({replSetStepUp: 1})); this.awaitNodesAgreeOnPrimary(); - assert.eq(this.getPrimary(), node, node.host + " was not primary after stepUp"); + assert.eq(this.getPrimary(), node, 'failed to step up node ' + node.host + ' in stepUp'); + }; + + /** + * Steps up 'node' as primary. + */ + this.stepUpNoAwaitReplication = function(node) { + jsTest.log("Stepping up: " + node.host + " in stepUpNoAwaitReplication"); + assert.soonNoExcept( + function() { + assert.commandWorked(node.adminCommand({replSetStepUp: 1})); + self.awaitNodesAgreeOnPrimary( + self.kDefaultTimeoutMS, self.nodes, self.getNodeId(node)); + return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY; + }, + 'failed to step up node ' + node.host + ' in stepUpNoAwaitReplication', + self.kDefaultTimeoutMS); + + return node; }; /** |