author    Xuerui Fa <xuerui.fa@mongodb.com>    2019-09-30 21:44:33 +0000
committer evergreen <evergreen@mongodb.com>    2019-09-30 21:44:33 +0000
commit    065bc52eec452e5bedfd7d9d8df569ef4309616d (patch)
tree      624e56ba0a5765a4a752637fabd8865764a37bc0
parent    28f301324e1809212f6f15ba641cea77f3c8719f (diff)
download  mongo-065bc52eec452e5bedfd7d9d8df569ef4309616d.tar.gz
SERVER-41505 Track average number of catchup ops in serverStatus
(cherry picked from commit c1e1e12913fdcee54c059e023e8e7ecd36f7cb17)
-rw-r--r--  jstests/replsets/avg_num_catchup_ops.js            75
-rw-r--r--  jstests/replsets/catchup.js                         63
-rw-r--r--  jstests/replsets/rslib.js                           35
-rw-r--r--  jstests/replsets/step_down_during_draining.js       18
-rw-r--r--  jstests/replsets/step_down_during_draining2.js      17
-rw-r--r--  jstests/replsets/step_down_during_draining3.js      15
-rw-r--r--  src/mongo/db/repl/replication_metrics.cpp           48
-rw-r--r--  src/mongo/db/repl/replication_metrics.h             44
-rw-r--r--  src/mongo/db/repl/replication_metrics.idl            4
-rw-r--r--  src/mongo/shell/replsettest.js                       25
10 files changed, 204 insertions(+), 140 deletions(-)
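
For context, this is how the new metric surfaces once the change below is applied; a minimal mongo-shell sketch (not part of the commit), using only field names introduced or tested in this diff:

// Sketch only: read the catchup metrics added/extended by this change.
const serverStatus = db.adminCommand({serverStatus: 1});
// Cumulative counter and the new running average, reported per node since startup.
print("numCatchUps: " + serverStatus.electionMetrics.numCatchUps);
print("averageCatchUpOps: " + serverStatus.electionMetrics.averageCatchUpOps);
// Per-election detail for the last election this node won as a candidate.
const replSetStatus = db.adminCommand({replSetGetStatus: 1});
print("numCatchUpOps: " + replSetStatus.electionCandidateMetrics.numCatchUpOps);
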
diff --git a/jstests/replsets/avg_num_catchup_ops.js b/jstests/replsets/avg_num_catchup_ops.js
new file mode 100644
index 00000000000..4d3feae0d28
--- /dev/null
+++ b/jstests/replsets/avg_num_catchup_ops.js
@@ -0,0 +1,75 @@
+/**
+ * Tests that the election metric for average number of catchup ops is being set correctly. We test
+ * this by electing a node to be primary twice and forcing it to catch up each time.
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/check_log.js");
+load("jstests/libs/write_concern_util.js");
+load("jstests/replsets/libs/election_metrics.js");
+load("jstests/replsets/rslib.js");
+
+const name = jsTestName();
+const rst = new ReplSetTest({name: name, nodes: 3, useBridge: true});
+
+rst.startSet();
+const confSettings = {
+ catchUpTimeoutMillis: 4 * 60 * 1000,
+};
+
+rst.initiateWithHighElectionTimeout(confSettings);
+rst.awaitSecondaryNodes();
+rst.awaitReplication();
+
+const testNode = rst.getSecondaries()[0];
+
+let stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, testNode);
+restartServerReplication(stepUpResults.oldSecondaries);
+// Block until the primary finishes drain mode.
+assert.eq(stepUpResults.newPrimary, rst.getPrimary());
+// Wait until the new primary completes the transition to primary and writes a no-op.
+checkLog.contains(stepUpResults.newPrimary, "transition to primary complete");
+
+let testNodeReplSetGetStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1}));
+let testNodeServerStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+
+// Check that metrics associated with catchup have been set correctly in both replSetGetStatus and
+// serverStatus.
+assert(testNodeReplSetGetStatus.electionCandidateMetrics.numCatchUpOps,
+ () => "Response should have an 'numCatchUpOps' field: " +
+ tojson(testNodeReplSetGetStatus.electionCandidateMetrics));
+// numCatchUpOps should be 4 because the 'foo' collection is implicitly created during the 3
+// inserts, and that's where the additional oplog entry comes from.
+assert.eq(testNodeReplSetGetStatus.electionCandidateMetrics.numCatchUpOps, 4);
+assert(testNodeServerStatus.electionMetrics.averageCatchUpOps,
+ () => "Response should have an 'averageCatchUpOps' field: " +
+ tojson(testNodeServerStatus.electionMetrics));
+assert.eq(testNodeServerStatus.electionMetrics.averageCatchUpOps, 4);
+
+// Step up another node temporarily.
+const tempPrimary = rst.stepUpNoAwaitReplication(rst.getSecondaries()[0]);
+assert.eq(tempPrimary, rst.getPrimary());
+rst.awaitReplication();
+
+// Step up the testNode and force it to catchup again.
+stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, testNode);
+restartServerReplication(stepUpResults.oldSecondaries);
+assert.eq(stepUpResults.newPrimary, rst.getPrimary());
+checkLog.contains(stepUpResults.newPrimary, "transition to primary complete");
+rst.awaitReplication();
+
+testNodeServerStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
+testNodeReplSetGetStatus =
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1}));
+
+// numCatchUpOps is now 3 due to the 'foo' collection already being created.
+assert.eq(testNodeReplSetGetStatus.electionCandidateMetrics.numCatchUpOps, 3);
+assert.eq(testNodeServerStatus.electionMetrics.numCatchUps, 2);
+assert.eq(testNodeServerStatus.electionMetrics.averageCatchUpOps, 3.5);
+
+rst.stopSet();
+})();
\ No newline at end of file
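
The expected values asserted above follow from the shared helper's three inserts; a short illustrative sketch of the arithmetic (not part of the test):

// First catchup: 3 inserts into test.foo plus the implicit collection create = 4 oplog entries.
const firstCatchUpOps = 3 + 1;
// Second catchup: the collection already exists, so only the 3 inserts need to be caught up.
const secondCatchUpOps = 3;
// averageCatchUpOps is the mean over both catchups: (4 + 3) / 2 = 3.5.
assert.eq((firstCatchUpOps + secondCatchUpOps) / 2, 3.5);
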
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 9d90c51099c..2ae5e09e732 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -36,16 +36,6 @@ rst.nodes.forEach(function(node) {
node.adminCommand(verbosity);
});
-function stepUpNode(node) {
- assert.soonNoExcept(function() {
- assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
- rst.awaitNodesAgreeOnPrimary(rst.kDefaultTimeoutMS, rst.nodes, rst.getNodeId(node));
- return node.adminCommand('replSetGetStatus').myState == ReplSetTest.State.PRIMARY;
- }, 'failed to step up node ' + node.host, rst.kDefaultTimeoutMS);
-
- return node;
-}
-
function checkOpInOplog(node, op, count) {
node.getDB("admin").getMongo().setSlaveOk();
var oplog = node.getDB("local")['oplog.rs'];
@@ -53,36 +43,6 @@ function checkOpInOplog(node, op, count) {
assert.eq(oplog.count(op), count, "op: " + tojson(op) + ", oplog: " + tojson(oplogArray));
}
-// Stop replication on secondaries, do writes and step up one of the secondaries.
-//
-// The old primary has extra writes that are not replicated to the other nodes yet,
-// but the new primary steps up, getting the vote from the the third node "voter".
-function stopReplicationAndEnforceNewPrimaryToCatchUp() {
- // Write documents that cannot be replicated to secondaries in time.
- var oldSecondaries = rst.getSecondaries();
- var oldPrimary = rst.getPrimary();
- stopServerReplication(oldSecondaries);
- for (var i = 0; i < 3; i++) {
- assert.writeOK(oldPrimary.getDB("test").foo.insert({x: i}));
- }
- var latestOpOnOldPrimary = getLatestOp(oldPrimary);
-
- // New primary wins immediately, but needs to catch up.
- var newPrimary = stepUpNode(oldSecondaries[0]);
- var latestOpOnNewPrimary = getLatestOp(newPrimary);
- // Check this node is not writable.
- assert.eq(newPrimary.getDB("test").isMaster().ismaster, false);
-
- return {
- oldSecondaries: oldSecondaries,
- oldPrimary: oldPrimary,
- newPrimary: newPrimary,
- voter: oldSecondaries[1],
- latestOpOnOldPrimary: latestOpOnOldPrimary,
- latestOpOnNewPrimary: latestOpOnNewPrimary
- };
-}
-
function reconfigElectionAndCatchUpTimeout(electionTimeout, catchupTimeout) {
// Reconnect all nodes to make sure reconfig succeeds.
rst.nodes.forEach(reconnect);
@@ -103,7 +63,7 @@ let initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondary().adminCommand({serverStatus: 1}));
// Should complete transition to primary immediately.
-var newPrimary = stepUpNode(rst.getSecondary());
+var newPrimary = rst.stepUpNoAwaitReplication(rst.getSecondary());
// Should win an election and finish the transition very quickly.
assert.eq(newPrimary, rst.getPrimary());
rst.awaitReplication();
@@ -138,7 +98,7 @@ jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
-var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]);
// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have
// a 'newTermStartDate' field before the transition to primary is complete.
@@ -179,21 +139,14 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
newNewPrimaryStatus.electionMetrics,
'numCatchUpsSucceeded');
-// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has
-// 'targetCatchupOpTime' and 'numCatchUpOps' fields once the primary is caught up, and that they
-// have the correct values.
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has the
+// 'targetCatchupOpTime' field once the primary is caught up, and that it has the correct values.
assert(res.electionCandidateMetrics.targetCatchupOpTime,
() => "Response should have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
tojson(res.electionCandidateMetrics));
assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.ts,
stepUpResults.latestOpOnOldPrimary.ts);
assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.t, stepUpResults.latestOpOnOldPrimary.t);
-assert(res.electionCandidateMetrics.numCatchUpOps,
- () => "Response should have an 'electionCandidateMetrics.numCatchUpOps' field: " +
- tojson(res.electionCandidateMetrics));
-// numCatchUpOps should be 4 because the 'foo' collection is implicitly created during the 3
-// inserts, and that's where the additional oplog entry comes from.
-assert.eq(res.electionCandidateMetrics.numCatchUpOps, 4);
// Wait for all secondaries to catch up
rst.awaitReplication();
@@ -206,7 +159,7 @@ jsTest.log("Case 3: The primary needs to catch up, but has to change sync source
// longer be blacklisted by the new primary if it happened to be at the beginning of the test.
reconfigElectionAndCatchUpTimeout(3 * 60 * 1000, conf.settings.catchUpTimeoutMillis);
-stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]);
// Disable fail point on the voter. Wait until it catches up with the old primary.
restartServerReplication(stepUpResults.voter);
@@ -237,7 +190,7 @@ initialNewPrimaryStatus =
// Reconfig to make the catchup timeout shorter.
reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, 10 * 1000);
-stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]);
// Wait until the new primary completes the transition to primary and writes a no-op.
checkLog.contains(stepUpResults.newPrimary, "Catchup timed out after becoming primary");
restartServerReplication(stepUpResults.newPrimary);
@@ -265,7 +218,7 @@ rst.awaitReplication();
jsTest.log("Case 5: The primary needs to catch up with no timeout, then gets aborted.");
// Reconfig to make the catchup timeout infinite.
reconfigElectionAndCatchUpTimeout(conf.settings.electionTimeoutMillis, -1);
-stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]);
initialNewPrimaryStatus =
assert.commandWorked(stepUpResults.newPrimary.adminCommand({serverStatus: 1}));
@@ -298,7 +251,7 @@ jsTest.log("Case 6: The primary needs to catch up with no timeout, but steps dow
initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
-var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp();
+var stepUpResults = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, rst.getSecondaries()[0]);
// Step-down command should abort catchup.
assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetStepDown: 60}));
diff --git a/jstests/replsets/rslib.js b/jstests/replsets/rslib.js
index 209ed8172e1..4af2e49bbf0 100644
--- a/jstests/replsets/rslib.js
+++ b/jstests/replsets/rslib.js
@@ -14,6 +14,7 @@ var reInitiateWithoutThrowingOnAbortedMember;
var awaitRSClientHosts;
var getLastOpTime;
var setLogVerbosity;
+var stopReplicationAndEnforceNewPrimaryToCatchUp;
(function() {
"use strict";
@@ -453,4 +454,38 @@ setLogVerbosity = function(nodes, logVerbosity) {
assert.commandWorked(node.adminCommand(verbosity));
});
};
+
+/**
+ * Stop replication on secondaries, do writes and step up the node that was passed in.
+ *
+ * The old primary has extra writes that are not replicated to the other nodes yet,
+ * but the new primary steps up, getting the vote from the third node "voter".
+ */
+stopReplicationAndEnforceNewPrimaryToCatchUp = function(rst, node) {
+ // Write documents that cannot be replicated to secondaries in time.
+ const oldSecondaries = rst.getSecondaries();
+ const oldPrimary = rst.getPrimary();
+
+ stopServerReplication(oldSecondaries);
+ for (let i = 0; i < 3; i++) {
+ assert.commandWorked(oldPrimary.getDB("test").foo.insert({x: i}));
+ }
+
+ const latestOpOnOldPrimary = getLatestOp(oldPrimary);
+
+ // New primary wins immediately, but needs to catch up.
+ const newPrimary = rst.stepUpNoAwaitReplication(node);
+ const latestOpOnNewPrimary = getLatestOp(newPrimary);
+ // Check this node is not writable.
+ assert.eq(newPrimary.getDB("test").isMaster().ismaster, false);
+
+ return {
+ oldSecondaries: oldSecondaries,
+ oldPrimary: oldPrimary,
+ newPrimary: newPrimary,
+ voter: oldSecondaries[1],
+ latestOpOnOldPrimary: latestOpOnOldPrimary,
+ latestOpOnNewPrimary: latestOpOnNewPrimary
+ };
+};
}());
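
A typical caller of this shared helper, mirroring the pattern used by avg_num_catchup_ops.js and catchup.js in this patch (sketch only; assumes an already-initiated ReplSetTest named rst and the usual library loads):

load("jstests/libs/write_concern_util.js");  // provides restartServerReplication
load("jstests/replsets/rslib.js");           // provides stopReplicationAndEnforceNewPrimaryToCatchUp

const testNode = rst.getSecondaries()[0];
const results = stopReplicationAndEnforceNewPrimaryToCatchUp(rst, testNode);
// Resume replication on the lagging secondaries so the new primary can finish catchup.
restartServerReplication(results.oldSecondaries);
assert.eq(results.newPrimary, rst.getPrimary());
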
diff --git a/jstests/replsets/step_down_during_draining.js b/jstests/replsets/step_down_during_draining.js
index 47c8ee2651a..53009c66f36 100644
--- a/jstests/replsets/step_down_during_draining.js
+++ b/jstests/replsets/step_down_during_draining.js
@@ -48,18 +48,6 @@ function disableFailPoint(node) {
assert.commandWorked(node.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}));
}
-// Since this test blocks a node in drain mode, we cannot use the ReplSetTest stepUp helper
-// that waits for a node to leave drain mode.
-function stepUpNode(node) {
- assert.soonNoExcept(function() {
- assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
- // We do not specify a specific primary so that if a different primary gets elected
- // due to unfortunate timing we can try again.
- replSet.awaitNodesAgreeOnPrimary();
- return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY;
- }, 'failed to step up node ' + node.host, replSet.kDefaultTimeoutMS);
-}
-
// Do an initial insert to prevent the secondary from going into recovery
var numDocuments = 20;
var coll = primary.getDB("foo").foo;
@@ -91,7 +79,7 @@ assert.soon(
1000);
reconnect(secondary);
-stepUpNode(secondary);
+replSet.stepUpNoAwaitReplication(secondary);
// Secondary doesn't allow writes yet.
var res = secondary.getDB("admin").runCommand({"isMaster": 1});
@@ -107,10 +95,10 @@ assert.commandFailedWithCode(
// Original primary steps up.
reconnect(primary);
-stepUpNode(primary);
+replSet.stepUpNoAwaitReplication(primary);
reconnect(secondary);
-stepUpNode(secondary);
+replSet.stepUpNoAwaitReplication(secondary);
// Disable fail point to allow replication.
secondaries.forEach(disableFailPoint);
diff --git a/jstests/replsets/step_down_during_draining2.js b/jstests/replsets/step_down_during_draining2.js
index 1e97f93865a..25c1597d5c4 100644
--- a/jstests/replsets/step_down_during_draining2.js
+++ b/jstests/replsets/step_down_during_draining2.js
@@ -51,19 +51,6 @@ function disableFailPoint(node) {
assert.commandWorked(node.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}));
}
-// Since this test blocks a node in drain mode, we cannot use the ReplSetTest stepUp helper
-// that waits for a node to leave drain mode.
-function stepUpNode(node) {
- jsTest.log("Stepping up: " + node.host);
- assert.soonNoExcept(function() {
- assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
- // We do not specify a specific primary so that if a different primary gets elected
- // due to unfortunate timing we can try again.
- replSet.awaitNodesAgreeOnPrimary();
- return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY;
- }, 'failed to step up node ' + node.host, replSet.kDefaultTimeoutMS);
-}
-
// Do an initial insert to prevent the secondary from going into recovery
var numDocuments = 20;
var coll = primary.getDB("foo").foo;
@@ -95,7 +82,7 @@ assert.soon(
1000);
reconnect(secondary);
-stepUpNode(secondary);
+replSet.stepUpNoAwaitReplication(secondary);
// Secondary doesn't allow writes yet.
var res = secondary.getDB("admin").runCommand({"isMaster": 1});
@@ -158,7 +145,7 @@ assert(!secondary.adminCommand('ismaster').ismaster);
// Now ensure that the node can successfully become primary again.
replSet.restart(0);
replSet.restart(2);
-stepUpNode(secondary);
+replSet.stepUpNoAwaitReplication(secondary);
assert.soon(function() {
return secondary.adminCommand('ismaster').ismaster;
diff --git a/jstests/replsets/step_down_during_draining3.js b/jstests/replsets/step_down_during_draining3.js
index 98c42955fc6..ac1f2239498 100644
--- a/jstests/replsets/step_down_during_draining3.js
+++ b/jstests/replsets/step_down_during_draining3.js
@@ -40,19 +40,6 @@ function disableFailPoint(node) {
assert.commandWorked(node.adminCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}));
}
-// Since this test blocks a node in drain mode, we cannot use the ReplSetTest stepUp helper
-// that waits for a node to leave drain mode.
-function stepUpNode(node) {
- jsTest.log("Stepping up: " + node.host);
- assert.soonNoExcept(function() {
- assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
- // We do not specify a specific primary so that if a different primary gets elected
- // due to unfortunate timing we can try again.
- replSet.awaitNodesAgreeOnPrimary();
- return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY;
- }, 'failed to step up node ' + node.host, replSet.kDefaultTimeoutMS);
-}
-
// Do an initial insert to prevent the secondary from going into recovery
var numDocuments = 20;
var coll = primary.getDB("foo").foo;
@@ -84,7 +71,7 @@ assert.soon(
1000);
reconnect(secondary);
-stepUpNode(secondary);
+replSet.stepUpNoAwaitReplication(secondary);
// Secondary doesn't allow writes yet.
var res = secondary.getDB("admin").runCommand({"isMaster": 1});
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index eae67e976fd..55508674562 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -131,6 +131,7 @@ void ReplicationMetrics::incrementNumStepDownsCausedByHigherTerm() {
void ReplicationMetrics::incrementNumCatchUps() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
_electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1);
+ _updateAverageCatchUpOps(lk);
}
void ReplicationMetrics::incrementNumCatchUpsConcludedForReason(
@@ -167,97 +168,97 @@ void ReplicationMetrics::incrementNumCatchUpsConcludedForReason(
}
}
-int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() {
+long ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getStepUpCmd().getCalled();
}
-int ReplicationMetrics::getNumPriorityTakeoversCalled_forTesting() {
+long ReplicationMetrics::getNumPriorityTakeoversCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getPriorityTakeover().getCalled();
}
-int ReplicationMetrics::getNumCatchUpTakeoversCalled_forTesting() {
+long ReplicationMetrics::getNumCatchUpTakeoversCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getCatchUpTakeover().getCalled();
}
-int ReplicationMetrics::getNumElectionTimeoutsCalled_forTesting() {
+long ReplicationMetrics::getNumElectionTimeoutsCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getElectionTimeout().getCalled();
}
-int ReplicationMetrics::getNumFreezeTimeoutsCalled_forTesting() {
+long ReplicationMetrics::getNumFreezeTimeoutsCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getFreezeTimeout().getCalled();
}
-int ReplicationMetrics::getNumStepUpCmdsSuccessful_forTesting() {
+long ReplicationMetrics::getNumStepUpCmdsSuccessful_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getStepUpCmd().getSuccessful();
}
-int ReplicationMetrics::getNumPriorityTakeoversSuccessful_forTesting() {
+long ReplicationMetrics::getNumPriorityTakeoversSuccessful_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getPriorityTakeover().getSuccessful();
}
-int ReplicationMetrics::getNumCatchUpTakeoversSuccessful_forTesting() {
+long ReplicationMetrics::getNumCatchUpTakeoversSuccessful_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getCatchUpTakeover().getSuccessful();
}
-int ReplicationMetrics::getNumElectionTimeoutsSuccessful_forTesting() {
+long ReplicationMetrics::getNumElectionTimeoutsSuccessful_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getElectionTimeout().getSuccessful();
}
-int ReplicationMetrics::getNumFreezeTimeoutsSuccessful_forTesting() {
+long ReplicationMetrics::getNumFreezeTimeoutsSuccessful_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getFreezeTimeout().getSuccessful();
}
-int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() {
+long ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumStepDownsCausedByHigherTerm();
}
-int ReplicationMetrics::getNumCatchUps_forTesting() {
+long ReplicationMetrics::getNumCatchUps_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUps();
}
-int ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() {
+long ReplicationMetrics::getNumCatchUpsSucceeded_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsSucceeded();
}
-int ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() {
+long ReplicationMetrics::getNumCatchUpsAlreadyCaughtUp_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsAlreadyCaughtUp();
}
-int ReplicationMetrics::getNumCatchUpsSkipped_forTesting() {
+long ReplicationMetrics::getNumCatchUpsSkipped_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsSkipped();
}
-int ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() {
+long ReplicationMetrics::getNumCatchUpsTimedOut_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsTimedOut();
}
-int ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() {
+long ReplicationMetrics::getNumCatchUpsFailedWithError_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsFailedWithError();
}
-int ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() {
+long ReplicationMetrics::getNumCatchUpsFailedWithNewTerm_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsFailedWithNewTerm();
}
-int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() {
+long ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd();
}
@@ -276,6 +277,8 @@ void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) {
void ReplicationMetrics::setNumCatchUpOps(int numCatchUpOps) {
stdx::lock_guard<stdx::mutex> lk(_mutex);
_electionCandidateMetrics.setNumCatchUpOps(numCatchUpOps);
+ _totalNumCatchUpOps += numCatchUpOps;
+ _updateAverageCatchUpOps(lk);
}
void ReplicationMetrics::setNewTermStartDate(Date_t newTermStartDate) {
@@ -315,6 +318,13 @@ void ReplicationMetrics::clearElectionCandidateMetrics() {
_nodeIsCandidateOrPrimary = false;
}
+void ReplicationMetrics::_updateAverageCatchUpOps(WithLock lk) {
+ long numCatchUps = _electionMetrics.getNumCatchUps();
+ if (numCatchUps > 0) {
+ _electionMetrics.setAverageCatchUpOps(_totalNumCatchUpOps / numCatchUps);
+ }
+}
+
class ReplicationMetrics::ElectionMetricsSSS : public ServerStatusSection {
public:
ElectionMetricsSSS() : ServerStatusSection("electionMetrics") {}
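
The new _updateAverageCatchUpOps keeps a running mean: the accumulated catchup op total divided by the number of catchups. A simplified shell-level mirror of that bookkeeping (illustrative only; in the server these updates are split across incrementNumCatchUps and setNumCatchUpOps):

// Illustrative mirror of the averaging logic; not real server code.
let totalNumCatchUpOps = 0;  // kept as a floating-point total, like the double member in C++
let numCatchUps = 0;

function recordCatchUp(numCatchUpOps) {
    numCatchUps += 1;
    totalNumCatchUpOps += numCatchUpOps;
    return totalNumCatchUpOps / numCatchUps;  // averageCatchUpOps
}

recordCatchUp(4);                  // first catchup: 4 ops
assert.eq(recordCatchUp(3), 3.5);  // second catchup: 3 ops, average (4 + 3) / 2
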
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index 9d868533cfc..816b1fc39bc 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -56,25 +56,25 @@ public:
void incrementNumCatchUpsConcludedForReason(
ReplicationCoordinator::PrimaryCatchUpConclusionReason reason);
- int getNumStepUpCmdsCalled_forTesting();
- int getNumPriorityTakeoversCalled_forTesting();
- int getNumCatchUpTakeoversCalled_forTesting();
- int getNumElectionTimeoutsCalled_forTesting();
- int getNumFreezeTimeoutsCalled_forTesting();
- int getNumStepUpCmdsSuccessful_forTesting();
- int getNumPriorityTakeoversSuccessful_forTesting();
- int getNumCatchUpTakeoversSuccessful_forTesting();
- int getNumElectionTimeoutsSuccessful_forTesting();
- int getNumFreezeTimeoutsSuccessful_forTesting();
- int getNumStepDownsCausedByHigherTerm_forTesting();
- int getNumCatchUps_forTesting();
- int getNumCatchUpsSucceeded_forTesting();
- int getNumCatchUpsAlreadyCaughtUp_forTesting();
- int getNumCatchUpsSkipped_forTesting();
- int getNumCatchUpsTimedOut_forTesting();
- int getNumCatchUpsFailedWithError_forTesting();
- int getNumCatchUpsFailedWithNewTerm_forTesting();
- int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting();
+ long getNumStepUpCmdsCalled_forTesting();
+ long getNumPriorityTakeoversCalled_forTesting();
+ long getNumCatchUpTakeoversCalled_forTesting();
+ long getNumElectionTimeoutsCalled_forTesting();
+ long getNumFreezeTimeoutsCalled_forTesting();
+ long getNumStepUpCmdsSuccessful_forTesting();
+ long getNumPriorityTakeoversSuccessful_forTesting();
+ long getNumCatchUpTakeoversSuccessful_forTesting();
+ long getNumElectionTimeoutsSuccessful_forTesting();
+ long getNumFreezeTimeoutsSuccessful_forTesting();
+ long getNumStepDownsCausedByHigherTerm_forTesting();
+ long getNumCatchUps_forTesting();
+ long getNumCatchUpsSucceeded_forTesting();
+ long getNumCatchUpsAlreadyCaughtUp_forTesting();
+ long getNumCatchUpsSkipped_forTesting();
+ long getNumCatchUpsTimedOut_forTesting();
+ long getNumCatchUpsFailedWithError_forTesting();
+ long getNumCatchUpsFailedWithNewTerm_forTesting();
+ long getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting();
// Election candidate metrics
@@ -96,12 +96,18 @@ public:
private:
class ElectionMetricsSSS;
+ void _updateAverageCatchUpOps(WithLock lk);
+
mutable stdx::mutex _mutex;
ElectionMetrics _electionMetrics;
ElectionCandidateMetrics _electionCandidateMetrics;
ElectionParticipantMetrics _electionParticipantMetrics;
bool _nodeIsCandidateOrPrimary = false;
+
+ // This field is a double so that the division result in _updateAverageCatchUpOps will be a
+ // double without any casting.
+ double _totalNumCatchUpOps = 0.0;
};
} // namespace repl
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index 2613eac59b9..6c5746cef92 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -116,6 +116,10 @@ structs:
replSetAbortPrimaryCatchUp command"
type: long
default: 0
+ averageCatchUpOps:
+ description: "Average number of ops applied during catchup"
+ type: double
+ default: 0.0
ElectionCandidateMetrics:
description: "Stores metrics that are specific to the last election in which the node was a
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index 0c625f90c99..093a1a62e9f 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -1117,9 +1117,9 @@ var ReplSetTest = function(opts) {
* Modifies the election timeout to be 24 hours so that no unplanned elections happen. Then
* runs replSetInitiate on the replica set with the new config.
*/
- this.initiateWithHighElectionTimeout = function(timeout) {
+ this.initiateWithHighElectionTimeout = function(opts = {}) {
let cfg = this.getReplSetConfig();
- cfg.settings = {"electionTimeoutMillis": 24 * 60 * 60 * 1000};
+ cfg.settings = Object.assign(opts, {"electionTimeoutMillis": 24 * 60 * 60 * 1000});
this.initiate(cfg);
};
@@ -1137,9 +1137,28 @@ var ReplSetTest = function(opts) {
return;
}
+ jsTest.log("Stepping up: " + node.host + " in stepUp");
assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
this.awaitNodesAgreeOnPrimary();
- assert.eq(this.getPrimary(), node, node.host + " was not primary after stepUp");
+ assert.eq(this.getPrimary(), node, 'failed to step up node ' + node.host + ' in stepUp');
+ };
+
+ /**
+ * Steps up 'node' as primary.
+ */
+ this.stepUpNoAwaitReplication = function(node) {
+ jsTest.log("Stepping up: " + node.host + " in stepUpNoAwaitReplication");
+ assert.soonNoExcept(
+ function() {
+ assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
+ self.awaitNodesAgreeOnPrimary(
+ self.kDefaultTimeoutMS, self.nodes, self.getNodeId(node));
+ return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY;
+ },
+ 'failed to step up node ' + node.host + ' in stepUpNoAwaitReplication',
+ self.kDefaultTimeoutMS);
+
+ return node;
};
/**
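
A sketch of how a test can combine the two ReplSetTest additions in this file (illustrative only; assumes a fresh 3-node set):

const rst = new ReplSetTest({nodes: 3});
rst.startSet();
// Extra replica-set settings are merged into the 24-hour-election-timeout config.
rst.initiateWithHighElectionTimeout({catchUpTimeoutMillis: 4 * 60 * 1000});
rst.awaitSecondaryNodes();

// Step up a secondary without waiting for it to leave drain mode, which is what tests
// that deliberately block drain mode (the step_down_during_draining*.js suite) need.
const newPrimary = rst.stepUpNoAwaitReplication(rst.getSecondaries()[0]);
assert.eq(newPrimary, rst.getPrimary());
rst.stopSet();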