summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamyukta Lanka <samy.lanka@mongodb.com>2019-10-16 20:54:38 +0000
committerevergreen <evergreen@mongodb.com>2019-10-16 20:54:38 +0000
commit9a9b82e95a88c5ce25c958690c2d3365bc62bacc (patch)
treeb7d6ce480dccf68933eaba2340074ba2cf7bbceb
parentc9349a22f68fac52f6056fb08ea3ce0993dd8cbe (diff)
downloadmongo-9a9b82e95a88c5ce25c958690c2d3365bc62bacc.tar.gz
SERVER-43239 Fixed bug causing numCatchUpOps in repSetGetStatus to be incorrect
(cherry picked from commit 71e4779b0da9e8d58dbb179c49b1a86c5e48c93d) SERVER-41512 Added tracking for metrics around a node voting in an election (cherry picked from commit 7538504cb584720c2cbbc6d44ea62d0743b41fcf) SERVER-41513 Track the time the new term oplog entry was written by primary and applied in secondary (cherry picked from commit efde009845f32d8de2d094088628e67608bfa419)
-rw-r--r--jstests/replsets/election_candidate_and_participant_metrics.js116
-rw-r--r--jstests/replsets/election_participant_new_term_metrics.js109
-rw-r--r--src/mongo/db/pipeline/document_source_change_stream_test.cpp8
-rw-r--r--src/mongo/db/repl/SConscript1
-rw-r--r--src/mongo/db/repl/replication_coordinator.h12
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_impl.cpp6
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp54
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.h6
-rw-r--r--src/mongo/db/repl/replication_coordinator_mock.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator_mock.h2
-rw-r--r--src/mongo/db/repl/replication_coordinator_noop.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator_noop.h2
-rw-r--r--src/mongo/db/repl/replication_metrics.cpp51
-rw-r--r--src/mongo/db/repl/replication_metrics.h26
-rw-r--r--src/mongo/db/repl/replication_metrics.idl34
-rw-r--r--src/mongo/db/repl/sync_tail.cpp13
-rw-r--r--src/mongo/db/repl/sync_tail_test.cpp26
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp32
-rw-r--r--src/mongo/db/repl/topology_coordinator.h1
-rw-r--r--src/mongo/db/repl/topology_coordinator_v1_test.cpp6
-rw-r--r--src/mongo/embedded/replication_coordinator_embedded.cpp2
-rw-r--r--src/mongo/embedded/replication_coordinator_embedded.h2
22 files changed, 437 insertions, 76 deletions
diff --git a/jstests/replsets/election_candidate_and_participant_metrics.js b/jstests/replsets/election_candidate_and_participant_metrics.js
index 7e19f1a06f1..029db2fc883 100644
--- a/jstests/replsets/election_candidate_and_participant_metrics.js
+++ b/jstests/replsets/election_candidate_and_participant_metrics.js
@@ -1,17 +1,18 @@
/**
- * This test checks that the metrics around election candidates and voters are set correctly.
+ * This test checks that the metrics around election candidates and participants are set and updated
+ * correctly. We test this with a two node replica set by forcing multiple election handoffs and
+ * checking the 'electionCandidateMetrics' and 'electionParticipantMetrics' fields of replSetStatus
+ * after each handoff.
*/
(function() {
"use strict";
-load("jstests/libs/check_log.js");
-load("jstests/replsets/libs/election_metrics.js");
+
load("jstests/replsets/libs/election_handoff.js");
const testName = jsTestName();
const numNodes = 2;
const rst = ReplSetTest({name: testName, nodes: numNodes});
-const nodes = rst.nodeList();
rst.startSet();
// Make sure there are no election timeouts firing for the duration of the test. This helps
@@ -35,10 +36,10 @@ assert.eq(originalPrimaryElectionCandidateMetrics.lastElectionReason, "electionT
assert(originalPrimaryElectionCandidateMetrics.lastElectionDate,
() => "Response should have an 'electionCandidateMetrics.lastElectionDate' field: " +
tojson(originalPrimaryElectionCandidateMetrics));
-assert(originalPrimaryElectionCandidateMetrics.termAtElection,
- () => "Response should have an 'electionCandidateMetrics.termAtElection' field: " +
+assert(originalPrimaryElectionCandidateMetrics.electionTerm,
+ () => "Response should have an 'electionCandidateMetrics.electionTerm' field: " +
tojson(originalPrimaryElectionCandidateMetrics));
-assert.eq(originalPrimaryElectionCandidateMetrics.termAtElection, 1);
+assert.eq(originalPrimaryElectionCandidateMetrics.electionTerm, 1);
assert(
originalPrimaryElectionCandidateMetrics.lastCommittedOpTimeAtElection,
() =>
@@ -81,10 +82,10 @@ assert.eq(newPrimaryElectionCandidateMetrics.lastElectionReason, "stepUpRequestS
assert(newPrimaryElectionCandidateMetrics.lastElectionDate,
() => "Response should have an 'electionCandidateMetrics.lastElectionDate' field: " +
tojson(newPrimaryElectionCandidateMetrics));
-assert(newPrimaryElectionCandidateMetrics.termAtElection,
- () => "Response should have an 'electionCandidateMetrics.termAtElection' field: " +
+assert(newPrimaryElectionCandidateMetrics.electionTerm,
+ () => "Response should have an 'electionCandidateMetrics.electionTerm' field: " +
tojson(newPrimaryElectionCandidateMetrics));
-assert.eq(newPrimaryElectionCandidateMetrics.termAtElection, 2);
+assert.eq(newPrimaryElectionCandidateMetrics.electionTerm, 2);
assert(
newPrimaryElectionCandidateMetrics.lastCommittedOpTimeAtElection,
() =>
@@ -109,8 +110,65 @@ assert.eq(newPrimaryElectionCandidateMetrics.electionTimeoutMillis, expectedElec
// priorPrimaryMemberId field.
assert.eq(newPrimaryElectionCandidateMetrics.priorPrimaryMemberId, 0);
+let newPrimaryElectionParticipantMetrics = newPrimaryReplSetGetStatus.electionParticipantMetrics;
+
+// The new primary should not have its 'electionParticipantMetrics' field set, since it was the
+// candidate in this election and did not vote for any other node.
+assert(!newPrimaryElectionParticipantMetrics,
+ () => "Response should not have an 'electionParticipantMetrics' field: " +
+ tojson(newPrimaryReplSetGetStatus));
+
+originalPrimaryReplSetGetStatus =
+ assert.commandWorked(originalPrimary.adminCommand({replSetGetStatus: 1}));
+let originalPrimaryElectionParticipantMetrics =
+ originalPrimaryReplSetGetStatus.electionParticipantMetrics;
+
+// Check that the 'electionParticipantMetrics' section of the replSetGetStatus response for the
+// original primary has all of the required fields and that they are set correctly.
+assert(originalPrimaryElectionParticipantMetrics.votedForCandidate,
+ () => "Response should have an 'electionParticipantMetrics.votedForCandidate' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert.eq(originalPrimaryElectionParticipantMetrics.votedForCandidate, true);
+assert(originalPrimaryElectionParticipantMetrics.electionTerm,
+ () => "Response should have an 'electionParticipantMetrics.electionTerm' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert.eq(originalPrimaryElectionParticipantMetrics.electionTerm, 2);
+assert(originalPrimaryElectionParticipantMetrics.lastVoteDate,
+ () => "Response should have an 'electionParticipantMetrics.lastVoteDate' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert(
+ originalPrimaryElectionParticipantMetrics.electionCandidateMemberId,
+ () => "Response should have an 'electionParticipantMetrics.electionCandidateMemberId' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert.eq(originalPrimaryElectionParticipantMetrics.electionCandidateMemberId, 1);
+// Since the node voted for the new primary, we directly assert that its voteReason is equal to
+// empty string.
+assert.eq(originalPrimaryElectionParticipantMetrics.voteReason, "");
+assert(
+ originalPrimaryElectionParticipantMetrics.lastAppliedOpTimeAtElection,
+ () =>
+ "Response should have an 'electionParticipantMetrics.lastAppliedOpTimeAtElection' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert(originalPrimaryElectionParticipantMetrics.maxAppliedOpTimeInSet,
+ () => "Response should have an 'electionParticipantMetrics.maxAppliedOpTimeInSet' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert(originalPrimaryElectionParticipantMetrics.priorityAtElection,
+ () => "Response should have an 'electionParticipantMetrics.priorityAtElection' field: " +
+ tojson(originalPrimaryElectionParticipantMetrics));
+assert.eq(originalPrimaryElectionParticipantMetrics.priorityAtElection, 1);
+
+originalPrimaryElectionCandidateMetrics = originalPrimaryReplSetGetStatus.electionCandidateMetrics;
+
+// The original primary should not have its 'electionCandidateMetrics' field set, since it was not a
+// candidate in this election.
+assert(!originalPrimaryElectionCandidateMetrics,
+ () => "Response should not have an 'electionCandidateMetrics' field: " +
+ tojson(originalPrimaryReplSetGetStatus));
+
+// testElectionHandoff steps down the primary with a non-zero step down period, so we need to
+// unfreeze the node to allow it to initiate an election again.
+assert.commandWorked(originalPrimary.adminCommand({replSetFreeze: 0}));
// Step up the original primary.
-sleep(ElectionHandoffTest.stepDownPeriodSecs * 1000);
ElectionHandoffTest.testElectionHandoff(rst, 1, 0);
originalPrimaryReplSetGetStatus =
@@ -119,7 +177,7 @@ originalPrimaryElectionCandidateMetrics = originalPrimaryReplSetGetStatus.electi
// Check that the original primary's metrics are also being set properly after the second election.
assert.eq(originalPrimaryElectionCandidateMetrics.lastElectionReason, "stepUpRequestSkipDryRun");
-assert.eq(originalPrimaryElectionCandidateMetrics.termAtElection, 3);
+assert.eq(originalPrimaryElectionCandidateMetrics.electionTerm, 3);
assert.eq(originalPrimaryElectionCandidateMetrics.numVotesNeeded, 2);
assert.eq(originalPrimaryElectionCandidateMetrics.priorityAtElection, 1);
assert.eq(originalPrimaryElectionCandidateMetrics.electionTimeoutMillis,
@@ -128,11 +186,45 @@ assert.eq(originalPrimaryElectionCandidateMetrics.priorPrimaryMemberId, 1);
newPrimaryReplSetGetStatus = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1}));
newPrimaryElectionCandidateMetrics = newPrimaryReplSetGetStatus.electionCandidateMetrics;
+newPrimaryElectionParticipantMetrics = newPrimaryReplSetGetStatus.electionParticipantMetrics;
// The other node should not have an electionCandidateMetrics, as it just stepped down.
assert(!newPrimaryElectionCandidateMetrics,
() => "Response should not have an 'electionCandidateMetrics' field: " +
tojson(newPrimaryReplSetGetStatus));
+// Check that the primary that just stepped down has its 'electionParticipantMetrics' field set
+// correctly.
+assert.eq(newPrimaryElectionParticipantMetrics.votedForCandidate, true);
+assert.eq(newPrimaryElectionParticipantMetrics.electionTerm, 3);
+assert.eq(newPrimaryElectionParticipantMetrics.electionCandidateMemberId, 0);
+assert.eq(newPrimaryElectionParticipantMetrics.voteReason, "");
+assert.eq(newPrimaryElectionParticipantMetrics.priorityAtElection, 1);
+
+// Since the election participant metrics are only set in the real election, set up a failpoint that
+// tells a voting node to vote yes in the dry run election and no in the real election.
+assert.commandWorked(originalPrimary.adminCommand(
+ {configureFailPoint: "voteYesInDryRunButNoInRealElection", mode: "alwaysOn"}));
+
+// Attempt to step up the new primary a second time. Due to the failpoint, the current primary
+// should vote no, and as a result the election should fail.
+assert.commandWorked(newPrimary.adminCommand({replSetFreeze: 0}));
+assert.commandFailedWithCode(newPrimary.adminCommand({replSetStepUp: 1}), ErrorCodes.CommandFailed);
+
+originalPrimaryReplSetGetStatus =
+ assert.commandWorked(originalPrimary.adminCommand({replSetGetStatus: 1}));
+originalPrimaryElectionParticipantMetrics =
+ originalPrimaryReplSetGetStatus.electionParticipantMetrics;
+
+// Check that the metrics in 'electionParticipantMetrics' were updated for the original primary
+// after the second election that it participated in.
+assert.eq(originalPrimaryElectionParticipantMetrics.votedForCandidate, false);
+assert.eq(originalPrimaryElectionParticipantMetrics.electionTerm, 4);
+assert.eq(originalPrimaryElectionParticipantMetrics.electionCandidateMemberId, 1);
+assert.eq(
+ originalPrimaryElectionParticipantMetrics.voteReason,
+ "forced to vote no in real election due to failpoint voteYesInDryRunButNoInRealElection set");
+assert.eq(originalPrimaryElectionParticipantMetrics.priorityAtElection, 1);
+
rst.stopSet();
})(); \ No newline at end of file
diff --git a/jstests/replsets/election_participant_new_term_metrics.js b/jstests/replsets/election_participant_new_term_metrics.js
new file mode 100644
index 00000000000..aa76bdf4546
--- /dev/null
+++ b/jstests/replsets/election_participant_new_term_metrics.js
@@ -0,0 +1,109 @@
+/**
+ * This test checks that the 'newTermStartDate' and 'newTermAppliedDate' metrics in
+ * 'electionParticipantMetrics' are set and updated correctly. We test this with a three node
+ * replica set by successfully stepping up one of the secondaries, then failing to step up the
+ * original primary. We check that the metrics are appropriately set or unset after each election.
+ */
+
+(function() {
+"use strict";
+
+const testName = jsTestName();
+const rst = ReplSetTest({name: testName, nodes: [{}, {}, {rsConfig: {priority: 0}}]});
+rst.startSet();
+
+// Make sure there are no election timeouts firing for the duration of the test. This helps
+// ensure that the test will only pass if the election succeeds.
+rst.initiateWithHighElectionTimeout();
+
+const originalPrimary = rst.getPrimary();
+const newPrimary = rst.getSecondaries()[0];
+const testNode = rst.getSecondaries()[1];
+
+// Set up a failpoint that forces the original primary to vote no in this election. This guarantees
+// that 'testNode' will be a participant in this election, since its vote will be needed for the new
+// primary to win.
+assert.commandWorked(
+ originalPrimary.adminCommand({configureFailPoint: "voteNoInElection", mode: "alwaysOn"}));
+
+// Step up the new primary.
+assert.commandWorked(newPrimary.adminCommand({replSetStepUp: 1}));
+rst.awaitNodesAgreeOnPrimary();
+assert.eq(newPrimary, rst.getPrimary());
+
+// Since the new term oplog entry needs to be replicated onto testNode for the metrics to be set, we
+// must await replication before checking the metrics.
+rst.awaitReplication();
+
+let testNodeReplSetGetStatus = assert.commandWorked(testNode.adminCommand({replSetGetStatus: 1}));
+let testNodeElectionParticipantMetrics = testNodeReplSetGetStatus.electionParticipantMetrics;
+
+const originalNewTermStartDate = testNodeElectionParticipantMetrics.newTermStartDate;
+const originalNewTermAppliedDate = testNodeElectionParticipantMetrics.newTermAppliedDate;
+
+// These fields should be set, since testNode has received and applied the new term oplog entry
+// after the election.
+assert(originalNewTermStartDate,
+ () => "Response should have an 'electionParticipantMetrics.newTermStartDate' field: " +
+ tojson(testNodeElectionParticipantMetrics));
+assert(originalNewTermAppliedDate,
+ () => "Response should have an 'electionParticipantMetrics.newTermAppliedDate' field: " +
+ tojson(testNodeElectionParticipantMetrics));
+
+// Set up a failpoint that forces newPrimary and testNode to vote no in the election, in addition to
+// the new primary above. This will cause the dry run to fail for the original primary.
+assert.commandWorked(
+ newPrimary.adminCommand({configureFailPoint: "voteNoInElection", mode: "alwaysOn"}));
+assert.commandWorked(
+ testNode.adminCommand({configureFailPoint: "voteNoInElection", mode: "alwaysOn"}));
+
+// Attempt to step up the original primary.
+assert.commandFailedWithCode(originalPrimary.adminCommand({replSetStepUp: 1}),
+ ErrorCodes.CommandFailed);
+
+testNodeReplSetGetStatus = assert.commandWorked(testNode.adminCommand({replSetGetStatus: 1}));
+testNodeElectionParticipantMetrics = testNodeReplSetGetStatus.electionParticipantMetrics;
+
+// The 'newTermStartDate' and 'newTermAppliedDate' fields should not be cleared, since the term is
+// not incremented when a dry run election is initiated.
+assert(testNodeElectionParticipantMetrics.newTermStartDate,
+ () => "Response should have an 'electionParticipantMetrics.newTermStartDate' field: " +
+ tojson(testNodeElectionParticipantMetrics));
+assert(testNodeElectionParticipantMetrics.newTermAppliedDate,
+ () => "Response should have an 'electionParticipantMetrics.newTermAppliedDate' field: " +
+ tojson(testNodeElectionParticipantMetrics));
+
+// The fields should store the same dates, since a new term oplog entry was not received.
+assert.eq(originalNewTermStartDate, testNodeElectionParticipantMetrics.newTermStartDate);
+assert.eq(originalNewTermAppliedDate, testNodeElectionParticipantMetrics.newTermAppliedDate);
+
+// Clear the previous failpoint and set up a new one that forces the two current secondaries to vote
+// yes for the candidate in the dry run election and no in the real election. This will cause the
+// real election to fail.
+assert.commandWorked(
+ newPrimary.adminCommand({configureFailPoint: "voteNoInElection", mode: "off"}));
+assert.commandWorked(newPrimary.adminCommand(
+ {configureFailPoint: "voteYesInDryRunButNoInRealElection", mode: "alwaysOn"}));
+assert.commandWorked(testNode.adminCommand({configureFailPoint: "voteNoInElection", mode: "off"}));
+assert.commandWorked(testNode.adminCommand(
+ {configureFailPoint: "voteYesInDryRunButNoInRealElection", mode: "alwaysOn"}));
+
+// Attempt to step up the original primary.
+assert.commandFailedWithCode(originalPrimary.adminCommand({replSetStepUp: 1}),
+ ErrorCodes.CommandFailed);
+
+testNodeReplSetGetStatus = assert.commandWorked(testNode.adminCommand({replSetGetStatus: 1}));
+testNodeElectionParticipantMetrics = testNodeReplSetGetStatus.electionParticipantMetrics;
+
+// Since the election succeeded in the dry run, a new term was encountered by the secondary.
+// However, because the real election failed, there was no new term oplog entry, so these fields
+// should not be set.
+assert(!testNodeElectionParticipantMetrics.newTermStartDate,
+ () => "Response should not have an 'electionParticipantMetrics.newTermStartDate' field: " +
+ tojson(testNodeElectionParticipantMetrics));
+assert(!testNodeElectionParticipantMetrics.newTermAppliedDate,
+ () => "Response should not have an 'electionParticipantMetrics.newTermAppliedDate' field: " +
+ tojson(testNodeElectionParticipantMetrics));
+
+rst.stopSet();
+})(); \ No newline at end of file
diff --git a/src/mongo/db/pipeline/document_source_change_stream_test.cpp b/src/mongo/db/pipeline/document_source_change_stream_test.cpp
index 501045ceadf..eb2583f705b 100644
--- a/src/mongo/db/pipeline/document_source_change_stream_test.cpp
+++ b/src/mongo/db/pipeline/document_source_change_stream_test.cpp
@@ -1423,8 +1423,8 @@ TEST_F(ChangeStreamStageTest, MatchFiltersCreateCollection) {
TEST_F(ChangeStreamStageTest, MatchFiltersNoOp) {
auto noOp = makeOplogEntry(OpTypeEnum::kNoop, // op type
{}, // namespace
- BSON("msg"
- << "new primary")); // o
+ BSON(repl::ReplicationCoordinator::newPrimaryMsgField
+ << repl::ReplicationCoordinator::newPrimaryMsg)); // o
checkTransformation(noOp, boost::none);
}
@@ -2076,8 +2076,8 @@ TEST_F(ChangeStreamStageDBTest, RenameFromUserToSystemCollectionShouldIncludeNot
TEST_F(ChangeStreamStageDBTest, MatchFiltersNoOp) {
OplogEntry noOp = makeOplogEntry(OpTypeEnum::kNoop,
NamespaceString(),
- BSON("msg"
- << "new primary"));
+ BSON(repl::ReplicationCoordinator::newPrimaryMsgField
+ << repl::ReplicationCoordinator::newPrimaryMsg));
checkTransformation(noOp, boost::none);
}
diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript
index cf9596327b4..3cd1dbbd937 100644
--- a/src/mongo/db/repl/SConscript
+++ b/src/mongo/db/repl/SConscript
@@ -719,6 +719,7 @@ env.Library(
'repl_coordinator_interface',
'repl_settings',
'storage_interface',
+ 'replication_metrics',
],
LIBDEPS_PRIVATE=[
'$BUILD_DIR/mongo/db/commands/mongod_fsync',
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 5c56aac38c5..433249a086f 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -888,7 +888,7 @@ public:
* Increment the counter for the number of ops applied during catchup if the node is in catchup
* mode.
*/
- virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) = 0;
+ virtual void incrementNumCatchUpOpsIfCatchingUp(long numOps) = 0;
/**
* Signals that drop pending collections have been removed from storage.
@@ -919,6 +919,16 @@ public:
virtual void attemptToAdvanceStableTimestamp() = 0;
+ /**
+ * Field name of the newPrimaryMsg within the 'o' field in the new term oplog entry.
+ */
+ inline static constexpr StringData newPrimaryMsgField = "msg"_sd;
+
+ /**
+ * Message string passed in the new term oplog entry after a primary has stepped up.
+ */
+ inline static constexpr StringData newPrimaryMsg = "new primary"_sd;
+
protected:
ReplicationCoordinator();
};
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index bcaf5d94b72..54cf29caf20 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -481,8 +481,8 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC
WriteUnitOfWork wuow(opCtx);
opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
opCtx,
- BSON("msg"
- << "new primary"));
+ BSON(ReplicationCoordinator::newPrimaryMsgField
+ << ReplicationCoordinator::newPrimaryMsg));
wuow.commit();
});
const auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx);
@@ -490,7 +490,7 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC
auto opTimeToReturn = loadLastOpTimeAndWallTimeResult.getValue().opTime;
auto newTermStartDate = loadLastOpTimeAndWallTimeResult.getValue().wallTime;
- ReplicationMetrics::get(opCtx).setNewTermStartDate(newTermStartDate);
+ ReplicationMetrics::get(opCtx).setCandidateNewTermStartDate(newTermStartDate);
auto replCoord = ReplicationCoordinator::get(opCtx);
replCoord->createWMajorityWriteAvailabilityDateWaiter(opTimeToReturn);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 0f686fd0e40..8283b8ddfdc 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2385,6 +2385,8 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus(
BSONObj electionCandidateMetrics =
ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON();
+ BSONObj electionParticipantMetrics =
+ ReplicationMetrics::get(getServiceContext()).getElectionParticipantMetricsBSON();
stdx::lock_guard<stdx::mutex> lk(_mutex);
Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
@@ -2395,6 +2397,7 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus(
_getCurrentCommittedSnapshotOpTimeAndWallTime_inlock(),
initialSyncProgress,
electionCandidateMetrics,
+ electionParticipantMetrics,
_storage->getLastStableCheckpointTimestampDeprecated(_service),
_storage->getLastStableRecoveryTimestamp(_service)},
response,
@@ -3033,6 +3036,9 @@ void ReplicationCoordinatorImpl::_onFollowerModeStateChange() {
void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
log() << "Entering primary catch-up mode.";
+ // Reset the number of catchup operations performed before starting catchup.
+ _numCatchUpOps = 0;
+
// No catchup in single node replica set.
if (_repl->_rsConfig.getNumMembers() == 1) {
abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
@@ -3076,8 +3082,6 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
return;
}
_timeoutCbh = status.getValue();
-
- _numCatchUpOps = 0;
}
void ReplicationCoordinatorImpl::CatchupState::abort_inlock(PrimaryCatchUpConclusionReason reason) {
@@ -3160,7 +3164,7 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
_repl->_opTimeWaiterList.add_inlock(_waiter.get());
}
-void ReplicationCoordinatorImpl::CatchupState::incrementNumCatchUpOps_inlock(int numOps) {
+void ReplicationCoordinatorImpl::CatchupState::incrementNumCatchUpOps_inlock(long numOps) {
_numCatchUpOps += numOps;
}
@@ -3173,7 +3177,7 @@ Status ReplicationCoordinatorImpl::abortCatchupIfNeeded(PrimaryCatchUpConclusion
return Status(ErrorCodes::IllegalOperation, "The node is not in catch-up mode.");
}
-void ReplicationCoordinatorImpl::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+void ReplicationCoordinatorImpl::incrementNumCatchUpOpsIfCatchingUp(long numOps) {
stdx::lock_guard<stdx::mutex> lk(_mutex);
if (_catchupState) {
_catchupState->incrementNumCatchUpOps_inlock(numOps);
@@ -3751,14 +3755,40 @@ Status ReplicationCoordinatorImpl::processReplSetRequestVotes(
_topCoord->processReplSetRequestVotes(args, response);
}
- if (!args.isADryRun() && response->getVoteGranted()) {
- LastVote lastVote{args.getTerm(), args.getCandidateIndex()};
+ if (!args.isADryRun()) {
+ const int candidateIndex = args.getCandidateIndex();
+ LastVote lastVote{args.getTerm(), candidateIndex};
- Status status = _externalState->storeLocalLastVoteDocument(opCtx, lastVote);
- if (!status.isOK()) {
- error() << "replSetRequestVotes failed to store LastVote document; " << status;
- return status;
+ const bool votedForCandidate = response->getVoteGranted();
+
+ if (votedForCandidate) {
+ Status status = _externalState->storeLocalLastVoteDocument(opCtx, lastVote);
+ if (!status.isOK()) {
+ error() << "replSetRequestVotes failed to store LastVote document; " << status;
+ return status;
+ }
}
+
+ // If the vote was not granted to the candidate, we still want to track metrics around the
+ // node's participation in the election.
+ const long long electionTerm = args.getTerm();
+ const Date_t lastVoteDate = _replExecutor->now();
+ const int electionCandidateMemberId =
+ _rsConfig.getMemberAt(candidateIndex).getId().getData();
+ const std::string voteReason = response->getReason();
+ const OpTime lastAppliedOpTime = _topCoord->getMyLastAppliedOpTime();
+ const OpTime maxAppliedOpTime = _topCoord->latestKnownOpTime();
+ const double priorityAtElection = _rsConfig.getMemberAt(_selfIndex).getPriority();
+
+ ReplicationMetrics::get(getServiceContext())
+ .setElectionParticipantMetrics(votedForCandidate,
+ electionTerm,
+ lastVoteDate,
+ electionCandidateMemberId,
+ voteReason,
+ lastAppliedOpTime,
+ maxAppliedOpTime,
+ priorityAtElection);
}
return Status::OK();
}
@@ -3898,6 +3928,10 @@ EventHandle ReplicationCoordinatorImpl::_updateTerm_inlock(
auto now = _replExecutor->now();
TopologyCoordinator::UpdateTermResult localUpdateTermResult = _topCoord->updateTerm(term, now);
if (localUpdateTermResult == TopologyCoordinator::UpdateTermResult::kUpdatedTerm) {
+ // When the node discovers a new term, the new term date metrics are now out-of-date, so we
+ // clear them.
+ ReplicationMetrics::get(getServiceContext()).clearParticipantNewTermDates();
+
_termShadow.store(term);
_cancelPriorityTakeover_inlock();
_cancelAndRescheduleElectionTimeout_inlock();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index ad0273da3b2..503186a1e8e 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -325,7 +325,7 @@ public:
virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
- virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) override;
+ virtual void incrementNumCatchUpOpsIfCatchingUp(long numOps) override;
void signalDropPendingCollectionsRemovedFromStorage() final;
@@ -677,7 +677,7 @@ private:
// Heartbeat calls this function to update the target optime.
void signalHeartbeatUpdate_inlock();
// Increment the counter for the number of ops applied during catchup.
- void incrementNumCatchUpOps_inlock(int numOps);
+ void incrementNumCatchUpOps_inlock(long numOps);
private:
ReplicationCoordinatorImpl* _repl; // Not owned.
@@ -687,7 +687,7 @@ private:
// we can exit catchup mode.
std::unique_ptr<CallbackWaiter> _waiter;
// Counter for the number of ops applied during catchup.
- int _numCatchUpOps;
+ long _numCatchUpOps = 0;
};
// Inner class to manage the concurrency of _canAcceptNonLocalWrites and _canServeNonLocalReads.
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index 6f95f6a5fd5..be960f2b5b8 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -537,7 +537,7 @@ Status ReplicationCoordinatorMock::abortCatchupIfNeeded(PrimaryCatchUpConclusion
return Status::OK();
}
-void ReplicationCoordinatorMock::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+void ReplicationCoordinatorMock::incrementNumCatchUpOpsIfCatchingUp(long numOps) {
return;
}
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index f3cc223981e..8ea9a9ddd8e 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -306,7 +306,7 @@ public:
virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
- virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) override;
+ virtual void incrementNumCatchUpOpsIfCatchingUp(long numOps) override;
void signalDropPendingCollectionsRemovedFromStorage() final;
diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp
index ee000942302..a506da67996 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.cpp
+++ b/src/mongo/db/repl/replication_coordinator_noop.cpp
@@ -334,7 +334,7 @@ Status ReplicationCoordinatorNoOp::abortCatchupIfNeeded(PrimaryCatchUpConclusion
MONGO_UNREACHABLE;
}
-void ReplicationCoordinatorNoOp::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+void ReplicationCoordinatorNoOp::incrementNumCatchUpOpsIfCatchingUp(long numOps) {
MONGO_UNREACHABLE;
}
diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h
index 412057ab92a..e6b1b3ecd43 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.h
+++ b/src/mongo/db/repl/replication_coordinator_noop.h
@@ -245,7 +245,7 @@ public:
Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) final;
- void incrementNumCatchUpOpsIfCatchingUp(int numOps) final;
+ void incrementNumCatchUpOpsIfCatchingUp(long numOps) final;
void signalDropPendingCollectionsRemovedFromStorage() final;
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index 0794c43fd0c..e7d55c50660 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -264,7 +264,7 @@ long ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_f
void ReplicationMetrics::setElectionCandidateMetrics(
const StartElectionReasonEnum reason,
const Date_t lastElectionDate,
- const long long termAtElection,
+ const long long electionTerm,
const OpTime lastCommittedOpTime,
const OpTime lastSeenOpTime,
const int numVotesNeeded,
@@ -277,7 +277,7 @@ void ReplicationMetrics::setElectionCandidateMetrics(
_nodeIsCandidateOrPrimary = true;
_electionCandidateMetrics.setLastElectionReason(reason);
_electionCandidateMetrics.setLastElectionDate(lastElectionDate);
- _electionCandidateMetrics.setTermAtElection(termAtElection);
+ _electionCandidateMetrics.setElectionTerm(electionTerm);
_electionCandidateMetrics.setLastCommittedOpTimeAtElection(lastCommittedOpTime);
_electionCandidateMetrics.setLastSeenOpTimeAtElection(lastSeenOpTime);
_electionCandidateMetrics.setNumVotesNeeded(numVotesNeeded);
@@ -292,14 +292,15 @@ void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) {
_electionCandidateMetrics.setTargetCatchupOpTime(opTime);
}
-void ReplicationMetrics::setNumCatchUpOps(int numCatchUpOps) {
+void ReplicationMetrics::setNumCatchUpOps(long numCatchUpOps) {
stdx::lock_guard<stdx::mutex> lk(_mutex);
+ invariant(numCatchUpOps >= 0);
_electionCandidateMetrics.setNumCatchUpOps(numCatchUpOps);
_totalNumCatchUpOps += numCatchUpOps;
_updateAverageCatchUpOps(lk);
}
-void ReplicationMetrics::setNewTermStartDate(Date_t newTermStartDate) {
+void ReplicationMetrics::setCandidateNewTermStartDate(Date_t newTermStartDate) {
stdx::lock_guard<stdx::mutex> lk(_mutex);
_electionCandidateMetrics.setNewTermStartDate(newTermStartDate);
}
@@ -336,6 +337,48 @@ void ReplicationMetrics::clearElectionCandidateMetrics() {
_nodeIsCandidateOrPrimary = false;
}
+void ReplicationMetrics::setElectionParticipantMetrics(const bool votedForCandidate,
+ const long long electionTerm,
+ const Date_t lastVoteDate,
+ const int electionCandidateMemberId,
+ const std::string voteReason,
+ const OpTime lastAppliedOpTime,
+ const OpTime maxAppliedOpTimeInSet,
+ const double priorityAtElection) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+
+ _nodeHasVotedInElection = true;
+ _electionParticipantMetrics.setVotedForCandidate(votedForCandidate);
+ _electionParticipantMetrics.setElectionTerm(electionTerm);
+ _electionParticipantMetrics.setLastVoteDate(lastVoteDate);
+ _electionParticipantMetrics.setElectionCandidateMemberId(electionCandidateMemberId);
+ _electionParticipantMetrics.setVoteReason(voteReason);
+ _electionParticipantMetrics.setLastAppliedOpTimeAtElection(lastAppliedOpTime);
+ _electionParticipantMetrics.setMaxAppliedOpTimeInSet(maxAppliedOpTimeInSet);
+ _electionParticipantMetrics.setPriorityAtElection(priorityAtElection);
+}
+
+BSONObj ReplicationMetrics::getElectionParticipantMetricsBSON() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ if (_nodeHasVotedInElection) {
+ return _electionParticipantMetrics.toBSON();
+ }
+ return BSONObj();
+}
+
+void ReplicationMetrics::setParticipantNewTermDates(Date_t newTermStartDate,
+ Date_t newTermAppliedDate) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _electionParticipantMetrics.setNewTermStartDate(newTermStartDate);
+ _electionParticipantMetrics.setNewTermAppliedDate(newTermAppliedDate);
+}
+
+void ReplicationMetrics::clearParticipantNewTermDates() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _electionParticipantMetrics.setNewTermStartDate(boost::none);
+ _electionParticipantMetrics.setNewTermAppliedDate(boost::none);
+}
+
void ReplicationMetrics::_updateAverageCatchUpOps(WithLock lk) {
long numCatchUps = _electionMetrics.getNumCatchUps();
if (numCatchUps > 0) {
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index 0d8c025e18b..59d27ace445 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -83,7 +83,7 @@ public:
// consistent state.
void setElectionCandidateMetrics(const StartElectionReasonEnum reason,
const Date_t lastElectionDate,
- const long long termAtElection,
+ const long long electionTerm,
const OpTime lastCommittedOpTime,
const OpTime lastSeenOpTime,
const int numVotesNeeded,
@@ -91,8 +91,8 @@ public:
const Milliseconds electionTimeoutMillis,
const boost::optional<int> priorPrimary);
void setTargetCatchupOpTime(OpTime opTime);
- void setNumCatchUpOps(int numCatchUpOps);
- void setNewTermStartDate(Date_t newTermStartDate);
+ void setNumCatchUpOps(long numCatchUpOps);
+ void setCandidateNewTermStartDate(Date_t newTermStartDate);
void setWMajorityWriteAvailabilityDate(Date_t wMajorityWriteAvailabilityDate);
boost::optional<OpTime> getTargetCatchupOpTime_forTesting();
@@ -101,6 +101,25 @@ public:
BSONObj getElectionCandidateMetricsBSON();
void clearElectionCandidateMetrics();
+ // Election participant metrics
+
+ // All the election participant metrics that should be set when a node votes in an election are
+ // set in this one function, so that the 'electionParticipantMetrics' section of replSetStatus
+ // shows a consistent state.
+ void setElectionParticipantMetrics(const bool votedForCandidate,
+ const long long electionTerm,
+ const Date_t lastVoteDate,
+ const int electionCandidateMemberId,
+ const std::string voteReason,
+ const OpTime lastAppliedOpTime,
+ const OpTime maxAppliedOpTimeInSet,
+ const double priorityAtElection);
+
+ BSONObj getElectionParticipantMetricsBSON();
+ void setParticipantNewTermDates(Date_t newTermStartDate, Date_t newTermAppliedDate);
+ void clearParticipantNewTermDates();
+
+
private:
class ElectionMetricsSSS;
@@ -112,6 +131,7 @@ private:
ElectionParticipantMetrics _electionParticipantMetrics;
bool _nodeIsCandidateOrPrimary = false;
+ bool _nodeHasVotedInElection = false;
// This field is a double so that the division result in _updateAverageCatchUpOps will be a
// double without any casting.
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index e214c2b0dc8..4fb062411ed 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -144,8 +144,8 @@ structs:
lastElectionDate:
description: "Time the node called for the election"
type: date
- termAtElection:
- description: "New term at the time of election"
+ electionTerm:
+ description: "Proposed new term for this election"
type: long
lastCommittedOpTimeAtElection:
description: "Last OpTime the node committed before calling the election"
@@ -189,6 +189,36 @@ structs:
description: "Stores metrics that are specific to the last election in which the node voted"
strict: true
fields:
+ votedForCandidate:
+ description: "States if the node has voted yes or no for the candidate in this election"
+ type: bool
+ electionTerm:
+ description: "The term of the candidate that is running for election"
+ type: long
+ lastVoteDate:
+ description: "Time the node voted"
+ type: date
+ electionCandidateMemberId:
+ description: "MemberId of the node requesting a vote"
+ type: int
+ voteReason:
+ description: "Reason why the node voted the way it did"
+ type: string
+ lastAppliedOpTimeAtElection:
+ description: "Latest applied OpTime at the time of voting"
+ type: optime
+ maxAppliedOpTimeInSet:
+ description: "Highest applied time of any node in this replica set, as currently
+ known by this node"
+ type: optime
priorityAtElection:
description: "The node's priority at the time of the election"
type: double
+ newTermStartDate:
+ description: "Time the new term oplog entry was written by the primary"
+ type: date
+ optional: true
+ newTermAppliedDate:
+ description: "Time this node applied the new term oplog entry"
+ type: date
+ optional: true
diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp
index 6938895799c..50d648a5790 100644
--- a/src/mongo/db/repl/sync_tail.cpp
+++ b/src/mongo/db/repl/sync_tail.cpp
@@ -69,6 +69,7 @@
#include "mongo/db/repl/repl_client_info.h"
#include "mongo/db/repl/repl_set_config.h"
#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/db/repl/replication_metrics.h"
#include "mongo/db/repl/transaction_oplog_application.h"
#include "mongo/db/session.h"
#include "mongo/db/session_txn_record_gen.h"
@@ -327,6 +328,18 @@ Status SyncTail::syncApply(OperationContext* opCtx,
if (opType == OpTypeEnum::kNoop) {
incrementOpsAppliedStats();
+
+ auto oplogEntry = OplogEntryBase::parse(IDLParserErrorContext("syncApply"), op);
+ auto opObj = oplogEntry.getObject();
+ if (opObj.hasField(ReplicationCoordinator::newPrimaryMsgField) &&
+ opObj.getField(ReplicationCoordinator::newPrimaryMsgField).str() ==
+ ReplicationCoordinator::newPrimaryMsg) {
+
+ invariant(oplogEntry.getWallClockTime());
+ ReplicationMetrics::get(opCtx).setParticipantNewTermDates(
+ oplogEntry.getWallClockTime().get(), applyStartTime);
+ }
+
return Status::OK();
} else if (OplogEntry::isCrudOpType(opType)) {
return finishApply(writeConflictRetry(opCtx, "syncApply_CRUD", nss.ns(), [&] {
diff --git a/src/mongo/db/repl/sync_tail_test.cpp b/src/mongo/db/repl/sync_tail_test.cpp
index e9fb2cb0fb1..188e00875a9 100644
--- a/src/mongo/db/repl/sync_tail_test.cpp
+++ b/src/mongo/db/repl/sync_tail_test.cpp
@@ -232,32 +232,6 @@ auto parseFromOplogEntryArray(const BSONObj& obj, int elem) {
return OpTime(tsArray.Array()[elem].timestamp(), termArray.Array()[elem].Long());
};
-TEST_F(SyncTailTest, SyncApplyNoNamespaceBadOp) {
- const BSONObj op = BSON("op"
- << "x");
- ASSERT_THROWS(
- SyncTail::syncApply(_opCtx.get(), op, OplogApplication::Mode::kInitialSync, boost::none),
- ExceptionFor<ErrorCodes::BadValue>);
-}
-
-TEST_F(SyncTailTest, SyncApplyNoNamespaceNoOp) {
- ASSERT_OK(SyncTail::syncApply(_opCtx.get(),
- BSON("op"
- << "n"),
- OplogApplication::Mode::kInitialSync,
- boost::none));
-}
-
-TEST_F(SyncTailTest, SyncApplyBadOp) {
- const BSONObj op = BSON("op"
- << "x"
- << "ns"
- << "test.t");
- ASSERT_THROWS(
- SyncTail::syncApply(_opCtx.get(), op, OplogApplication::Mode::kInitialSync, boost::none),
- ExceptionFor<ErrorCodes::BadValue>);
-}
-
TEST_F(SyncTailTest, SyncApplyInsertDocumentDatabaseMissing) {
NamespaceString nss("test.t");
auto op = makeOplogEntry(OpTypeEnum::kInsert, nss, {});
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 8ee1a117226..b58f91471b1 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -65,6 +65,8 @@ namespace mongo {
namespace repl {
MONGO_FAIL_POINT_DEFINE(forceSyncSourceCandidate);
+MONGO_FAIL_POINT_DEFINE(voteNoInElection);
+MONGO_FAIL_POINT_DEFINE(voteYesInDryRunButNoInRealElection);
// If this fail point is enabled, TopologyCoordinator::shouldChangeSyncSource() will ignore
// the option TopologyCoordinator::Options::maxSyncSourceLagSecs. The sync source will not be
@@ -1388,7 +1390,7 @@ std::string TopologyCoordinator::_getReplSetStatusString() {
// Construct a ReplSetStatusArgs using default parameters. Missing parameters will not be
// included in the status string.
ReplSetStatusArgs rsStatusArgs{
- Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), BSONObj(), boost::none};
+ Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), BSONObj(), BSONObj(), boost::none};
BSONObjBuilder builder;
Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
prepareStatusResponse(rsStatusArgs, &builder, &result);
@@ -1411,6 +1413,7 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu
const Date_t lastOpDurableWall = getMyLastDurableOpTimeAndWallTime().wallTime;
const BSONObj& initialSyncStatus = rsStatusArgs.initialSyncStatus;
const BSONObj& electionCandidateMetrics = rsStatusArgs.electionCandidateMetrics;
+ const BSONObj& electionParticipantMetrics = rsStatusArgs.electionParticipantMetrics;
const boost::optional<Timestamp>& lastStableRecoveryTimestamp =
rsStatusArgs.lastStableRecoveryTimestamp;
const boost::optional<Timestamp>& lastStableCheckpointTimestampDeprecated =
@@ -1617,6 +1620,10 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu
response->append("electionCandidateMetrics", electionCandidateMetrics);
}
+ if (!electionParticipantMetrics.isEmpty()) {
+ response->append("electionParticipantMetrics", electionParticipantMetrics);
+ }
+
response->append("members", membersOut);
*result = Status::OK();
}
@@ -2690,6 +2697,29 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr
ReplSetRequestVotesResponse* response) {
response->setTerm(_term);
+ if (MONGO_unlikely(voteNoInElection.shouldFail())) {
+ log() << "failpoint voteNoInElection enabled";
+ response->setVoteGranted(false);
+ response->setReason(str::stream() << "forced to vote no during dry run election due to "
+ "failpoint voteNoInElection set");
+ return;
+ }
+
+ if (MONGO_unlikely(voteYesInDryRunButNoInRealElection.shouldFail())) {
+ log() << "failpoint voteYesInDryRunButNoInRealElection enabled";
+ if (args.isADryRun()) {
+ response->setVoteGranted(true);
+ response->setReason(str::stream() << "forced to vote yes in dry run due to failpoint "
+ "voteYesInDryRunButNoInRealElection set");
+ } else {
+ response->setVoteGranted(false);
+ response->setReason(str::stream()
+ << "forced to vote no in real election due to failpoint "
+ "voteYesInDryRunButNoInRealElection set");
+ }
+ return;
+ }
+
if (args.getTerm() < _term) {
response->setVoteGranted(false);
response->setReason(str::stream() << "candidate's term (" << args.getTerm()
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index e23f820fef3..db0bf31428d 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -306,6 +306,7 @@ public:
const OpTimeAndWallTime readConcernMajorityOpTime;
const BSONObj initialSyncStatus;
const BSONObj electionCandidateMetrics;
+ const BSONObj electionParticipantMetrics;
// boost::none if the storage engine does not support RTT, or if it does but does not
// persist data to necessitate taking checkpoints. Timestamp::min() if a checkpoint is yet
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index 04d0a152219..170bfc1587b 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1538,7 +1538,8 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
Timestamp lastStableRecoveryTimestamp(2, 2);
Timestamp lastStableCheckpointTimestampDeprecated(2, 2);
BSONObj initialSyncStatus = BSON("failedInitialSyncAttempts" << 1);
- BSONObj electionCandidateMetrics = BSON("DummyElectionMetrics" << 1);
+ BSONObj electionCandidateMetrics = BSON("DummyElectionCandidateMetrics" << 1);
+ BSONObj electionParticipantMetrics = BSON("DummyElectionParticipantMetrics" << 1);
std::string setName = "mySet";
ReplSetHeartbeatResponse hb;
@@ -1595,6 +1596,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
{readConcernMajorityOpTime, readConcernMajorityWallTime},
initialSyncStatus,
electionCandidateMetrics,
+ electionParticipantMetrics,
lastStableCheckpointTimestampDeprecated,
lastStableRecoveryTimestamp},
&statusBuilder,
@@ -1708,6 +1710,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
ASSERT_EQUALS(3, rsStatus["writeMajorityCount"].numberInt());
ASSERT_BSONOBJ_EQ(initialSyncStatus, rsStatus["initialSyncStatus"].Obj());
ASSERT_BSONOBJ_EQ(electionCandidateMetrics, rsStatus["electionCandidateMetrics"].Obj());
+ ASSERT_BSONOBJ_EQ(electionParticipantMetrics, rsStatus["electionParticipantMetrics"].Obj());
// Test no lastStableRecoveryTimestamp field.
BSONObjBuilder statusBuilder2;
@@ -1727,6 +1730,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
ASSERT_FALSE(rsStatus.hasField("lastStableRecoveryTimestamp"));
ASSERT_FALSE(rsStatus.hasField("lastStableCheckpointTimestamp"));
ASSERT_FALSE(rsStatus.hasField("electionCandidateMetrics"));
+ ASSERT_FALSE(rsStatus.hasField("electionParticipantMetrics"));
}
TEST_F(TopoCoordTest, ReplSetGetStatusWriteMajorityDifferentFromMajorityVoteCount) {
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index 767a8369d31..be0946a5058 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -359,7 +359,7 @@ Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded(PrimaryCatchUpConclu
UASSERT_NOT_IMPLEMENTED;
}
-void ReplicationCoordinatorEmbedded::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+void ReplicationCoordinatorEmbedded::incrementNumCatchUpOpsIfCatchingUp(long numOps) {
UASSERT_NOT_IMPLEMENTED;
}
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index e39955ea365..8d7788a0f41 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -253,7 +253,7 @@ public:
Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
- void incrementNumCatchUpOpsIfCatchingUp(int numOps) override;
+ void incrementNumCatchUpOpsIfCatchingUp(long numOps) override;
void signalDropPendingCollectionsRemovedFromStorage() final;