-rw-r--r--  jstests/replsets/catchup.js                                        | 22
-rw-r--r--  jstests/replsets/stepdown.js                                       |  9
-rw-r--r--  jstests/replsets/unconditional_step_down.js                        | 14
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp                 | 13
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp        |  7
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp   | 57
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp       |  8
-rw-r--r--  src/mongo/db/repl/replication_metrics.cpp                          | 30
-rw-r--r--  src/mongo/db/repl/replication_metrics.h                            | 15
-rw-r--r--  src/mongo/db/repl/replication_metrics.idl                          | 13
-rw-r--r--  src/mongo/db/repl/topology_coordinator.cpp                         |  8
-rw-r--r--  src/mongo/db/repl/topology_coordinator.h                           |  1
-rw-r--r--  src/mongo/db/repl/topology_coordinator_v1_test.cpp                 |  7
13 files changed, 199 insertions(+), 5 deletions(-)
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 6657e6b9076..d6bd38af46e 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -118,6 +118,15 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
newNewPrimaryStatus.electionMetrics,
'numCatchUpsAlreadyCaughtUp');
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have
+// a 'targetCatchupOpTime' field if the target opTime for catchup is not set.
+let res = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1}));
+assert(res.electionCandidateMetrics,
+ () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
+assert(!res.electionCandidateMetrics.targetCatchupOpTime,
+ () => "Response should not have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
+ tojson(res.electionCandidateMetrics));
+
jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
@@ -140,6 +149,19 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
newNewPrimaryStatus.electionMetrics,
'numCatchUpsSucceeded');
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has a
+// 'targetCatchupOpTime' field once heartbeats have updated the target opTime for catchup, and that
+// it has the correct value.
+res = assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1}));
+assert(res.electionCandidateMetrics,
+ () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
+assert(res.electionCandidateMetrics.targetCatchupOpTime,
+ () => "Response should have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
+ tojson(res.electionCandidateMetrics));
+assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.ts,
+ stepUpResults.latestOpOnOldPrimary.ts);
+assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.t, stepUpResults.latestOpOnOldPrimary.t);
+
// Wait for all secondaries to catch up
rst.awaitReplication();
// Check the latest op on old primary is preserved on the new one.
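For reference, a minimal mongo shell sketch of how the new field surfaces while a fresh primary still has a catchup target; the 'newPrimary' connection and the concrete optime values are illustrative, not taken from this patch:

    // Hypothetical inspection of the new field on a primary that is still catching up.
    const status = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1}));
    // Prints something like {"ts": Timestamp(1565112310, 1), "t": NumberLong(1)} while the
    // target is set; the field is absent entirely if no catchup target was ever established.
    printjson(status.electionCandidateMetrics.targetCatchupOpTime);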
diff --git a/jstests/replsets/stepdown.js b/jstests/replsets/stepdown.js
index 934d9b9a2f4..6abdd335e92 100644
--- a/jstests/replsets/stepdown.js
+++ b/jstests/replsets/stepdown.js
@@ -64,6 +64,9 @@ try {
assert.writeOK(master.getDB("foo").bar.insert({x: i}));
}
+ let res = assert.commandWorked(master.adminCommand({replSetGetStatus: 1}));
+ assert(res.electionCandidateMetrics,
+ () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
let intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
jsTestLog('Do stepdown of primary ' + master + ' that should not work');
@@ -156,6 +159,12 @@ try {
assert.eq(r2.ismaster, false);
assert.eq(r2.secondary, true);
+ // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has been
+ // cleared, since the node is no longer primary.
+ res = assert.commandWorked(master.adminCommand({replSetGetStatus: 1}));
+ assert(!res.electionCandidateMetrics,
+ () => "Response should not have an 'electionCandidateMetrics' field: " + tojson(res));
+
// This section checks that the metrics are incremented accurately when the command fails due to
// an error while stepping down. This is one reason the replSetStepDown command could fail once
// we call stepDown in the replication coordinator, but success in this case gives us confidence
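A hedged shell sketch of the contract these new assertions encode; the 'primary' connection and the step-down timeout are illustrative. The section is present while the node is primary and is removed outright, rather than left as an empty subdocument, once it steps down:

    // Hypothetical before/after check around a forced step-down.
    assert(primary.adminCommand({replSetGetStatus: 1}).electionCandidateMetrics);
    assert.commandWorked(primary.adminCommand({replSetStepDown: 60, force: true}));
    assert(!primary.adminCommand({replSetGetStatus: 1}).electionCandidateMetrics);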
diff --git a/jstests/replsets/unconditional_step_down.js b/jstests/replsets/unconditional_step_down.js
index c9f95bcb1ac..4d5e37821db 100644
--- a/jstests/replsets/unconditional_step_down.js
+++ b/jstests/replsets/unconditional_step_down.js
@@ -106,6 +106,10 @@ function runStepDownTest({testMsg, stepDownFn, toRemovedState}) {
jsTestLog("Wait for write cmd to reach the fail point");
waitForCurOpByFailPoint(primaryDB, collNss, writeFailPoint);
+ let res = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
+ assert(res.electionCandidateMetrics,
+ () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
+
jsTestLog("Trigger step down");
var oldConfig = stepDownFn();
@@ -119,6 +123,16 @@ function runStepDownTest({testMsg, stepDownFn, toRemovedState}) {
(toRemovedState) ? ReplSetTest.State.REMOVED : ReplSetTest.State.SECONDARY);
assert.commandWorked(primary.adminCommand({configureFailPoint: writeFailPoint, mode: "off"}));
+
+ // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has been
+ // cleared, since the node is no longer primary.
+ if (!toRemovedState) {
+ res = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
+ assert(
+ !res.electionCandidateMetrics,
+ () => "Response should not have an 'electionCandidateMetrics' field: " + tojson(res));
+ }
+
// Get the new primary.
refreshConnection();
}
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index f425290749b..45e11716d92 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2102,6 +2102,10 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
lk.lock();
_updateAndLogStatsOnStepDown(&arsd);
+
+ // Clear the node's election candidate metrics since it is no longer primary.
+ ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
+
_topCoord->finishUnconditionalStepDown();
onExitGuard.dismiss();
@@ -2365,6 +2369,9 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus(
}
}
+ BSONObj electionCandidateMetrics =
+ ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON();
+
stdx::lock_guard<stdx::mutex> lk(_mutex);
Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
_topCoord->prepareStatusResponse(
@@ -2373,6 +2380,7 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus(
static_cast<unsigned>(time(nullptr) - serverGlobalParams.started),
_getCurrentCommittedSnapshotOpTimeAndWallTime_inlock(),
initialSyncProgress,
+ electionCandidateMetrics,
_storage->getLastStableRecoveryTimestamp(_service),
_externalState->tooStale()},
response,
@@ -2678,6 +2686,9 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
lk.lock();
_updateAndLogStatsOnStepDown(&arsd.get());
+
+ // Clear the node's election candidate metrics since it is no longer primary.
+ ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
} else {
// Release the rstl lock as the node might have stepped down due to
// other unconditional step down code paths like learning new term via heartbeat &
@@ -3095,6 +3106,8 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
return;
}
+ ReplicationMetrics::get(getGlobalServiceContext()).setTargetCatchupOpTime(targetOpTime.get());
+
log() << "Heartbeats updated catchup target optime to " << *targetOpTime;
log() << "Latest known optime per replica set member:";
auto opTimesPerMember = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestartPerMember();
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index 2b7d7bd62f9..ea12a516ba1 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -64,6 +64,10 @@ public:
if (_replCoord->_electionFinishedEvent.isValid()) {
_replCoord->_replExecutor->signalEvent(_replCoord->_electionFinishedEvent);
}
+
+ // Clear the node's election candidate metrics if it loses either the dry-run or actual
+ // election, since it will not become primary.
+ ReplicationMetrics::get(getGlobalServiceContext()).clearElectionCandidateMetrics();
}
void dismiss() {
@@ -141,6 +145,9 @@ void ReplicationCoordinatorImpl::_startElectSelfV1_inlock(
long long term = _topCoord->getTerm();
int primaryIndex = -1;
+ Date_t now = _replExecutor->now();
+ ReplicationMetrics::get(getServiceContext()).setElectionCandidateMetrics(now);
+
if (reason == TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun) {
long long newTerm = term + 1;
log() << "skipping dry run and running for election in term " << newTerm;
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 3f0f3334d83..4c7afc1130f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -48,6 +48,8 @@
#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
+#include <boost/optional/optional_io.hpp>
+
namespace mongo {
namespace repl {
namespace {
@@ -344,6 +346,10 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun)
simulateEnoughHeartbeatsForAllNodesUp();
+ // Check that the node's election candidate metrics are unset before it becomes primary.
+ ASSERT_BSONOBJ_EQ(
+ BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
+
auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest();
ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen);
log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)";
@@ -368,6 +374,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun)
<< false << "reason"
<< "don't like him much")));
voteRequests++;
+ // Check that the node's election candidate metrics are set once it has called an
+ // election.
+ ASSERT_BSONOBJ_NE(
+ BSONObj(),
+ ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
} else {
net->blackHole(noi);
}
@@ -377,6 +388,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun)
stopCapturingLogMessages();
ASSERT_EQUALS(
1, countLogLinesContaining("not running for primary, we received insufficient votes"));
+
+ // Check that the node's election candidate metrics have been cleared, since it lost the dry-run
+ // election and will not become primary.
+ ASSERT_BSONOBJ_EQ(
+ BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
}
TEST_F(ReplCoordTest, ElectionFailsWhenDryRunResponseContainsANewerTerm) {
@@ -667,9 +683,17 @@ TEST_F(ReplCoordTest, ElectionFailsWhenVoteRequestResponseContainsANewerTerm) {
replCoordSetMyLastDurableOpTime(time1, Date_t() + Seconds(time1.getSecs()));
ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+ // Check that the node's election candidate metrics are unset before it becomes primary.
+ ASSERT_BSONOBJ_EQ(
+ BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
+
simulateEnoughHeartbeatsForAllNodesUp();
simulateSuccessfulDryRun();
+ // Check that the node's election candidate metrics are set once it has called an election.
+ ASSERT_BSONOBJ_NE(
+ BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
+
NetworkInterfaceMock* net = getNet();
net->enterNetwork();
while (net->hasReadyRequests()) {
@@ -694,6 +718,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenVoteRequestResponseContainsANewerTerm) {
stopCapturingLogMessages();
ASSERT_EQUALS(1,
countLogLinesContaining("not becoming primary, we have been superseded already"));
+
+ // Check that the node's election candidate metrics have been cleared, since it lost the actual
+ // election and will not become primary.
+ ASSERT_BSONOBJ_EQ(
+ BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
}
TEST_F(ReplCoordTest, ElectionFailsWhenTermChangesDuringDryRun) {
@@ -2228,6 +2257,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) {
ASSERT_EQ(0,
ReplicationMetrics::get(opCtx.get())
.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+
+ // Check that the targetCatchupOpTime metric was not set.
+ ASSERT_EQUALS(boost::none,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
}
// Heartbeats set a future target OpTime and we reached that successfully.
@@ -2237,11 +2270,23 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) {
OpTime time1(Timestamp(100, 1), 0);
OpTime time2(Timestamp(100, 2), 0);
ReplSetConfig config = setUp3NodeReplSetAndRunForElection(time1);
+
+ // Check that the targetCatchupOpTime metric is unset before the target opTime for catchup is
+ // set.
+ ASSERT_EQUALS(boost::none,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
+
processHeartbeatRequests([this, time2](const NetworkOpIter noi) {
auto net = getNet();
// The old primary accepted one more op and all nodes caught up after voting for me.
net->scheduleResponse(noi, net->now(), makeHeartbeatResponse(time2));
});
+
+ // Check that the targetCatchupOpTime metric was set correctly when heartbeats updated the
+ // target opTime for catchup.
+ ASSERT_EQUALS(time2,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
+
ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
advanceMyLastAppliedOpTime(time2, Date_t() + Seconds(time2.getSecs()));
ASSERT(getReplCoord()->getApplierState() == ApplierState::Draining);
@@ -2431,6 +2476,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
// Other nodes are ahead of me.
getNet()->scheduleResponse(noi, getNet()->now(), makeHeartbeatResponse(time2));
});
+
+ ASSERT_EQUALS(time2,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
+
ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
TopologyCoordinator::UpdateTermResult updateTermResult;
auto evh = getReplCoord()->updateTerm_forTest(2, &updateTermResult);
@@ -2461,6 +2510,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
ASSERT_EQ(0,
ReplicationMetrics::get(opCtx.get())
.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+
+ // Check that the targetCatchupOpTime metric was cleared when the node stepped down.
+ ASSERT_EQUALS(boost::none,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
@@ -2566,6 +2619,8 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
stopCapturingLogMessages();
ASSERT_EQ(1, countLogLinesContaining("Heartbeats updated catchup target optime"));
+ ASSERT_EQUALS(time3,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
// 3) Advancing its applied optime to time 2 isn't enough.
advanceMyLastAppliedOpTime(time2, Date_t() + Seconds(time2.getSecs()));
@@ -2586,6 +2641,8 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
stopCapturingLogMessages();
ASSERT_EQ(1, countLogLinesContaining("Heartbeats updated catchup target optime"));
+ ASSERT_EQUALS(time4,
+ ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
// 5) Advancing to time 3 isn't enough now.
advanceMyLastAppliedOpTime(time3, Date_t() + Seconds(time3.getSecs()));
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index e3c3d60eca6..cb85f948095 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -49,6 +49,7 @@
#include "mongo/db/repl/repl_set_heartbeat_args_v1.h"
#include "mongo/db/repl/repl_set_heartbeat_response.h"
#include "mongo/db/repl/replication_coordinator_impl.h"
+#include "mongo/db/repl/replication_metrics.h"
#include "mongo/db/repl/replication_process.h"
#include "mongo/db/repl/topology_coordinator.h"
#include "mongo/db/repl/vote_requester.h"
@@ -407,6 +408,10 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
lk.lock();
_updateAndLogStatsOnStepDown(&arsd);
+
+ // Clear the node's election candidate metrics since it is no longer primary.
+ ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
+
_topCoord->finishUnconditionalStepDown();
const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get());
@@ -629,6 +634,9 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
lk.lock();
_updateAndLogStatsOnStepDown(&arsd.get());
+
+ // Clear the node's election candidate metrics since it is no longer primary.
+ ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
} else {
// Release the rstl lock as the node might have stepped down due to
// other unconditional step down code paths like learning new term via heartbeat &
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index 7a56c228e23..af4fab849b5 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -262,11 +262,41 @@ int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_fo
return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd();
}
+void ReplicationMetrics::setElectionCandidateMetrics(Date_t lastElectionDate) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _electionCandidateMetrics.setLastElectionDate(lastElectionDate);
+ _nodeIsCandidateOrPrimary = true;
+}
+
+void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _electionCandidateMetrics.setTargetCatchupOpTime(opTime);
+}
+
+boost::optional<OpTime> ReplicationMetrics::getTargetCatchupOpTime_forTesting() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ return _electionCandidateMetrics.getTargetCatchupOpTime();
+}
+
BSONObj ReplicationMetrics::getElectionMetricsBSON() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.toBSON();
}
+BSONObj ReplicationMetrics::getElectionCandidateMetricsBSON() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ if (_nodeIsCandidateOrPrimary) {
+ return _electionCandidateMetrics.toBSON();
+ }
+ return BSONObj();
+}
+
+void ReplicationMetrics::clearElectionCandidateMetrics() {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _electionCandidateMetrics.setTargetCatchupOpTime(boost::none);
+ _nodeIsCandidateOrPrimary = false;
+}
+
class ReplicationMetrics::ElectionMetricsSSS : public ServerStatusSection {
public:
ElectionMetricsSSS() : ServerStatusSection("electionMetrics") {}
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index de2c9876f68..790036be571 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -48,6 +48,7 @@ public:
ReplicationMetrics();
~ReplicationMetrics();
+ // Election metrics
void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason);
void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason);
void incrementNumStepDownsCausedByHigherTerm();
@@ -75,7 +76,19 @@ public:
int getNumCatchUpsFailedWithNewTerm_forTesting();
int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting();
+ // Election candidate metrics
+
+ // All the election candidate metrics that should be set when a node calls an election are set
+ // in this one function, so that the 'electionCandidateMetrics' section of replSetGetStatus shows a
+ // consistent state.
+ void setElectionCandidateMetrics(Date_t lastElectionDate);
+ void setTargetCatchupOpTime(OpTime opTime);
+
+ boost::optional<OpTime> getTargetCatchupOpTime_forTesting();
+
BSONObj getElectionMetricsBSON();
+ BSONObj getElectionCandidateMetricsBSON();
+ void clearElectionCandidateMetrics();
private:
class ElectionMetricsSSS;
@@ -84,6 +97,8 @@ private:
ElectionMetrics _electionMetrics;
ElectionCandidateMetrics _electionCandidateMetrics;
ElectionParticipantMetrics _electionParticipantMetrics;
+
+ bool _nodeIsCandidateOrPrimary = false;
};
} // namespace repl
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index d5ee27d7bb2..5821858d870 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -34,9 +34,11 @@ global:
cpp_namespace: "mongo::repl"
cpp_includes:
- "mongo/db/repl/election_reason_counter_parser.h"
+ - "mongo/db/repl/optime.h"
imports:
- "mongo/idl/basic_types.idl"
+ - "mongo/db/repl/replication_types.idl"
types:
ElectionReasonCounter:
@@ -120,9 +122,14 @@ structs:
candidate"
strict: true
fields:
- priorityAtElection:
- description: "The node's priority at the time of the election"
- type: double
+ lastElectionDate:
+ description: "Time the node called the dry run election, or the actual election if
+ it skipped dry run"
+ type: date
+ targetCatchupOpTime:
+ description: "The node's target opTime for catchup"
+ type: optime
+ optional: true
ElectionParticipantMetrics:
description: "Stores metrics that are specific to the last election in which the node voted"
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 1aa98b65c8a..c79248d7af6 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -1417,7 +1417,8 @@ const MemberConfig* TopologyCoordinator::_currentPrimaryMember() const {
std::string TopologyCoordinator::_getReplSetStatusString() {
// Construct a ReplSetStatusArgs using default parameters. Missing parameters will not be
// included in the status string.
- ReplSetStatusArgs rsStatusArgs{Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), boost::none};
+ ReplSetStatusArgs rsStatusArgs{
+ Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), BSONObj(), boost::none};
BSONObjBuilder builder;
Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
prepareStatusResponse(rsStatusArgs, &builder, &result);
@@ -1439,6 +1440,7 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu
const OpTime lastOpDurable = getMyLastDurableOpTime();
const Date_t lastOpDurableWall = getMyLastDurableOpTimeAndWallTime().wallTime;
const BSONObj& initialSyncStatus = rsStatusArgs.initialSyncStatus;
+ const BSONObj& electionCandidateMetrics = rsStatusArgs.electionCandidateMetrics;
const boost::optional<Timestamp>& lastStableRecoveryTimestamp =
rsStatusArgs.lastStableRecoveryTimestamp;
@@ -1639,6 +1641,10 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu
response->append("initialSyncStatus", initialSyncStatus);
}
+ if (!electionCandidateMetrics.isEmpty()) {
+ response->append("electionCandidateMetrics", electionCandidateMetrics);
+ }
+
response->append("members", membersOut);
*result = Status::OK();
}
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index d53a581e82b..d81e9f52154 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -301,6 +301,7 @@ public:
const unsigned selfUptime;
const OpTimeAndWallTime readConcernMajorityOpTime;
const BSONObj initialSyncStatus;
+ const BSONObj electionCandidateMetrics;
// boost::none if the storage engine does not support recovery to a timestamp.
// Timestamp::min() if a stable recovery timestamp is yet to be taken.
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index e0cc0edbe3b..8060e7f9488 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1538,6 +1538,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
Timestamp lastStableRecoveryTimestamp(2, 2);
Timestamp lastStableCheckpointTimestampDeprecated(2, 2);
BSONObj initialSyncStatus = BSON("failedInitialSyncAttempts" << 1);
+ BSONObj electionCandidateMetrics = BSON("DummyElectionMetrics" << 1);
std::string setName = "mySet";
ReplSetHeartbeatResponse hb;
@@ -1593,6 +1594,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
static_cast<unsigned>(durationCount<Seconds>(uptimeSecs)),
{readConcernMajorityOpTime, readConcernMajorityWallTime},
initialSyncStatus,
+ electionCandidateMetrics,
lastStableRecoveryTimestamp},
&statusBuilder,
&resultStatus);
@@ -1698,6 +1700,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
ASSERT_EQUALS(3, rsStatus["majorityVoteCount"].numberInt());
ASSERT_EQUALS(3, rsStatus["writeMajorityCount"].numberInt());
ASSERT_BSONOBJ_EQ(initialSyncStatus, rsStatus["initialSyncStatus"].Obj());
+ ASSERT_BSONOBJ_EQ(electionCandidateMetrics, rsStatus["electionCandidateMetrics"].Obj());
// Test no lastStableRecoveryTimestamp field.
BSONObjBuilder statusBuilder2;
@@ -1706,7 +1709,8 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
curTime,
static_cast<unsigned>(durationCount<Seconds>(uptimeSecs)),
{readConcernMajorityOpTime, readConcernMajorityWallTime},
- initialSyncStatus},
+ initialSyncStatus,
+ BSONObj()},
&statusBuilder2,
&resultStatus);
ASSERT_OK(resultStatus);
@@ -1714,6 +1718,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
unittest::log() << rsStatus;
ASSERT_EQUALS(setName, rsStatus["set"].String());
ASSERT_FALSE(rsStatus.hasField("lastStableRecoveryTimestamp"));
+ ASSERT_FALSE(rsStatus.hasField("electionCandidateMetrics"));
}
TEST_F(TopoCoordTest, ReplSetGetStatusWriteMajorityDifferentFromMajorityVoteCount) {