-rw-r--r-- | jstests/replsets/catchup.js                                      | 22
-rw-r--r-- | jstests/replsets/stepdown.js                                     |  9
-rw-r--r-- | jstests/replsets/unconditional_step_down.js                      | 14
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp               | 13
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp      |  7
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp | 57
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp     |  8
-rw-r--r-- | src/mongo/db/repl/replication_metrics.cpp                        | 30
-rw-r--r-- | src/mongo/db/repl/replication_metrics.h                          | 15
-rw-r--r-- | src/mongo/db/repl/replication_metrics.idl                        | 13
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp                       |  8
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.h                         |  1
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp               |  7
13 files changed, 199 insertions(+), 5 deletions(-)
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 6657e6b9076..d6bd38af46e 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -118,6 +118,15 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
                               newNewPrimaryStatus.electionMetrics,
                               'numCatchUpsAlreadyCaughtUp');
 
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have
+// a 'targetCatchupOpTime' field if the target opTime for catchup is not set.
+let res = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1}));
+assert(res.electionCandidateMetrics,
+       () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
+assert(!res.electionCandidateMetrics.targetCatchupOpTime,
+       () => "Response should not have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
+           tojson(res.electionCandidateMetrics));
+
 jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
 initialNewPrimaryStatus =
     assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
@@ -140,6 +149,19 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
                               newNewPrimaryStatus.electionMetrics,
                               'numCatchUpsSucceeded');
 
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has a
+// 'targetCatchupOpTime' field once heartbeats have updated the target opTime for catchup, and that
+// it has the correct value.
+res = assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetGetStatus: 1}));
+assert(res.electionCandidateMetrics,
+       () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
+assert(res.electionCandidateMetrics.targetCatchupOpTime,
+       () => "Response should have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
+           tojson(res.electionCandidateMetrics));
+assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.ts,
+          stepUpResults.latestOpOnOldPrimary.ts);
+assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.t, stepUpResults.latestOpOnOldPrimary.t);
+
 // Wait for all secondaries to catch up
 rst.awaitReplication();
 // Check the latest op on old primary is preserved on the new one.
diff --git a/jstests/replsets/stepdown.js b/jstests/replsets/stepdown.js
index 934d9b9a2f4..6abdd335e92 100644
--- a/jstests/replsets/stepdown.js
+++ b/jstests/replsets/stepdown.js
@@ -64,6 +64,9 @@ try {
         assert.writeOK(master.getDB("foo").bar.insert({x: i}));
     }
 
+    let res = assert.commandWorked(master.adminCommand({replSetGetStatus: 1}));
+    assert(res.electionCandidateMetrics,
+           () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
     let intitialServerStatus = assert.commandWorked(master.adminCommand({serverStatus: 1}));
 
     jsTestLog('Do stepdown of primary ' + master + ' that should not work');
@@ -156,6 +159,12 @@ try {
     assert.eq(r2.ismaster, false);
     assert.eq(r2.secondary, true);
 
+    // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has been
+    // cleared, since the node is no longer primary.
+    res = assert.commandWorked(master.adminCommand({replSetGetStatus: 1}));
+    assert(!res.electionCandidateMetrics,
+           () => "Response should not have an 'electionCandidateMetrics' field: " + tojson(res));
+
     // This section checks that the metrics are incremented accurately when the command fails due to
     // an error while stepping down. This is one reason the replSetStepDown command could fail once
     // we call stepDown in the replication coordinator, but success in this case gives us confidence
diff --git a/jstests/replsets/unconditional_step_down.js b/jstests/replsets/unconditional_step_down.js
index c9f95bcb1ac..4d5e37821db 100644
--- a/jstests/replsets/unconditional_step_down.js
+++ b/jstests/replsets/unconditional_step_down.js
@@ -106,6 +106,10 @@ function runStepDownTest({testMsg, stepDownFn, toRemovedState}) {
     jsTestLog("Wait for write cmd to reach the fail point");
     waitForCurOpByFailPoint(primaryDB, collNss, writeFailPoint);
 
+    let res = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
+    assert(res.electionCandidateMetrics,
+           () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res));
+
     jsTestLog("Trigger step down");
     var oldConfig = stepDownFn();
 
@@ -119,6 +123,16 @@ function runStepDownTest({testMsg, stepDownFn, toRemovedState}) {
                   (toRemovedState) ? ReplSetTest.State.REMOVED : ReplSetTest.State.SECONDARY);
 
     assert.commandWorked(primary.adminCommand({configureFailPoint: writeFailPoint, mode: "off"}));
+
+    // Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has been
+    // cleared, since the node is no longer primary.
+    if (!toRemovedState) {
+        res = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
+        assert(
+            !res.electionCandidateMetrics,
+            () => "Response should not have an 'electionCandidateMetrics' field: " + tojson(res));
+    }
+
     // Get the new primary.
     refreshConnection();
 }
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index f425290749b..45e11716d92 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2102,6 +2102,10 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
 
     lk.lock();
     _updateAndLogStatsOnStepDown(&arsd);
+
+    // Clear the node's election candidate metrics since it is no longer primary.
+    ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
+
     _topCoord->finishUnconditionalStepDown();
 
     onExitGuard.dismiss();
@@ -2365,6 +2369,9 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus(
         }
     }
 
+    BSONObj electionCandidateMetrics =
+        ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON();
+
     stdx::lock_guard<stdx::mutex> lk(_mutex);
     Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
     _topCoord->prepareStatusResponse(
@@ -2373,6 +2380,7 @@ Status ReplicationCoordinatorImpl::processReplSetGetStatus(
             static_cast<unsigned>(time(nullptr) - serverGlobalParams.started),
             _getCurrentCommittedSnapshotOpTimeAndWallTime_inlock(),
             initialSyncProgress,
+            electionCandidateMetrics,
             _storage->getLastStableRecoveryTimestamp(_service),
             _externalState->tooStale()},
         response,
@@ -2678,6 +2686,9 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
 
         lk.lock();
         _updateAndLogStatsOnStepDown(&arsd.get());
+
+        // Clear the node's election candidate metrics since it is no longer primary.
+        ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
     } else {
         // Release the rstl lock as the node might have stepped down due to
         // other unconditional step down code paths like learning new term via heartbeat &
@@ -3095,6 +3106,8 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
         return;
     }
 
+    ReplicationMetrics::get(getGlobalServiceContext()).setTargetCatchupOpTime(targetOpTime.get());
+
     log() << "Heartbeats updated catchup target optime to " << *targetOpTime;
     log() << "Latest known optime per replica set member:";
     auto opTimesPerMember = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestartPerMember();
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index 2b7d7bd62f9..ea12a516ba1 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -64,6 +64,10 @@ public:
         if (_replCoord->_electionFinishedEvent.isValid()) {
             _replCoord->_replExecutor->signalEvent(_replCoord->_electionFinishedEvent);
         }
+
+        // Clear the node's election candidate metrics if it loses either the dry-run or actual
+        // election, since it will not become primary.
+        ReplicationMetrics::get(getGlobalServiceContext()).clearElectionCandidateMetrics();
     }
 
     void dismiss() {
@@ -141,6 +145,9 @@ void ReplicationCoordinatorImpl::_startElectSelfV1_inlock(
     long long term = _topCoord->getTerm();
     int primaryIndex = -1;
 
+    Date_t now = _replExecutor->now();
+    ReplicationMetrics::get(getServiceContext()).setElectionCandidateMetrics(now);
+
     if (reason == TopologyCoordinator::StartElectionReason::kStepUpRequestSkipDryRun) {
         long long newTerm = term + 1;
         log() << "skipping dry run and running for election in term " << newTerm;
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 3f0f3334d83..4c7afc1130f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -48,6 +48,8 @@
 #include "mongo/util/fail_point_service.h"
 #include "mongo/util/log.h"
 
+#include <boost/optional/optional_io.hpp>
+
 namespace mongo {
 namespace repl {
 namespace {
@@ -344,6 +346,10 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun)
 
     simulateEnoughHeartbeatsForAllNodesUp();
 
+    // Check that the node's election candidate metrics are unset before it becomes primary.
+    ASSERT_BSONOBJ_EQ(
+        BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
+
     auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest();
     ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen);
     log() << "Election timeout scheduled at " << electionTimeoutWhen << " (simulator time)";
@@ -368,6 +374,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun)
                                                        << false << "reason"
                                                        << "don't like him much")));
             voteRequests++;
+            // Check that the node's election candidate metrics are set once it has called an
+            // election.
+            ASSERT_BSONOBJ_NE(
+                BSONObj(),
+                ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
         } else {
             net->blackHole(noi);
         }
@@ -377,6 +388,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenInsufficientVotesAreReceivedDuringDryRun)
     stopCapturingLogMessages();
     ASSERT_EQUALS(
         1, countLogLinesContaining("not running for primary, we received insufficient votes"));
+
+    // Check that the node's election candidate metrics have been cleared, since it lost the dry-run
+    // election and will not become primary.
+    ASSERT_BSONOBJ_EQ(
+        BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
 }
 
 TEST_F(ReplCoordTest, ElectionFailsWhenDryRunResponseContainsANewerTerm) {
@@ -667,9 +683,17 @@ TEST_F(ReplCoordTest, ElectionFailsWhenVoteRequestResponseContainsANewerTerm) {
     replCoordSetMyLastDurableOpTime(time1, Date_t() + Seconds(time1.getSecs()));
     ASSERT_OK(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
 
+    // Check that the node's election candidate metrics are unset before it becomes primary.
+    ASSERT_BSONOBJ_EQ(
+        BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
+
     simulateEnoughHeartbeatsForAllNodesUp();
     simulateSuccessfulDryRun();
 
+    // Check that the node's election candidate metrics are set once it has called an election.
+    ASSERT_BSONOBJ_NE(
+        BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
+
     NetworkInterfaceMock* net = getNet();
     net->enterNetwork();
     while (net->hasReadyRequests()) {
@@ -694,6 +718,11 @@ TEST_F(ReplCoordTest, ElectionFailsWhenVoteRequestResponseContainsANewerTerm) {
     stopCapturingLogMessages();
     ASSERT_EQUALS(1,
                   countLogLinesContaining("not becoming primary, we have been superseded already"));
+
+    // Check that the node's election candidate metrics have been cleared, since it lost the actual
+    // election and will not become primary.
+    ASSERT_BSONOBJ_EQ(
+        BSONObj(), ReplicationMetrics::get(getServiceContext()).getElectionCandidateMetricsBSON());
 }
 
 TEST_F(ReplCoordTest, ElectionFailsWhenTermChangesDuringDryRun) {
@@ -2228,6 +2257,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) {
     ASSERT_EQ(0,
               ReplicationMetrics::get(opCtx.get())
                   .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+
+    // Check that the targetCatchupOpTime metric was not set.
+    ASSERT_EQUALS(boost::none,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
 }
 
 // Heartbeats set a future target OpTime and we reached that successfully.
@@ -2237,11 +2270,23 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) {
     OpTime time1(Timestamp(100, 1), 0);
     OpTime time2(Timestamp(100, 2), 0);
     ReplSetConfig config = setUp3NodeReplSetAndRunForElection(time1);
+
+    // Check that the targetCatchupOpTime metric is unset before the target opTime for catchup is
+    // set.
+    ASSERT_EQUALS(boost::none,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
+
     processHeartbeatRequests([this, time2](const NetworkOpIter noi) {
         auto net = getNet();
         // The old primary accepted one more op and all nodes caught up after voting for me.
         net->scheduleResponse(noi, net->now(), makeHeartbeatResponse(time2));
     });
+
+    // Check that the targetCatchupOpTime metric was set correctly when heartbeats updated the
+    // target opTime for catchup.
+    ASSERT_EQUALS(time2,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
+
     ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
     advanceMyLastAppliedOpTime(time2, Date_t() + Seconds(time2.getSecs()));
     ASSERT(getReplCoord()->getApplierState() == ApplierState::Draining);
@@ -2431,6 +2476,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
         // Other nodes are ahead of me.
        getNet()->scheduleResponse(noi, getNet()->now(), makeHeartbeatResponse(time2));
     });
+
+    ASSERT_EQUALS(time2,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
+
     ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
     TopologyCoordinator::UpdateTermResult updateTermResult;
     auto evh = getReplCoord()->updateTerm_forTest(2, &updateTermResult);
@@ -2461,6 +2510,10 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
     ASSERT_EQ(0,
               ReplicationMetrics::get(opCtx.get())
                   .getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting());
+
+    // Check that the targetCatchupOpTime metric was cleared when the node stepped down.
+    ASSERT_EQUALS(boost::none,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
 }
 
 TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
@@ -2566,6 +2619,8 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
     ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
     stopCapturingLogMessages();
     ASSERT_EQ(1, countLogLinesContaining("Heartbeats updated catchup target optime"));
+    ASSERT_EQUALS(time3,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
 
     // 3) Advancing its applied optime to time 2 isn't enough.
     advanceMyLastAppliedOpTime(time2, Date_t() + Seconds(time2.getSecs()));
@@ -2586,6 +2641,8 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
     ASSERT(getReplCoord()->getApplierState() == ApplierState::Running);
     stopCapturingLogMessages();
     ASSERT_EQ(1, countLogLinesContaining("Heartbeats updated catchup target optime"));
+    ASSERT_EQUALS(time4,
+                  ReplicationMetrics::get(getServiceContext()).getTargetCatchupOpTime_forTesting());
 
     // 5) Advancing to time 3 isn't enough now.
     advanceMyLastAppliedOpTime(time3, Date_t() + Seconds(time3.getSecs()));
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index e3c3d60eca6..cb85f948095 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -49,6 +49,7 @@
 #include "mongo/db/repl/repl_set_heartbeat_args_v1.h"
 #include "mongo/db/repl/repl_set_heartbeat_response.h"
 #include "mongo/db/repl/replication_coordinator_impl.h"
+#include "mongo/db/repl/replication_metrics.h"
 #include "mongo/db/repl/replication_process.h"
 #include "mongo/db/repl/topology_coordinator.h"
 #include "mongo/db/repl/vote_requester.h"
@@ -407,6 +408,10 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
 
     lk.lock();
     _updateAndLogStatsOnStepDown(&arsd);
+
+    // Clear the node's election candidate metrics since it is no longer primary.
+    ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
+
     _topCoord->finishUnconditionalStepDown();
 
     const auto action = _updateMemberStateFromTopologyCoordinator(lk, opCtx.get());
@@ -629,6 +634,9 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
 
         lk.lock();
         _updateAndLogStatsOnStepDown(&arsd.get());
+
+        // Clear the node's election candidate metrics since it is no longer primary.
+        ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
     } else {
         // Release the rstl lock as the node might have stepped down due to
         // other unconditional step down code paths like learning new term via heartbeat &
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index 7a56c228e23..af4fab849b5 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -262,11 +262,41 @@ int ReplicationMetrics::getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_fo
     return _electionMetrics.getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd();
 }
 
+void ReplicationMetrics::setElectionCandidateMetrics(Date_t lastElectionDate) {
+    stdx::lock_guard<stdx::mutex> lk(_mutex);
+    _electionCandidateMetrics.setLastElectionDate(lastElectionDate);
+    _nodeIsCandidateOrPrimary = true;
+}
+
+void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) {
+    stdx::lock_guard<stdx::mutex> lk(_mutex);
+    _electionCandidateMetrics.setTargetCatchupOpTime(opTime);
+}
+
+boost::optional<OpTime> ReplicationMetrics::getTargetCatchupOpTime_forTesting() {
+    stdx::lock_guard<stdx::mutex> lk(_mutex);
+    return _electionCandidateMetrics.getTargetCatchupOpTime();
+}
+
 BSONObj ReplicationMetrics::getElectionMetricsBSON() {
     stdx::lock_guard<stdx::mutex> lk(_mutex);
     return _electionMetrics.toBSON();
 }
 
+BSONObj ReplicationMetrics::getElectionCandidateMetricsBSON() {
+    stdx::lock_guard<stdx::mutex> lk(_mutex);
+    if (_nodeIsCandidateOrPrimary) {
+        return _electionCandidateMetrics.toBSON();
+    }
+    return BSONObj();
+}
+
+void ReplicationMetrics::clearElectionCandidateMetrics() {
+    stdx::lock_guard<stdx::mutex> lk(_mutex);
+    _electionCandidateMetrics.setTargetCatchupOpTime(boost::none);
+    _nodeIsCandidateOrPrimary = false;
+}
+
 class ReplicationMetrics::ElectionMetricsSSS : public ServerStatusSection {
 public:
     ElectionMetricsSSS() : ServerStatusSection("electionMetrics") {}
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index de2c9876f68..790036be571 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -48,6 +48,7 @@ public:
     ReplicationMetrics();
     ~ReplicationMetrics();
 
+    // Election metrics
     void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason);
     void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason);
     void incrementNumStepDownsCausedByHigherTerm();
@@ -75,7 +76,19 @@ public:
     int getNumCatchUpsFailedWithNewTerm_forTesting();
     int getNumCatchUpsFailedWithReplSetAbortPrimaryCatchUpCmd_forTesting();
 
+    // Election candidate metrics
+
+    // All the election candidate metrics that should be set when a node calls an election are set
+    // in this one function, so that the 'electionCandidateMetrics' section of replSetStatus shows a
+    // consistent state.
+    void setElectionCandidateMetrics(Date_t lastElectionDate);
+    void setTargetCatchupOpTime(OpTime opTime);
+
+    boost::optional<OpTime> getTargetCatchupOpTime_forTesting();
+
     BSONObj getElectionMetricsBSON();
+    BSONObj getElectionCandidateMetricsBSON();
+    void clearElectionCandidateMetrics();
 
 private:
     class ElectionMetricsSSS;
@@ -84,6 +97,8 @@ private:
     ElectionMetrics _electionMetrics;
     ElectionCandidateMetrics _electionCandidateMetrics;
     ElectionParticipantMetrics _electionParticipantMetrics;
+
+    bool _nodeIsCandidateOrPrimary = false;
 };
 
 }  // namespace repl
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index d5ee27d7bb2..5821858d870 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -34,9 +34,11 @@ global:
     cpp_namespace: "mongo::repl"
     cpp_includes:
         - "mongo/db/repl/election_reason_counter_parser.h"
+        - "mongo/db/repl/optime.h"
 
 imports:
     - "mongo/idl/basic_types.idl"
+    - "mongo/db/repl/replication_types.idl"
 
 types:
     ElectionReasonCounter:
@@ -120,9 +122,14 @@ structs:
                       candidate"
         strict: true
        fields:
-            priorityAtElection:
-                description: "The node's priority at the time of the election"
-                type: double
+            lastElectionDate:
+                description: "Time the node called the dry run election, or the actual election if
+                              it skipped dry run"
+                type: date
+            targetCatchupOpTime:
+                description: "The node's target opTime for catchup"
+                type: optime
+                optional: true
 
     ElectionParticipantMetrics:
         description: "Stores metrics that are specific to the last election in which the node voted"
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 1aa98b65c8a..c79248d7af6 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -1417,7 +1417,8 @@ const MemberConfig* TopologyCoordinator::_currentPrimaryMember() const {
 std::string TopologyCoordinator::_getReplSetStatusString() {
     // Construct a ReplSetStatusArgs using default parameters. Missing parameters will not be
     // included in the status string.
-    ReplSetStatusArgs rsStatusArgs{Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), boost::none};
+    ReplSetStatusArgs rsStatusArgs{
+        Date_t::now(), 0U, OpTimeAndWallTime(), BSONObj(), BSONObj(), boost::none};
     BSONObjBuilder builder;
     Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
     prepareStatusResponse(rsStatusArgs, &builder, &result);
@@ -1439,6 +1440,7 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu
     const OpTime lastOpDurable = getMyLastDurableOpTime();
     const Date_t lastOpDurableWall = getMyLastDurableOpTimeAndWallTime().wallTime;
     const BSONObj& initialSyncStatus = rsStatusArgs.initialSyncStatus;
+    const BSONObj& electionCandidateMetrics = rsStatusArgs.electionCandidateMetrics;
     const boost::optional<Timestamp>& lastStableRecoveryTimestamp =
         rsStatusArgs.lastStableRecoveryTimestamp;
 
@@ -1639,6 +1641,10 @@ void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatu
         response->append("initialSyncStatus", initialSyncStatus);
     }
 
+    if (!electionCandidateMetrics.isEmpty()) {
+        response->append("electionCandidateMetrics", electionCandidateMetrics);
+    }
+
     response->append("members", membersOut);
     *result = Status::OK();
 }
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index d53a581e82b..d81e9f52154 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -301,6 +301,7 @@ public:
         const unsigned selfUptime;
         const OpTimeAndWallTime readConcernMajorityOpTime;
         const BSONObj initialSyncStatus;
+        const BSONObj electionCandidateMetrics;
 
         // boost::none if the storage engine does not support recovery to a timestamp.
         // Timestamp::min() if a stable recovery timestamp is yet to be taken.
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index e0cc0edbe3b..8060e7f9488 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -1538,6 +1538,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
     Timestamp lastStableRecoveryTimestamp(2, 2);
     Timestamp lastStableCheckpointTimestampDeprecated(2, 2);
     BSONObj initialSyncStatus = BSON("failedInitialSyncAttempts" << 1);
+    BSONObj electionCandidateMetrics = BSON("DummyElectionMetrics" << 1);
     std::string setName = "mySet";
 
     ReplSetHeartbeatResponse hb;
@@ -1593,6 +1594,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
                               static_cast<unsigned>(durationCount<Seconds>(uptimeSecs)),
                               {readConcernMajorityOpTime, readConcernMajorityWallTime},
                               initialSyncStatus,
+                              electionCandidateMetrics,
                               lastStableRecoveryTimestamp},
                              &statusBuilder,
                              &resultStatus);
@@ -1698,6 +1700,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
     ASSERT_EQUALS(3, rsStatus["majorityVoteCount"].numberInt());
     ASSERT_EQUALS(3, rsStatus["writeMajorityCount"].numberInt());
     ASSERT_BSONOBJ_EQ(initialSyncStatus, rsStatus["initialSyncStatus"].Obj());
+    ASSERT_BSONOBJ_EQ(electionCandidateMetrics, rsStatus["electionCandidateMetrics"].Obj());
 
     // Test no lastStableRecoveryTimestamp field.
     BSONObjBuilder statusBuilder2;
@@ -1706,7 +1709,8 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
                               curTime,
                               static_cast<unsigned>(durationCount<Seconds>(uptimeSecs)),
                               {readConcernMajorityOpTime, readConcernMajorityWallTime},
-                              initialSyncStatus},
+                              initialSyncStatus,
+                              BSONObj()},
                              &statusBuilder2,
                              &resultStatus);
     ASSERT_OK(resultStatus);
@@ -1714,6 +1718,7 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) {
     unittest::log() << rsStatus;
     ASSERT_EQUALS(setName, rsStatus["set"].String());
     ASSERT_FALSE(rsStatus.hasField("lastStableRecoveryTimestamp"));
+    ASSERT_FALSE(rsStatus.hasField("electionCandidateMetrics"));
 }
 
 TEST_F(TopoCoordTest, ReplSetGetStatusWriteMajorityDifferentFromMajorityVoteCount) {
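
For context, a minimal shell-style sketch (not part of the change) of how the new section added by this diff surfaces in replSetGetStatus. Here 'conn' is an assumed connection to a replica set member; the field names come from replication_metrics.idl above.

// Sketch: reading the 'electionCandidateMetrics' section introduced by this change.
// 'conn' is a hypothetical Mongo connection; the section is present only while the
// node is a candidate or primary (see ReplicationMetrics::getElectionCandidateMetricsBSON).
const res = conn.adminCommand({replSetGetStatus: 1});
assert.commandWorked(res);
if (res.electionCandidateMetrics) {
    // Always set when the node calls an election.
    print("lastElectionDate: " + res.electionCandidateMetrics.lastElectionDate);
    // Optional; only set once heartbeats have established a catchup target optime.
    if (res.electionCandidateMetrics.targetCatchupOpTime) {
        printjson(res.electionCandidateMetrics.targetCatchupOpTime);
    }
}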