diff options
author | Medha Potluri <medha.potluri@mongodb.com> | 2019-07-03 10:39:58 -0400 |
---|---|---|
committer | Medha Potluri <medha.potluri@mongodb.com> | 2019-07-26 15:38:46 -0400 |
commit | dc4db514a1ee737db0553f9535033453502b3ac7 (patch) | |
tree | 163304fd669cadb87c37358ac6e8dfd4105f9556 /src/mongo | |
parent | 5090e4efc24b88d28fa83d30457e1d097f2fc273 (diff) | |
download | mongo-dc4db514a1ee737db0553f9535033453502b3ac7.tar.gz |
SERVER-41501 Track the number of elections that require primary catchup in serverStatus
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp | 33 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_metrics.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_metrics.h | 2 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_metrics.idl | 7 |
5 files changed, 57 insertions, 1 deletion
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 0e33068b3cf..3f56ada6698 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -3097,7 +3097,13 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() { if (_waiter) { _repl->_opTimeWaiterList.remove_inlock(_waiter.get()); + } else { + // Only increment the 'numCatchUps' election metric the first time we add a waiter, so that + // we only increment it once each time a primary has to catch up. If there is already an + // existing waiter, then the node is catching up and has already been counted. + ReplicationMetrics::get(getGlobalServiceContext()).incrementNumCatchUps(); } + auto targetOpTimeCB = [this, targetOpTime]() { // Double check the target time since stepdown may signal us too. const auto myLastApplied = _repl->_getMyLastAppliedOpTime_inlock(); diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp index 00e6aab8daa..d3ed5c96e6d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp @@ -2342,6 +2342,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } // Heartbeats set a future target OpTime and we reached that successfully. 
@@ -2365,6 +2368,9 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, CatchupTimeout) { @@ -2385,6 +2391,9 @@ TEST_F(PrimaryCatchUpTest, CatchupTimeout) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) { @@ -2410,6 +2419,9 @@ TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) { @@ -2435,6 +2447,9 @@ TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. 
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) { @@ -2458,6 +2473,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) { auto opCtx = makeOperationContext(); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_FALSE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) { @@ -2487,6 +2505,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) { auto opCtx = makeOperationContext(); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_FALSE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) { @@ -2535,6 +2556,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) { Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT(replCoord->getApplierState() == ApplierState::Stopped); ASSERT_TRUE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) { @@ -2598,6 +2622,9 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. 
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) { @@ -2643,6 +2670,9 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was incremented. + ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } TEST_F(PrimaryCatchUpTest, ZeroTimeout) { @@ -2657,6 +2687,9 @@ TEST_F(PrimaryCatchUpTest, ZeroTimeout) { signalDrainComplete(opCtx.get()); Lock::GlobalLock lock(opCtx.get(), MODE_IX); ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test")); + + // Check that the number of elections requiring primary catchup was not incremented. + ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting()); } } // namespace diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp index 8cdc1405827..8a210e83a10 100644 --- a/src/mongo/db/repl/replication_metrics.cpp +++ b/src/mongo/db/repl/replication_metrics.cpp @@ -128,6 +128,11 @@ void ReplicationMetrics::incrementNumStepDownsCausedByHigherTerm() { _electionMetrics.getNumStepDownsCausedByHigherTerm() + 1); } +void ReplicationMetrics::incrementNumCatchUps() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1); +} + int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.getStepUpCmd().getCalled(); @@ -183,6 +188,11 @@ int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() { return _electionMetrics.getNumStepDownsCausedByHigherTerm(); } +int ReplicationMetrics::getNumCatchUps_forTesting() { + stdx::lock_guard<stdx::mutex> lk(_mutex); + return 
_electionMetrics.getNumCatchUps(); +} + BSONObj ReplicationMetrics::getElectionMetricsBSON() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionMetrics.toBSON(); diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h index 564be3b08dd..c03c1fa6ca2 100644 --- a/src/mongo/db/repl/replication_metrics.h +++ b/src/mongo/db/repl/replication_metrics.h @@ -51,6 +51,7 @@ public: void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason); void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason); void incrementNumStepDownsCausedByHigherTerm(); + void incrementNumCatchUps(); int getNumStepUpCmdsCalled_forTesting(); int getNumPriorityTakeoversCalled_forTesting(); @@ -63,6 +64,7 @@ public: int getNumElectionTimeoutsSuccessful_forTesting(); int getNumFreezeTimeoutsSuccessful_forTesting(); int getNumStepDownsCausedByHigherTerm_forTesting(); + int getNumCatchUps_forTesting(); BSONObj getElectionMetricsBSON(); diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl index a14042cbcf1..eb93135cb1b 100644 --- a/src/mongo/db/repl/replication_metrics.idl +++ b/src/mongo/db/repl/replication_metrics.idl @@ -76,7 +76,12 @@ structs: description: "Number of times this node stepped down because it saw a higher term" type: long default: 0 - + numCatchUps: + description: "Number of elections that required the primary to catch up because it + was behind" + type: long + default: 0 + ElectionCandidateMetrics: description: "Stores metrics that are specific to the last election in which the node was a candidate" |