summary | refs | log | tree | commit | diff
path: root/src/mongo
diff options
context:
space:
mode:
author: Medha Potluri <medha.potluri@mongodb.com> 2019-07-03 10:39:58 -0400
committer: Medha Potluri <medha.potluri@mongodb.com> 2019-07-26 15:38:46 -0400
commit: dc4db514a1ee737db0553f9535033453502b3ac7 (patch)
tree: 163304fd669cadb87c37358ac6e8dfd4105f9556 /src/mongo
parent: 5090e4efc24b88d28fa83d30457e1d097f2fc273 (diff)
download: mongo-dc4db514a1ee737db0553f9535033453502b3ac7.tar.gz
SERVER-41501 Track the number of elections that require primary catchup in serverStatus
Diffstat (limited to 'src/mongo')
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.cpp | 6
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp | 33
-rw-r--r-- src/mongo/db/repl/replication_metrics.cpp | 10
-rw-r--r-- src/mongo/db/repl/replication_metrics.h | 2
-rw-r--r-- src/mongo/db/repl/replication_metrics.idl | 7
5 files changed, 57 insertions, 1 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 0e33068b3cf..3f56ada6698 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -3097,7 +3097,13 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (_waiter) {
_repl->_opTimeWaiterList.remove_inlock(_waiter.get());
+ } else {
+ // Only increment the 'numCatchUps' election metric the first time we add a waiter, so that
+ // we only increment it once each time a primary has to catch up. If there is already an
+ // existing waiter, then the node is catching up and has already been counted.
+ ReplicationMetrics::get(getGlobalServiceContext()).incrementNumCatchUps();
}
+
auto targetOpTimeCB = [this, targetOpTime]() {
// Double check the target time since stepdown may signal us too.
const auto myLastApplied = _repl->_getMyLastAppliedOpTime_inlock();
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
index 00e6aab8daa..d3ed5c96e6d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp
@@ -2342,6 +2342,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryDoesNotNeedToCatchUp) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was not incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
// Heartbeats set a future target OpTime and we reached that successfully.
@@ -2365,6 +2368,9 @@ TEST_F(PrimaryCatchUpTest, CatchupSucceeds) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, CatchupTimeout) {
@@ -2385,6 +2391,9 @@ TEST_F(PrimaryCatchUpTest, CatchupTimeout) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) {
@@ -2410,6 +2419,9 @@ TEST_F(PrimaryCatchUpTest, CannotSeeAllNodes) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was not incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) {
@@ -2435,6 +2447,9 @@ TEST_F(PrimaryCatchUpTest, HeartbeatTimeout) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was not incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) {
@@ -2458,6 +2473,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownBeforeHeartbeatRefreshing) {
auto opCtx = makeOperationContext();
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_FALSE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was not incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
@@ -2487,6 +2505,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringCatchUp) {
auto opCtx = makeOperationContext();
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_FALSE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
@@ -2535,6 +2556,9 @@ TEST_F(PrimaryCatchUpTest, PrimaryStepsDownDuringDrainMode) {
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT(replCoord->getApplierState() == ApplierState::Stopped);
ASSERT_TRUE(replCoord->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
@@ -2598,6 +2622,9 @@ TEST_F(PrimaryCatchUpTest, FreshestNodeBecomesAvailableLater) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
@@ -2643,6 +2670,9 @@ TEST_F(PrimaryCatchUpTest, InfiniteTimeoutAndAbort) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was incremented.
+ ASSERT_EQ(1, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
TEST_F(PrimaryCatchUpTest, ZeroTimeout) {
@@ -2657,6 +2687,9 @@ TEST_F(PrimaryCatchUpTest, ZeroTimeout) {
signalDrainComplete(opCtx.get());
Lock::GlobalLock lock(opCtx.get(), MODE_IX);
ASSERT_TRUE(getReplCoord()->canAcceptWritesForDatabase(opCtx.get(), "test"));
+
+ // Check that the number of elections requiring primary catchup was not incremented.
+ ASSERT_EQ(0, ReplicationMetrics::get(opCtx.get()).getNumCatchUps_forTesting());
}
} // namespace
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index 8cdc1405827..8a210e83a10 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -128,6 +128,11 @@ void ReplicationMetrics::incrementNumStepDownsCausedByHigherTerm() {
_electionMetrics.getNumStepDownsCausedByHigherTerm() + 1);
}
+void ReplicationMetrics::incrementNumCatchUps() {  // Adds one to the 'numCatchUps' election metric.
+ stdx::lock_guard<stdx::mutex> lk(_mutex);  // Hold _mutex across the read-modify-write below.
+ _electionMetrics.setNumCatchUps(_electionMetrics.getNumCatchUps() + 1);  // Store current value + 1.
+}
+
int ReplicationMetrics::getNumStepUpCmdsCalled_forTesting() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.getStepUpCmd().getCalled();
@@ -183,6 +188,11 @@ int ReplicationMetrics::getNumStepDownsCausedByHigherTerm_forTesting() {
return _electionMetrics.getNumStepDownsCausedByHigherTerm();
}
+int ReplicationMetrics::getNumCatchUps_forTesting() {  // Test accessor for the 'numCatchUps' metric.
+ stdx::lock_guard<stdx::mutex> lk(_mutex);  // Read under _mutex, the same lock the incrementer takes.
+ return _electionMetrics.getNumCatchUps();  // Returned as int by this accessor.
+}
+
BSONObj ReplicationMetrics::getElectionMetricsBSON() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
return _electionMetrics.toBSON();
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index 564be3b08dd..c03c1fa6ca2 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -51,6 +51,7 @@ public:
void incrementNumElectionsCalledForReason(TopologyCoordinator::StartElectionReason reason);
void incrementNumElectionsSuccessfulForReason(TopologyCoordinator::StartElectionReason reason);
void incrementNumStepDownsCausedByHigherTerm();
+ void incrementNumCatchUps();
int getNumStepUpCmdsCalled_forTesting();
int getNumPriorityTakeoversCalled_forTesting();
@@ -63,6 +64,7 @@ public:
int getNumElectionTimeoutsSuccessful_forTesting();
int getNumFreezeTimeoutsSuccessful_forTesting();
int getNumStepDownsCausedByHigherTerm_forTesting();
+ int getNumCatchUps_forTesting();
BSONObj getElectionMetricsBSON();
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index a14042cbcf1..eb93135cb1b 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -76,7 +76,12 @@ structs:
description: "Number of times this node stepped down because it saw a higher term"
type: long
default: 0
-
+ numCatchUps:
+ description: "Number of elections that required the primary to catch up because it
+ was behind"
+ type: long
+ default: 0
+
ElectionCandidateMetrics:
description: "Stores metrics that are specific to the last election in which the node was a
candidate"