summaryrefslogtreecommitdiff
path: root/src/mongo/db/repl
diff options
context:
space:
mode:
authorSuganthi Mani <suganthi.mani@mongodb.com>2019-02-07 12:30:37 -0500
committerSuganthi Mani <suganthi.mani@mongodb.com>2019-02-13 23:49:50 -0500
commit6089c4c1d8f166b6b61cec980672779b7cedc303 (patch)
treeefcf098e1e798684340731ba294e294a70ec3677 /src/mongo/db/repl
parent83336cb56b269195110253918d226cbba4377a03 (diff)
downloadmongo-6089c4c1d8f166b6b61cec980672779b7cedc303.tar.gz
SERVER-38696 Add additional metrics and logging for new step down sequence.
Diffstat (limited to 'src/mongo/db/repl')
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp45
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.h20
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp1
3 files changed, 62 insertions, 4 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index f128d9c42e6..b1fdd0ecdad 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -99,6 +99,16 @@ MONGO_FAIL_POINT_DEFINE(stepdownHangBeforePerformingPostMemberStateUpdateActions
MONGO_FAIL_POINT_DEFINE(transitionToPrimaryHangBeforeTakingGlobalExclusiveLock);
MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp);
+// Tracks the number of operations killed on step down.
+Counter64 userOpsKilled;
+ServerStatusMetricField<Counter64> displayuserOpsKilled("repl.stepDown.userOperationsKilled",
+ &userOpsKilled);
+
+// Tracks the number of operations left running on step down.
+Counter64 userOpsRunning;
+ServerStatusMetricField<Counter64> displayUserOpsRunning("repl.stepDown.userOperationsRunning",
+ &userOpsRunning);
+
using CallbackArgs = executor::TaskExecutor::CallbackArgs;
using CallbackFn = executor::TaskExecutor::CallbackFn;
using CallbackHandle = executor::TaskExecutor::CallbackHandle;
@@ -1040,6 +1050,10 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
invariant(status);
}
+ // Reset the counters on step up.
+ userOpsKilled.decrement(userOpsKilled.get());
+ userOpsRunning.decrement(userOpsRunning.get());
+
// Must calculate the commit level again because firstOpTimeOfMyTerm wasn't set when we logged
// our election in onTransitionToPrimary(), above.
_updateLastCommittedOpTime(lk);
@@ -1751,8 +1765,18 @@ void ReplicationCoordinatorImpl::waitForStepDownAttempt_forTest() {
}
}
+void ReplicationCoordinatorImpl::_updateAndLogStatsOnStepDown(const KillOpContainer* koc) const {
+ userOpsRunning.increment(koc->getUserOpsRunning());
+
+ BSONObjBuilder bob;
+ bob.appendNumber("userOpsKilled", userOpsKilled.get());
+ bob.appendNumber("userOpsRunning", userOpsRunning.get());
+
+ log() << "Successfully stepped down from primary, stats: " << bob.obj();
+}
+
void ReplicationCoordinatorImpl::_killUserOperationsOnStepDown(
- const OperationContext* stepDownOpCtx) {
+ const OperationContext* stepDownOpCtx, KillOpContainer* koc) {
ServiceContext* serviceCtx = stepDownOpCtx->getServiceContext();
invariant(serviceCtx);
@@ -1766,12 +1790,15 @@ void ReplicationCoordinatorImpl::_killUserOperationsOnStepDown(
OperationContext* toKill = client->getOperationContext();
// Don't kill the stepdown thread.
- if (toKill && toKill->getOpID() != stepDownOpCtx->getOpID()) {
+ if (toKill && !toKill->isKillPending() && toKill->getOpID() != stepDownOpCtx->getOpID()) {
const GlobalLockAcquisitionTracker& globalLockTracker =
GlobalLockAcquisitionTracker::get(toKill);
if (globalLockTracker.getGlobalWriteLocked() ||
globalLockTracker.getGlobalSharedLockTaken()) {
serviceCtx->killOperation(lk, toKill, ErrorCodes::InterruptedDueToStepDown);
+ userOpsKilled.increment();
+ } else {
+ koc->incrUserOpsRunningBy();
}
}
}
@@ -1792,7 +1819,10 @@ void ReplicationCoordinatorImpl::KillOpContainer::killOpThreadFn() {
OperationContext* opCtx = uniqueOpCtx.get();
while (true) {
- _replCord->_killUserOperationsOnStepDown(_stepDownOpCtx);
+ // Reset the value before killing user operations as we only want to track the number
+ // of operations that are running after step down.
+ _userOpsRunning = 0;
+ _replCord->_killUserOperationsOnStepDown(_stepDownOpCtx, this);
// Destroy all stashed transaction resources, in order to release locks.
SessionKiller::Matcher matcherAllSessions(
@@ -1831,6 +1861,14 @@ void ReplicationCoordinatorImpl::KillOpContainer::stopAndWaitForKillOpThread() {
_killOpThread.reset();
}
+size_t ReplicationCoordinatorImpl::KillOpContainer::getUserOpsRunning() const {
+ return _userOpsRunning;
+}
+
+void ReplicationCoordinatorImpl::KillOpContainer::incrUserOpsRunningBy(size_t val) {
+ _userOpsRunning += val;
+}
+
void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
const bool force,
const Milliseconds& waitTime,
@@ -1979,6 +2017,7 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
onExitGuard.dismiss();
updateMemberState();
+ _updateAndLogStatsOnStepDown(&koc);
// Schedule work to (potentially) step back up once the stepdown period has ended.
_scheduleWorkAt(stepDownUntil, [=](const executor::TaskExecutor::CallbackArgs& cbData) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 5632c829867..883d41555e5 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -491,6 +491,16 @@ private:
*/
void stopAndWaitForKillOpThread();
+ /*
+ * Returns _userOpsRunning value.
+ */
+ size_t getUserOpsRunning() const;
+
+ /*
+ * Increments _userOpsRunning by val.
+ */
+ void incrUserOpsRunningBy(size_t val = 1);
+
private:
ReplicationCoordinatorImpl* const _replCord; // not owned.
OperationContext* const _stepDownOpCtx; // not owned.
@@ -502,6 +512,8 @@ private:
stdx::condition_variable _stopKillingOps;
// Once this is set to true, the killOpThreadFn method will terminate.
bool _killSignaled = false;
+ // Tracks number of operations left running on step down.
+ size_t _userOpsRunning = 0;
};
// Abstract struct that holds information about clients waiting for replication.
@@ -987,9 +999,15 @@ private:
executor::TaskExecutor::EventHandle _stepDownStart();
/**
+ * Update the "repl.stepDown.userOperationsRunning" counter and log the number of operations
+ * killed and left running on step down.
+ */
+ void _updateAndLogStatsOnStepDown(const KillOpContainer* koc) const;
+
+ /**
* kill all user operations that have taken a global lock except in IS mode.
*/
- void _killUserOperationsOnStepDown(const OperationContext* stepDownOpCtx);
+ void _killUserOperationsOnStepDown(const OperationContext* stepDownOpCtx, KillOpContainer* koc);
/**
* Completes a step-down of the current node. Must be run with a global
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 6cdbcfda8c1..0d082fe65da 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -414,6 +414,7 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
}
lk.unlock();
_performPostMemberStateUpdateAction(action);
+ _updateAndLogStatsOnStepDown(&koc);
_replExecutor->signalEvent(finishedEvent);
}