diff options
author | Suganthi Mani <suganthi.mani@mongodb.com> | 2019-02-07 12:30:37 -0500 |
---|---|---|
committer | Suganthi Mani <suganthi.mani@mongodb.com> | 2019-02-13 23:49:50 -0500 |
commit | 6089c4c1d8f166b6b61cec980672779b7cedc303 (patch) | |
tree | efcf098e1e798684340731ba294e294a70ec3677 /src/mongo/db/repl | |
parent | 83336cb56b269195110253918d226cbba4377a03 (diff) | |
download | mongo-6089c4c1d8f166b6b61cec980672779b7cedc303.tar.gz |
SERVER-38696 Add additional metrics and logging for new step down sequence.
Diffstat (limited to 'src/mongo/db/repl')
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 45 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 20 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 1 |
3 files changed, 62 insertions, 4 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index f128d9c42e6..b1fdd0ecdad 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -99,6 +99,16 @@ MONGO_FAIL_POINT_DEFINE(stepdownHangBeforePerformingPostMemberStateUpdateActions MONGO_FAIL_POINT_DEFINE(transitionToPrimaryHangBeforeTakingGlobalExclusiveLock); MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp); +// Tracks the number of operations killed on step down. +Counter64 userOpsKilled; +ServerStatusMetricField<Counter64> displayuserOpsKilled("repl.stepDown.userOperationsKilled", + &userOpsKilled); + +// Tracks the number of operations left running on step down. +Counter64 userOpsRunning; +ServerStatusMetricField<Counter64> displayUserOpsRunning("repl.stepDown.userOperationsRunning", + &userOpsRunning); + using CallbackArgs = executor::TaskExecutor::CallbackArgs; using CallbackFn = executor::TaskExecutor::CallbackFn; using CallbackHandle = executor::TaskExecutor::CallbackHandle; @@ -1040,6 +1050,10 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, invariant(status); } + // Reset the counters on step up. + userOpsKilled.decrement(userOpsKilled.get()); + userOpsRunning.decrement(userOpsRunning.get()); + // Must calculate the commit level again because firstOpTimeOfMyTerm wasn't set when we logged // our election in onTransitionToPrimary(), above. _updateLastCommittedOpTime(lk); @@ -1751,8 +1765,18 @@ void ReplicationCoordinatorImpl::waitForStepDownAttempt_forTest() { } } +void ReplicationCoordinatorImpl::_updateAndLogStatsOnStepDown(const KillOpContainer* koc) const { + userOpsRunning.increment(koc->getUserOpsRunning()); + + BSONObjBuilder bob; + bob.appendNumber("userOpsKilled", userOpsKilled.get()); + bob.appendNumber("userOpsRunning", userOpsRunning.get()); + + log() << "Successfully stepped down from primary, stats: " << bob.obj(); +} + void ReplicationCoordinatorImpl::_killUserOperationsOnStepDown( - const OperationContext* stepDownOpCtx) { + const OperationContext* stepDownOpCtx, KillOpContainer* koc) { ServiceContext* serviceCtx = stepDownOpCtx->getServiceContext(); invariant(serviceCtx); @@ -1766,12 +1790,15 @@ void ReplicationCoordinatorImpl::_killUserOperationsOnStepDown( OperationContext* toKill = client->getOperationContext(); // Don't kill the stepdown thread. - if (toKill && toKill->getOpID() != stepDownOpCtx->getOpID()) { + if (toKill && !toKill->isKillPending() && toKill->getOpID() != stepDownOpCtx->getOpID()) { const GlobalLockAcquisitionTracker& globalLockTracker = GlobalLockAcquisitionTracker::get(toKill); if (globalLockTracker.getGlobalWriteLocked() || globalLockTracker.getGlobalSharedLockTaken()) { serviceCtx->killOperation(lk, toKill, ErrorCodes::InterruptedDueToStepDown); + userOpsKilled.increment(); + } else { + koc->incrUserOpsRunningBy(); } } } @@ -1792,7 +1819,10 @@ void ReplicationCoordinatorImpl::KillOpContainer::killOpThreadFn() { OperationContext* opCtx = uniqueOpCtx.get(); while (true) { - _replCord->_killUserOperationsOnStepDown(_stepDownOpCtx); + // Reset the value before killing user operations as we only want to track the number + // of operations that's running after step down. + _userOpsRunning = 0; + _replCord->_killUserOperationsOnStepDown(_stepDownOpCtx, this); // Destroy all stashed transaction resources, in order to release locks. SessionKiller::Matcher matcherAllSessions( @@ -1831,6 +1861,14 @@ void ReplicationCoordinatorImpl::KillOpContainer::stopAndWaitForKillOpThread() { _killOpThread.reset(); } +size_t ReplicationCoordinatorImpl::KillOpContainer::getUserOpsRunning() const { + return _userOpsRunning; +} + +void ReplicationCoordinatorImpl::KillOpContainer::incrUserOpsRunningBy(size_t val) { + _userOpsRunning += val; +} + void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, const bool force, const Milliseconds& waitTime, @@ -1979,6 +2017,7 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, onExitGuard.dismiss(); updateMemberState(); + _updateAndLogStatsOnStepDown(&koc); // Schedule work to (potentially) step back up once the stepdown period has ended. _scheduleWorkAt(stepDownUntil, [=](const executor::TaskExecutor::CallbackArgs& cbData) { diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 5632c829867..883d41555e5 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -491,6 +491,16 @@ private: */ void stopAndWaitForKillOpThread(); + /* + * Returns _userOpsRunning value. + */ + size_t getUserOpsRunning() const; + + /* + * Increments _userOpsRunning by val. + */ + void incrUserOpsRunningBy(size_t val = 1); + private: ReplicationCoordinatorImpl* const _replCord; // not owned. OperationContext* const _stepDownOpCtx; // not owned. @@ -502,6 +512,8 @@ private: stdx::condition_variable _stopKillingOps; // Once this is set to true, the killOpThreadFn method will terminate. bool _killSignaled = false; + // Tracks number of operations left running on step down. + size_t _userOpsRunning = 0; }; // Abstract struct that holds information about clients waiting for replication. @@ -987,9 +999,15 @@ private: executor::TaskExecutor::EventHandle _stepDownStart(); /** + * Update the "repl.stepDown.userOperationsRunning" counter and log number of operations + * killed and left running on step down. + */ + void _updateAndLogStatsOnStepDown(const KillOpContainer* koc) const; + + /** * kill all user operations that have taken a global lock except in IS mode. */ - void _killUserOperationsOnStepDown(const OperationContext* stepDownOpCtx); + void _killUserOperationsOnStepDown(const OperationContext* stepDownOpCtx, KillOpContainer* koc); /** * Completes a step-down of the current node. Must be run with a global diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 6cdbcfda8c1..0d082fe65da 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -414,6 +414,7 @@ void ReplicationCoordinatorImpl::_stepDownFinish( } lk.unlock(); _performPostMemberStateUpdateAction(action); + _updateAndLogStatsOnStepDown(&koc); _replExecutor->signalEvent(finishedEvent); } |