author | Samyukta Lanka <samy.lanka@mongodb.com> | 2020-01-13 23:27:43 +0000
committer | evergreen <evergreen@mongodb.com> | 2020-01-13 23:27:43 +0000
commit | 515b5d3510124d307e6db8c85b72c8f680ed37e1 (patch)
tree | 7a5a928c3a70d9b1bb3f0a0a0855c44c26909751 /src/mongo
parent | cc283da7cdac667c1941b40d1fb155dbd15afe20 (diff)
download | mongo-515b5d3510124d307e6db8c85b72c8f680ed37e1.tar.gz
SERVER-42825 Log and track metrics.repl.stateTransition counters after stopped killing user operation
(cherry picked from commit b3b494a72f0e19d7556bee627da7ae9b79e26a03)
SERVER-45497 Add tests that will be fixed by future backport to backports_required_for_multiversion_tests.yml
(cherry picked from commit 5fcedbdd44f19fdbaeb600b470b4166fbb2c1e97)
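
This commit renames the repl.stepDown.* serverStatus counters to repl.stateTransition.* and adds a lastStateTransition field, as the diff below shows. A minimal mongo-shell sketch of reading the new counters (the field names come from the ServerStatusMetricField registrations in this diff; the serverStatus nesting is the usual metrics-tree layout, assumed here rather than shown by the patch):

    // Hypothetical shell session against a replica set member running this patch.
    const st = db.serverStatus().metrics.repl.stateTransition;
    print(st.lastStateTransition);    // "stepUp", "stepDown", or "rollback"
    print(st.userOperationsKilled);   // ops killed by the last ops-killing transition
    print(st.userOperationsRunning);  // ops left running when killing stopped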
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/replication_coordinator.h | 15
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 86
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 38
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 8
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_mock.cpp | 7
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_mock.h | 5
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_noop.cpp | 7
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_noop.h | 5
-rw-r--r-- | src/mongo/db/repl/rollback_impl.cpp | 15
-rw-r--r-- | src/mongo/db/repl/rollback_impl.h | 6
-rw-r--r-- | src/mongo/db/service_entry_point_common.cpp | 31
-rw-r--r-- | src/mongo/embedded/replication_coordinator_embedded.cpp | 7
-rw-r--r-- | src/mongo/embedded/replication_coordinator_embedded.h | 5
13 files changed, 179 insertions, 56 deletions
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 433249a086f..1cdbba25b7a 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -929,6 +929,21 @@ public:
      */
     inline static constexpr StringData newPrimaryMsg = "new primary"_sd;
 
+    /*
+     * Specifies the state transitions that kill user operations. Used for tracking state transition
+     * metrics.
+     */
+    enum class OpsKillingStateTransitionEnum { kStepUp, kStepDown, kRollback };
+
+    /**
+     * Updates metrics around user ops when a state transition that kills user ops and select
+     * internal operations occurs (i.e. step up, step down, or rollback). Also logs the metrics.
+     */
+    virtual void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const = 0;
+
 protected:
     ReplicationCoordinator();
 };
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 40d95badd5c..f10e828fdec 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -102,15 +102,20 @@ MONGO_FAIL_POINT_DEFINE(stepdownHangBeforeRSTLEnqueue);
 // Fail setMaintenanceMode with ErrorCodes::NotSecondary to simulate a concurrent election.
 MONGO_FAIL_POINT_DEFINE(setMaintenanceModeFailsWithNotSecondary);
 
-// Tracks the number of operations killed on step down.
+// Tracks the last state transition performed in this replica set.
+std::string lastStateTransition;
+ServerStatusMetricField<std::string> displayLastStateTransition(
+    "repl.stateTransition.lastStateTransition", &lastStateTransition);
+
+// Tracks the number of operations killed on state transition.
 Counter64 userOpsKilled;
-ServerStatusMetricField<Counter64> displayuserOpsKilled("repl.stepDown.userOperationsKilled",
+ServerStatusMetricField<Counter64> displayUserOpsKilled("repl.stateTransition.userOperationsKilled",
                                                         &userOpsKilled);
 
-// Tracks the number of operations left running on step down.
+// Tracks the number of operations left running on state transition.
 Counter64 userOpsRunning;
-ServerStatusMetricField<Counter64> displayUserOpsRunning("repl.stepDown.userOperationsRunning",
-                                                         &userOpsRunning);
+ServerStatusMetricField<Counter64> displayUserOpsRunning(
+    "repl.stateTransition.userOperationsRunning", &userOpsRunning);
 
 using CallbackArgs = executor::TaskExecutor::CallbackArgs;
 using CallbackFn = executor::TaskExecutor::CallbackFn;
@@ -1024,7 +1029,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
     // internal operations. Although secondaries cannot accept writes, a step up can kill writes
     // that were blocked behind the RSTL lock held by a step down attempt. These writes will be
    // killed with a retryable error code during step up.
-    AutoGetRstlForStepUpStepDown arsu(this, opCtx);
+    AutoGetRstlForStepUpStepDown arsu(
+        this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepUp);
 
     lk.lock();
 
     // Exit drain mode only if we're actually in draining mode, the apply buffer is empty in the
@@ -1052,10 +1058,6 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
         invariant(status);
     }
 
-    // Reset the counters on step up.
-    userOpsKilled.decrement(userOpsKilled.get());
-    userOpsRunning.decrement(userOpsRunning.get());
-
     // Must calculate the commit level again because firstOpTimeOfMyTerm wasn't set when we logged
     // our election in onTransitionToPrimary(), above.
     _updateLastCommittedOpTimeAndWallTime(lk);
@@ -1819,15 +1821,38 @@ void ReplicationCoordinatorImpl::waitForStepDownAttempt_forTest() {
     }
 }
 
-void ReplicationCoordinatorImpl::_updateAndLogStatsOnStepDown(
-    const AutoGetRstlForStepUpStepDown* arsd) const {
-    userOpsRunning.increment(arsd->getUserOpsRunning());
+void ReplicationCoordinatorImpl::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+
+    // Clear the current metrics before setting.
+    userOpsKilled.decrement(userOpsKilled.get());
+    userOpsRunning.decrement(userOpsRunning.get());
+
+    switch (stateTransition) {
+        case ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepUp:
+            lastStateTransition = "stepUp";
+            break;
+        case ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown:
+            lastStateTransition = "stepDown";
+            break;
+        case ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback:
+            lastStateTransition = "rollback";
+            break;
+        default:
+            MONGO_UNREACHABLE;
+    }
+
+    userOpsKilled.increment(numOpsKilled);
+    userOpsRunning.increment(numOpsRunning);
 
     BSONObjBuilder bob;
+    bob.append("lastStateTransition", lastStateTransition);
     bob.appendNumber("userOpsKilled", userOpsKilled.get());
     bob.appendNumber("userOpsRunning", userOpsRunning.get());
 
-    log() << "Stepping down from primary, stats: " << bob.obj();
+    log() << "State transition ops metrics: " << bob.obj();
 }
 
 void ReplicationCoordinatorImpl::_killConflictingOpsOnStepUpAndStepDown(
@@ -1850,19 +1875,25 @@ void ReplicationCoordinatorImpl::_killConflictingOpsOnStepUpAndStepDown(
             if (locker->wasGlobalLockTakenInModeConflictingWithWrites() ||
                 PrepareConflictTracker::get(toKill).isWaitingOnPrepareConflict()) {
                 serviceCtx->killOperation(lk, toKill, reason);
-                userOpsKilled.increment();
+                arsc->incrementUserOpsKilled();
             } else {
-                arsc->incrUserOpsRunningBy();
+                arsc->incrementUserOpsRunning();
             }
         }
     }
 }
 
 ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::AutoGetRstlForStepUpStepDown(
-    ReplicationCoordinatorImpl* repl, OperationContext* opCtx, Date_t deadline)
-    : _replCord(repl), _opCtx(opCtx) {
+    ReplicationCoordinatorImpl* repl,
+    OperationContext* opCtx,
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    Date_t deadline)
+    : _replCord(repl), _opCtx(opCtx), _stateTransition(stateTransition) {
     invariant(_replCord && _opCtx);
 
+    // The state transition should never be rollback within this class.
+    invariant(_stateTransition != ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback);
+
     // Enqueues RSTL in X mode.
     _rstlLock.emplace(_opCtx, MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly());
@@ -1912,6 +1943,8 @@ void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::_killOpThreadFn()
         if (_stopKillingOps.wait_for(
                 lock, Milliseconds(10).toSystemDuration(), [this] { return _killSignaled; })) {
             log() << "Stopped killing user operations";
+            _replCord->updateAndLogStateTransitionMetrics(
+                _stateTransition, getUserOpsKilled(), getUserOpsRunning());
             _killSignaled = false;
             return;
         }
@@ -1932,11 +1965,19 @@ void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::_stopAndWaitForKillOpThread() {
     _killOpThread.reset();
 }
 
+size_t ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::getUserOpsKilled() const {
+    return _userOpsKilled;
+}
+
+void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::incrementUserOpsKilled(size_t val) {
+    _userOpsKilled += val;
+}
+
 size_t ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::getUserOpsRunning() const {
     return _userOpsRunning;
 }
 
-void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::incrUserOpsRunningBy(size_t val) {
+void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::incrementUserOpsRunning(size_t val) {
     _userOpsRunning += val;
 }
@@ -1982,7 +2023,8 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     // fail if it does not acquire the lock immediately. In such a scenario, we use the
     // stepDownUntil deadline instead.
     auto deadline = force ? stepDownUntil : waitUntil;
-    AutoGetRstlForStepUpStepDown arsd(this, opCtx, deadline);
+    AutoGetRstlForStepUpStepDown arsd(
+        this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown, deadline);
 
     stdx::unique_lock<Latch> lk(_mutex);
@@ -2099,7 +2141,6 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     yieldLocksForPreparedTransactions(opCtx);
     lk.lock();
 
-    _updateAndLogStatsOnStepDown(&arsd);
 
     // Clear the node's election candidate metrics since it is no longer primary.
     ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
@@ -2672,7 +2713,7 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
 
         // Primary node won't be electable or removed after the configuration change.
         // So, finish the reconfig under RSTL, so that the step down occurs safely.
-        arsd.emplace(this, opCtx);
+        arsd.emplace(this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown);
 
         lk.lock();
         if (_topCoord->isSteppingDownUnconditionally()) {
@@ -2686,7 +2727,6 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
             yieldLocksForPreparedTransactions(opCtx);
             lk.lock();
 
-            _updateAndLogStatsOnStepDown(&arsd.get());
 
             // Clear the node's election candidate metrics since it is no longer primary.
             ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index c42b47cf73a..4807735df26 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -335,6 +335,11 @@ public:
 
     virtual void attemptToAdvanceStableTimestamp() override;
 
+    virtual void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const override;
+
     // ================== Test support API ===================
 
     /**
@@ -487,9 +492,11 @@ private:
     // operations (user/system) and aborts stashed running transactions.
     class AutoGetRstlForStepUpStepDown {
     public:
-        AutoGetRstlForStepUpStepDown(ReplicationCoordinatorImpl* repl,
-                                     OperationContext* opCtx,
-                                     Date_t deadline = Date_t::max());
+        AutoGetRstlForStepUpStepDown(
+            ReplicationCoordinatorImpl* repl,
+            OperationContext* opCtx,
+            ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+            Date_t deadline = Date_t::max());
 
         // Disallows copying.
         AutoGetRstlForStepUpStepDown(const AutoGetRstlForStepUpStepDown&) = delete;
@@ -506,6 +513,16 @@ private:
         void rstlReacquire();
 
         /*
+         * Returns _userOpsKilled value.
+         */
+        size_t getUserOpsKilled() const;
+
+        /*
+         * Increments _userOpsKilled by val.
+         */
+        void incrementUserOpsKilled(size_t val = 1);
+
+        /*
          * Returns _userOpsRunning value.
         */
         size_t getUserOpsRunning() const;
@@ -513,7 +530,7 @@ private:
         /*
         * Increments _userOpsRunning by val.
         */
-        void incrUserOpsRunningBy(size_t val = 1);
+        void incrementUserOpsRunning(size_t val = 1);
 
         /*
         * Returns the step up/step down opCtx.
@@ -566,7 +583,9 @@ private:
         boost::optional<ReplicationStateTransitionLockGuard> _rstlLock;
         // Thread that will run killOpThreadFn().
         std::unique_ptr<stdx::thread> _killOpThread;
-        // Tracks number of operations left running on step down.
+        // Tracks number of operations killed on step up / step down.
+        size_t _userOpsKilled = 0;
+        // Tracks number of operations left running on step up / step down.
         size_t _userOpsRunning = 0;
         // Protects killSignaled and stopKillingOps cond. variable.
         Mutex _mutex = MONGO_MAKE_LATCH("AutoGetRstlForStepUpStepDown::_mutex");
@@ -574,6 +593,9 @@ private:
         stdx::condition_variable _stopKillingOps;
         // Once this is set to true, the killOpThreadFn method will terminate.
         bool _killSignaled = false;
+        // The state transition that is in progress. Should never be set to rollback within this
+        // class.
+        ReplicationCoordinator::OpsKillingStateTransitionEnum _stateTransition;
     };
 
     // Abstract struct that holds information about clients waiting for replication.
@@ -1092,12 +1114,6 @@ private:
     executor::TaskExecutor::EventHandle _stepDownStart();
 
     /**
-     * Update the "repl.stepDown.userOperationsRunning" counter and log number of operations
-     * killed and left running on step down.
-     */
-    void _updateAndLogStatsOnStepDown(const AutoGetRstlForStepUpStepDown* arsd) const;
-
-    /**
      * kill all conflicting operations that are blocked either on prepare conflict or have taken
     * global lock not in MODE_IS. The conflicting operations can be either user or system
     * operations marked as killable.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 5b2a7730e04..a1afae9f3ae 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -405,7 +405,8 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     // kill all write operations which are no longer safe to run on step down. Also, operations that
     // have taken global lock in S mode and operations blocked on prepare conflict will be killed to
     // avoid 3-way deadlock between read, prepared transaction and step down thread.
-    AutoGetRstlForStepUpStepDown arsd(this, opCtx.get());
+    AutoGetRstlForStepUpStepDown arsd(
+        this, opCtx.get(), ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown);
     stdx::unique_lock<Latch> lk(_mutex);
 
     // This node has already stepped down due to reconfig. So, signal anyone who is waiting on the
@@ -422,7 +423,6 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     yieldLocksForPreparedTransactions(opCtx.get());
     lk.lock();
 
-    _updateAndLogStatsOnStepDown(&arsd);
 
     // Clear the node's election candidate metrics since it is no longer primary.
     ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
@@ -635,7 +635,8 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
 
         // Primary node will be either unelectable or removed after the configuration change.
         // So, finish the reconfig under RSTL, so that the step down occurs safely.
-        arsd.emplace(this, opCtx.get());
+        arsd.emplace(
+            this, opCtx.get(), ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown);
 
         lk.lock();
         if (_topCoord->isSteppingDownUnconditionally()) {
@@ -649,7 +650,6 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
             yieldLocksForPreparedTransactions(opCtx.get());
             lk.lock();
 
-            _updateAndLogStatsOnStepDown(&arsd.get());
 
             // Clear the node's election candidate metrics since it is no longer primary.
             ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index be960f2b5b8..8ce36495bd7 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -558,5 +558,12 @@ void ReplicationCoordinatorMock::attemptToAdvanceStableTimestamp() {
     return;
 }
 
+void ReplicationCoordinatorMock::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+    return;
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index 8ea9a9ddd8e..21f6c7a89db 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -316,6 +316,11 @@ public:
 
     virtual void attemptToAdvanceStableTimestamp() override;
 
+    virtual void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const override;
+
     virtual void setCanAcceptNonLocalWrites(bool canAcceptNonLocalWrites);
 
 private:
diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp
index a506da67996..0478a4a7418 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.cpp
+++ b/src/mongo/db/repl/replication_coordinator_noop.cpp
@@ -466,5 +466,12 @@ void ReplicationCoordinatorNoOp::attemptToAdvanceStableTimestamp() {
     MONGO_UNREACHABLE;
 }
 
+void ReplicationCoordinatorNoOp::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+    MONGO_UNREACHABLE;
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h
index e6b1b3ecd43..55c72793ee6 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.h
+++ b/src/mongo/db/repl/replication_coordinator_noop.h
@@ -255,6 +255,11 @@ public:
 
     void attemptToAdvanceStableTimestamp() final;
 
+    void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const final;
+
 private:
     ServiceContext* const _service;
 };
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index ae022c6e09e..fa82b1aee09 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -273,13 +273,13 @@ bool RollbackImpl::_isInShutdown() const {
     return _inShutdown;
 }
 
-namespace {
-void killAllUserOperations(OperationContext* opCtx) {
+void RollbackImpl::_killAllUserOperations(OperationContext* opCtx) {
     invariant(opCtx);
     ServiceContext* serviceCtx = opCtx->getServiceContext();
     invariant(serviceCtx);
 
     int numOpsKilled = 0;
+    int numOpsRunning = 0;
 
     for (ServiceContext::LockedClientsCursor cursor(serviceCtx); Client* client = cursor.next();) {
         stdx::lock_guard<Client> lk(*client);
@@ -297,12 +297,17 @@ void killAllUserOperations(OperationContext* opCtx) {
         if (toKill && !toKill->isKillPending()) {
             serviceCtx->killOperation(lk, toKill, ErrorCodes::InterruptedDueToReplStateChange);
             numOpsKilled++;
+        } else {
+            numOpsRunning++;
         }
     }
 
-    log() << "Killed {} operation(s) while transitioning to ROLLBACK"_format(numOpsKilled);
+    // Update the metrics for tracking user operations during state transitions.
+    _replicationCoordinator->updateAndLogStateTransitionMetrics(
+        ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback,
+        numOpsKilled,
+        numOpsRunning);
 }
-}  // namespace
 
 Status RollbackImpl::_transitionToRollback(OperationContext* opCtx) {
     invariant(opCtx);
@@ -318,7 +323,7 @@ Status RollbackImpl::_transitionToRollback(OperationContext* opCtx) {
         // Kill all user operations to ensure we can successfully acquire the RSTL. Since the node
         // must be a secondary, this is only killing readers, whose connections will be closed
         // shortly regardless.
-        killAllUserOperations(opCtx);
+        _killAllUserOperations(opCtx);
 
         rstlLock.waitForLockUntil(Date_t::max());
diff --git a/src/mongo/db/repl/rollback_impl.h b/src/mongo/db/repl/rollback_impl.h
index 424b394fa95..517db073dd0 100644
--- a/src/mongo/db/repl/rollback_impl.h
+++ b/src/mongo/db/repl/rollback_impl.h
@@ -348,6 +348,12 @@ private:
         OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) const;
 
     /**
+     * Kills all user operations currently being performed. Since this node is a secondary, these
+     * operations are all reads.
+     */
+    void _killAllUserOperations(OperationContext* opCtx);
+
+    /**
      * Uses the ReplicationCoordinator to transition the current member state to ROLLBACK.
      * If the transition to ROLLBACK fails, this could mean that we have been elected PRIMARY. In
      * this case, we return a NotSecondary error.
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index 8b64e1a6bc8..64b73e6008d 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -99,8 +99,8 @@ namespace mongo {
 MONGO_FAIL_POINT_DEFINE(rsStopGetMore);
 MONGO_FAIL_POINT_DEFINE(respondWithNotPrimaryInCommandDispatch);
 MONGO_FAIL_POINT_DEFINE(skipCheckingForNotMasterInCommandDispatch);
-MONGO_FAIL_POINT_DEFINE(waitAfterReadCommandFinishesExecution);
 MONGO_FAIL_POINT_DEFINE(sleepMillisAfterCommandExecutionBegins);
+MONGO_FAIL_POINT_DEFINE(waitAfterCommandFinishesExecution);
 
 // Tracks the number of times a legacy unacknowledged write failed due to
 // not master error resulted in network disconnection.
@@ -587,18 +587,23 @@ bool runCommandImpl(OperationContext* opCtx,
         }
     }
 
-    // This failpoint should affect both getMores and commands which are read-only and thus don't
-    // support writeConcern.
-    if (!shouldWaitForWriteConcern || command->getLogicalOp() == LogicalOp::opGetMore) {
-        MONGO_FAIL_POINT_BLOCK(waitAfterReadCommandFinishesExecution, options) {
-            const BSONObj& data = options.getData();
-            auto db = data["db"].str();
-            if (db.empty() || request.getDatabase() == db) {
-                CurOpFailpointHelpers::waitWhileFailPointEnabled(
-                    &waitAfterReadCommandFinishesExecution,
-                    opCtx,
-                    "waitAfterReadCommandFinishesExecution");
-            }
+    // This fail point blocks all commands which are running on the specified namespace, or which
+    // are present in the given list of commands. If no namespace or command list are provided, then
+    // the fail point will block all commands.
+    MONGO_FAIL_POINT_BLOCK(waitAfterCommandFinishesExecution, options) {
+        const BSONObj& data = options.getData();
+        auto ns = data["ns"].valueStringDataSafe();
+        auto commands =
+            data.hasField("commands") ? data["commands"].Array() : std::vector<BSONElement>();
+
+        // If 'ns' or 'commands' is not set, block for all the namespaces or commands respectively.
+        if ((ns.empty() || invocation->ns().ns() == ns) &&
+            (commands.empty() ||
+             std::any_of(commands.begin(), commands.end(), [&request](auto& element) {
+                 return element.valueStringDataSafe() == request.getCommandName();
+             }))) {
+            CurOpFailpointHelpers::waitWhileFailPointEnabled(
+                &waitAfterCommandFinishesExecution, opCtx, "waitAfterCommandFinishesExecution");
         }
     }
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index be0946a5058..62a60c2bad6 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -492,5 +492,12 @@ void ReplicationCoordinatorEmbedded::attemptToAdvanceStableTimestamp() {
     UASSERT_NOT_IMPLEMENTED;
 }
 
+void ReplicationCoordinatorEmbedded::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+    UASSERT_NOT_IMPLEMENTED;
+}
+
 }  // namespace embedded
 }  // namespace mongo
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index 8d7788a0f41..8b208e09f4d 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -263,6 +263,11 @@ public:
 
     void attemptToAdvanceStableTimestamp() override;
 
+    void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const override;
+
 private:
     // Back pointer to the ServiceContext that has started the instance.
     ServiceContext* const _service;
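
The reworked waitAfterCommandFinishesExecution fail point above reads optional "ns" and "commands" fields from its data document. A hedged sketch of driving it from the mongo shell via the standard configureFailPoint admin command (the namespace and command names below are illustrative, not taken from the patch):

    // Block only "find" and "insert" commands against test.coll. Omitting "ns"
    // or "commands" widens the match, per the comment in the new code.
    db.adminCommand({
        configureFailPoint: "waitAfterCommandFinishesExecution",
        mode: "alwaysOn",
        data: {ns: "test.coll", commands: ["find", "insert"]}
    });

    // ...exercise the workload, then disable the fail point:
    db.adminCommand({configureFailPoint: "waitAfterCommandFinishesExecution", mode: "off"});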