author | Samyukta Lanka <samy.lanka@mongodb.com> | 2020-01-13 23:27:43 +0000
committer | evergreen <evergreen@mongodb.com> | 2020-01-13 23:27:43 +0000
commit | 515b5d3510124d307e6db8c85b72c8f680ed37e1 (patch)
tree | 7a5a928c3a70d9b1bb3f0a0a0855c44c26909751 /src/mongo
parent | cc283da7cdac667c1941b40d1fb155dbd15afe20 (diff)
download | mongo-515b5d3510124d307e6db8c85b72c8f680ed37e1.tar.gz
SERVER-42825 Log and track metrics.repl.stateTransition counters after stopped killing user operation
(cherry picked from commit b3b494a72f0e19d7556bee627da7ae9b79e26a03)
SERVER-45497 Add tests that will be fixed by future backport to backports_required_for_multiversion_tests.yml
(cherry picked from commit 5fcedbdd44f19fdbaeb600b470b4166fbb2c1e97)
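
This commit renames the repl.stepDown.* serverStatus counters to repl.stateTransition.* and adds a lastStateTransition field, as the diff below shows. A minimal mongo-shell sketch of reading the new counters (the field names come from the ServerStatusMetricField registrations in this diff; the serverStatus nesting is the usual metrics-tree layout, assumed here rather than shown by the patch):

    // Hypothetical shell session against a replica set member running this patch.
    const st = db.serverStatus().metrics.repl.stateTransition;
    print(st.lastStateTransition);    // "stepUp", "stepDown", or "rollback"
    print(st.userOperationsKilled);   // ops killed by the last ops-killing transition
    print(st.userOperationsRunning);  // ops left running when killing stopped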
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/replication_coordinator.h | 15
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 86
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 38
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 8
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_mock.cpp | 7
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_mock.h | 5
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_noop.cpp | 7
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_noop.h | 5
-rw-r--r-- | src/mongo/db/repl/rollback_impl.cpp | 15
-rw-r--r-- | src/mongo/db/repl/rollback_impl.h | 6
-rw-r--r-- | src/mongo/db/service_entry_point_common.cpp | 31
-rw-r--r-- | src/mongo/embedded/replication_coordinator_embedded.cpp | 7
-rw-r--r-- | src/mongo/embedded/replication_coordinator_embedded.h | 5
13 files changed, 179 insertions, 56 deletions
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 433249a086f..1cdbba25b7a 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -929,6 +929,21 @@ public:
      */
     inline static constexpr StringData newPrimaryMsg = "new primary"_sd;
 
+    /*
+     * Specifies the state transitions that kill user operations. Used for tracking state transition
+     * metrics.
+     */
+    enum class OpsKillingStateTransitionEnum { kStepUp, kStepDown, kRollback };
+
+    /**
+     * Updates metrics around user ops when a state transition that kills user ops and select
+     * internal operations occurs (i.e. step up, step down, or rollback). Also logs the metrics.
+     */
+    virtual void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const = 0;
+
 protected:
     ReplicationCoordinator();
 };
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 40d95badd5c..f10e828fdec 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -102,15 +102,20 @@ MONGO_FAIL_POINT_DEFINE(stepdownHangBeforeRSTLEnqueue);
 // Fail setMaintenanceMode with ErrorCodes::NotSecondary to simulate a concurrent election.
 MONGO_FAIL_POINT_DEFINE(setMaintenanceModeFailsWithNotSecondary);
 
-// Tracks the number of operations killed on step down.
+// Tracks the last state transition performed in this replica set.
+std::string lastStateTransition;
+ServerStatusMetricField<std::string> displayLastStateTransition(
+    "repl.stateTransition.lastStateTransition", &lastStateTransition);
+
+// Tracks the number of operations killed on state transition.
 Counter64 userOpsKilled;
-ServerStatusMetricField<Counter64> displayuserOpsKilled("repl.stepDown.userOperationsKilled",
+ServerStatusMetricField<Counter64> displayUserOpsKilled("repl.stateTransition.userOperationsKilled",
                                                         &userOpsKilled);
 
-// Tracks the number of operations left running on step down.
+// Tracks the number of operations left running on state transition.
 Counter64 userOpsRunning;
-ServerStatusMetricField<Counter64> displayUserOpsRunning("repl.stepDown.userOperationsRunning",
-                                                         &userOpsRunning);
+ServerStatusMetricField<Counter64> displayUserOpsRunning(
+    "repl.stateTransition.userOperationsRunning", &userOpsRunning);
 
 using CallbackArgs = executor::TaskExecutor::CallbackArgs;
 using CallbackFn = executor::TaskExecutor::CallbackFn;
@@ -1024,7 +1029,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
     // internal operations. Although secondaries cannot accept writes, a step up can kill writes
     // that were blocked behind the RSTL lock held by a step down attempt. These writes will be
    // killed with a retryable error code during step up.
-    AutoGetRstlForStepUpStepDown arsu(this, opCtx);
+    AutoGetRstlForStepUpStepDown arsu(
+        this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepUp);
 
     lk.lock();
 
     // Exit drain mode only if we're actually in draining mode, the apply buffer is empty in the
@@ -1052,10 +1058,6 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
         invariant(status);
     }
 
-    // Reset the counters on step up.
-    userOpsKilled.decrement(userOpsKilled.get());
-    userOpsRunning.decrement(userOpsRunning.get());
-
     // Must calculate the commit level again because firstOpTimeOfMyTerm wasn't set when we logged
     // our election in onTransitionToPrimary(), above.
     _updateLastCommittedOpTimeAndWallTime(lk);
@@ -1819,15 +1821,38 @@ void ReplicationCoordinatorImpl::waitForStepDownAttempt_forTest() {
     }
 }
 
-void ReplicationCoordinatorImpl::_updateAndLogStatsOnStepDown(
-    const AutoGetRstlForStepUpStepDown* arsd) const {
-    userOpsRunning.increment(arsd->getUserOpsRunning());
+void ReplicationCoordinatorImpl::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+
+    // Clear the current metrics before setting.
+    userOpsKilled.decrement(userOpsKilled.get());
+    userOpsRunning.decrement(userOpsRunning.get());
+
+    switch (stateTransition) {
+        case ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepUp:
+            lastStateTransition = "stepUp";
+            break;
+        case ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown:
+            lastStateTransition = "stepDown";
+            break;
+        case ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback:
+            lastStateTransition = "rollback";
+            break;
+        default:
+            MONGO_UNREACHABLE;
+    }
+
+    userOpsKilled.increment(numOpsKilled);
+    userOpsRunning.increment(numOpsRunning);
 
     BSONObjBuilder bob;
+    bob.append("lastStateTransition", lastStateTransition);
     bob.appendNumber("userOpsKilled", userOpsKilled.get());
     bob.appendNumber("userOpsRunning", userOpsRunning.get());
 
-    log() << "Stepping down from primary, stats: " << bob.obj();
+    log() << "State transition ops metrics: " << bob.obj();
 }
 
 void ReplicationCoordinatorImpl::_killConflictingOpsOnStepUpAndStepDown(
@@ -1850,19 +1875,25 @@ void ReplicationCoordinatorImpl::_killConflictingOpsOnStepUpAndStepDown(
             if (locker->wasGlobalLockTakenInModeConflictingWithWrites() ||
                 PrepareConflictTracker::get(toKill).isWaitingOnPrepareConflict()) {
                 serviceCtx->killOperation(lk, toKill, reason);
-                userOpsKilled.increment();
+                arsc->incrementUserOpsKilled();
             } else {
-                arsc->incrUserOpsRunningBy();
+                arsc->incrementUserOpsRunning();
             }
         }
     }
 }
 
 ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::AutoGetRstlForStepUpStepDown(
-    ReplicationCoordinatorImpl* repl, OperationContext* opCtx, Date_t deadline)
-    : _replCord(repl), _opCtx(opCtx) {
+    ReplicationCoordinatorImpl* repl,
+    OperationContext* opCtx,
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    Date_t deadline)
+    : _replCord(repl), _opCtx(opCtx), _stateTransition(stateTransition) {
     invariant(_replCord && _opCtx);
 
+    // The state transition should never be rollback within this class.
+    invariant(_stateTransition != ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback);
+
     // Enqueues RSTL in X mode.
     _rstlLock.emplace(_opCtx, MODE_X, ReplicationStateTransitionLockGuard::EnqueueOnly());
@@ -1912,6 +1943,8 @@ void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::_killOpThreadFn()
         if (_stopKillingOps.wait_for(
                 lock, Milliseconds(10).toSystemDuration(), [this] { return _killSignaled; })) {
             log() << "Stopped killing user operations";
+            _replCord->updateAndLogStateTransitionMetrics(
+                _stateTransition, getUserOpsKilled(), getUserOpsRunning());
             _killSignaled = false;
             return;
         }
@@ -1932,11 +1965,19 @@ void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::_stopAndWaitForKillOpThread() {
     _killOpThread.reset();
 }
 
+size_t ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::getUserOpsKilled() const {
+    return _userOpsKilled;
+}
+
+void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::incrementUserOpsKilled(size_t val) {
+    _userOpsKilled += val;
+}
+
 size_t ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::getUserOpsRunning() const {
     return _userOpsRunning;
 }
 
-void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::incrUserOpsRunningBy(size_t val) {
+void ReplicationCoordinatorImpl::AutoGetRstlForStepUpStepDown::incrementUserOpsRunning(size_t val) {
     _userOpsRunning += val;
 }
@@ -1982,7 +2023,8 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     // fail if it does not acquire the lock immediately. In such a scenario, we use the
     // stepDownUntil deadline instead.
     auto deadline = force ? stepDownUntil : waitUntil;
-    AutoGetRstlForStepUpStepDown arsd(this, opCtx, deadline);
+    AutoGetRstlForStepUpStepDown arsd(
+        this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown, deadline);
 
     stdx::unique_lock<Latch> lk(_mutex);
@@ -2099,7 +2141,6 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     yieldLocksForPreparedTransactions(opCtx);
     lk.lock();
 
-    _updateAndLogStatsOnStepDown(&arsd);
 
     // Clear the node's election candidate metrics since it is no longer primary.
     ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
@@ -2672,7 +2713,7 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
 
         // Primary node won't be electable or removed after the configuration change.
         // So, finish the reconfig under RSTL, so that the step down occurs safely.
-        arsd.emplace(this, opCtx);
+        arsd.emplace(this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown);
 
         lk.lock();
         if (_topCoord->isSteppingDownUnconditionally()) {
@@ -2686,7 +2727,6 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
             yieldLocksForPreparedTransactions(opCtx);
             lk.lock();
 
-            _updateAndLogStatsOnStepDown(&arsd.get());
 
             // Clear the node's election candidate metrics since it is no longer primary.
             ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index c42b47cf73a..4807735df26 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -335,6 +335,11 @@ public:
 
     virtual void attemptToAdvanceStableTimestamp() override;
 
+    virtual void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const override;
+
     // ================== Test support API ===================
 
     /**
@@ -487,9 +492,11 @@ private:
     // operations (user/system) and aborts stashed running transactions.
     class AutoGetRstlForStepUpStepDown {
     public:
-        AutoGetRstlForStepUpStepDown(ReplicationCoordinatorImpl* repl,
-                                     OperationContext* opCtx,
-                                     Date_t deadline = Date_t::max());
+        AutoGetRstlForStepUpStepDown(
+            ReplicationCoordinatorImpl* repl,
+            OperationContext* opCtx,
+            ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+            Date_t deadline = Date_t::max());
 
         // Disallows copying.
         AutoGetRstlForStepUpStepDown(const AutoGetRstlForStepUpStepDown&) = delete;
@@ -506,6 +513,16 @@ private:
         void rstlReacquire();
 
         /*
+         * Returns _userOpsKilled value.
+         */
+        size_t getUserOpsKilled() const;
+
+        /*
+         * Increments _userOpsKilled by val.
+         */
+        void incrementUserOpsKilled(size_t val = 1);
+
+        /*
          * Returns _userOpsRunning value.
         */
         size_t getUserOpsRunning() const;
@@ -513,7 +530,7 @@ private:
         /*
         * Increments _userOpsRunning by val.
         */
-        void incrUserOpsRunningBy(size_t val = 1);
+        void incrementUserOpsRunning(size_t val = 1);
 
         /*
         * Returns the step up/step down opCtx.
@@ -566,7 +583,9 @@ private:
         boost::optional<ReplicationStateTransitionLockGuard> _rstlLock;
         // Thread that will run killOpThreadFn().
         std::unique_ptr<stdx::thread> _killOpThread;
-        // Tracks number of operations left running on step down.
+        // Tracks number of operations killed on step up / step down.
+        size_t _userOpsKilled = 0;
+        // Tracks number of operations left running on step up / step down.
         size_t _userOpsRunning = 0;
         // Protects killSignaled and stopKillingOps cond. variable.
         Mutex _mutex = MONGO_MAKE_LATCH("AutoGetRstlForStepUpStepDown::_mutex");
@@ -574,6 +593,9 @@ private:
         stdx::condition_variable _stopKillingOps;
         // Once this is set to true, the killOpThreadFn method will terminate.
         bool _killSignaled = false;
+        // The state transition that is in progress. Should never be set to rollback within this
+        // class.
+        ReplicationCoordinator::OpsKillingStateTransitionEnum _stateTransition;
     };
 
     // Abstract struct that holds information about clients waiting for replication.
@@ -1092,12 +1114,6 @@ private:
     executor::TaskExecutor::EventHandle _stepDownStart();
 
     /**
-     * Update the "repl.stepDown.userOperationsRunning" counter and log number of operations
-     * killed and left running on step down.
-     */
-    void _updateAndLogStatsOnStepDown(const AutoGetRstlForStepUpStepDown* arsd) const;
-
-    /**
      * kill all conflicting operations that are blocked either on prepare conflict or have taken
     * global lock not in MODE_IS. The conflicting operations can be either user or system
     * operations marked as killable.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 5b2a7730e04..a1afae9f3ae 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -405,7 +405,8 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     // kill all write operations which are no longer safe to run on step down. Also, operations that
     // have taken global lock in S mode and operations blocked on prepare conflict will be killed to
     // avoid 3-way deadlock between read, prepared transaction and step down thread.
-    AutoGetRstlForStepUpStepDown arsd(this, opCtx.get());
+    AutoGetRstlForStepUpStepDown arsd(
+        this, opCtx.get(), ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown);
     stdx::unique_lock<Latch> lk(_mutex);
 
     // This node has already stepped down due to reconfig. So, signal anyone who is waiting on the
@@ -422,7 +423,6 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
     yieldLocksForPreparedTransactions(opCtx.get());
     lk.lock();
 
-    _updateAndLogStatsOnStepDown(&arsd);
 
     // Clear the node's election candidate metrics since it is no longer primary.
     ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
@@ -635,7 +635,8 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
 
         // Primary node will be either unelectable or removed after the configuration change.
         // So, finish the reconfig under RSTL, so that the step down occurs safely.
-        arsd.emplace(this, opCtx.get());
+        arsd.emplace(
+            this, opCtx.get(), ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown);
 
         lk.lock();
         if (_topCoord->isSteppingDownUnconditionally()) {
@@ -649,7 +650,6 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
             yieldLocksForPreparedTransactions(opCtx.get());
             lk.lock();
 
-            _updateAndLogStatsOnStepDown(&arsd.get());
 
             // Clear the node's election candidate metrics since it is no longer primary.
             ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics();
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index be960f2b5b8..8ce36495bd7 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -558,5 +558,12 @@ void ReplicationCoordinatorMock::attemptToAdvanceStableTimestamp() {
     return;
 }
 
+void ReplicationCoordinatorMock::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+    return;
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index 8ea9a9ddd8e..21f6c7a89db 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -316,6 +316,11 @@ public:
 
     virtual void attemptToAdvanceStableTimestamp() override;
 
+    virtual void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const override;
+
     virtual void setCanAcceptNonLocalWrites(bool canAcceptNonLocalWrites);
 
 private:
diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp
index a506da67996..0478a4a7418 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.cpp
+++ b/src/mongo/db/repl/replication_coordinator_noop.cpp
@@ -466,5 +466,12 @@ void ReplicationCoordinatorNoOp::attemptToAdvanceStableTimestamp() {
     MONGO_UNREACHABLE;
 }
 
+void ReplicationCoordinatorNoOp::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+    MONGO_UNREACHABLE;
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h
index e6b1b3ecd43..55c72793ee6 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.h
+++ b/src/mongo/db/repl/replication_coordinator_noop.h
@@ -255,6 +255,11 @@ public:
 
     void attemptToAdvanceStableTimestamp() final;
 
+    void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const final;
+
 private:
     ServiceContext* const _service;
 };
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index ae022c6e09e..fa82b1aee09 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -273,13 +273,13 @@ bool RollbackImpl::_isInShutdown() const {
     return _inShutdown;
 }
 
-namespace {
-void killAllUserOperations(OperationContext* opCtx) {
+void RollbackImpl::_killAllUserOperations(OperationContext* opCtx) {
     invariant(opCtx);
     ServiceContext* serviceCtx = opCtx->getServiceContext();
     invariant(serviceCtx);
 
     int numOpsKilled = 0;
+    int numOpsRunning = 0;
 
     for (ServiceContext::LockedClientsCursor cursor(serviceCtx); Client* client = cursor.next();) {
         stdx::lock_guard<Client> lk(*client);
@@ -297,12 +297,17 @@ void killAllUserOperations(OperationContext* opCtx) {
         if (toKill && !toKill->isKillPending()) {
             serviceCtx->killOperation(lk, toKill, ErrorCodes::InterruptedDueToReplStateChange);
             numOpsKilled++;
+        } else {
+            numOpsRunning++;
         }
     }
 
-    log() << "Killed {} operation(s) while transitioning to ROLLBACK"_format(numOpsKilled);
+    // Update the metrics for tracking user operations during state transitions.
+    _replicationCoordinator->updateAndLogStateTransitionMetrics(
+        ReplicationCoordinator::OpsKillingStateTransitionEnum::kRollback,
+        numOpsKilled,
+        numOpsRunning);
 }
-}  // namespace
 
 Status RollbackImpl::_transitionToRollback(OperationContext* opCtx) {
     invariant(opCtx);
@@ -318,7 +323,7 @@ Status RollbackImpl::_transitionToRollback(OperationContext* opCtx) {
         // Kill all user operations to ensure we can successfully acquire the RSTL. Since the node
         // must be a secondary, this is only killing readers, whose connections will be closed
         // shortly regardless.
-        killAllUserOperations(opCtx);
+        _killAllUserOperations(opCtx);
 
         rstlLock.waitForLockUntil(Date_t::max());
diff --git a/src/mongo/db/repl/rollback_impl.h b/src/mongo/db/repl/rollback_impl.h
index 424b394fa95..517db073dd0 100644
--- a/src/mongo/db/repl/rollback_impl.h
+++ b/src/mongo/db/repl/rollback_impl.h
@@ -348,6 +348,12 @@ private:
         OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) const;
 
     /**
+     * Kills all user operations currently being performed. Since this node is a secondary, these
+     * operations are all reads.
+     */
+    void _killAllUserOperations(OperationContext* opCtx);
+
+    /**
      * Uses the ReplicationCoordinator to transition the current member state to ROLLBACK.
      * If the transition to ROLLBACK fails, this could mean that we have been elected PRIMARY. In
      * this case, we return a NotSecondary error.
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index 8b64e1a6bc8..64b73e6008d 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -99,8 +99,8 @@ namespace mongo {
 MONGO_FAIL_POINT_DEFINE(rsStopGetMore);
 MONGO_FAIL_POINT_DEFINE(respondWithNotPrimaryInCommandDispatch);
 MONGO_FAIL_POINT_DEFINE(skipCheckingForNotMasterInCommandDispatch);
-MONGO_FAIL_POINT_DEFINE(waitAfterReadCommandFinishesExecution);
 MONGO_FAIL_POINT_DEFINE(sleepMillisAfterCommandExecutionBegins);
+MONGO_FAIL_POINT_DEFINE(waitAfterCommandFinishesExecution);
 
 // Tracks the number of times a legacy unacknowledged write failed due to
 // not master error resulted in network disconnection.
@@ -587,18 +587,23 @@ bool runCommandImpl(OperationContext* opCtx,
         }
     }
 
-    // This failpoint should affect both getMores and commands which are read-only and thus don't
-    // support writeConcern.
-    if (!shouldWaitForWriteConcern || command->getLogicalOp() == LogicalOp::opGetMore) {
-        MONGO_FAIL_POINT_BLOCK(waitAfterReadCommandFinishesExecution, options) {
-            const BSONObj& data = options.getData();
-            auto db = data["db"].str();
-            if (db.empty() || request.getDatabase() == db) {
-                CurOpFailpointHelpers::waitWhileFailPointEnabled(
-                    &waitAfterReadCommandFinishesExecution,
-                    opCtx,
-                    "waitAfterReadCommandFinishesExecution");
-            }
+    // This fail point blocks all commands which are running on the specified namespace, or which
+    // are present in the given list of commands. If no namespace or command list are provided, then
+    // the fail point will block all commands.
+    MONGO_FAIL_POINT_BLOCK(waitAfterCommandFinishesExecution, options) {
+        const BSONObj& data = options.getData();
+        auto ns = data["ns"].valueStringDataSafe();
+        auto commands =
+            data.hasField("commands") ? data["commands"].Array() : std::vector<BSONElement>();
+
+        // If 'ns' or 'commands' is not set, block for all the namespaces or commands respectively.
+        if ((ns.empty() || invocation->ns().ns() == ns) &&
+            (commands.empty() ||
+             std::any_of(commands.begin(), commands.end(), [&request](auto& element) {
+                 return element.valueStringDataSafe() == request.getCommandName();
+             }))) {
+            CurOpFailpointHelpers::waitWhileFailPointEnabled(
+                &waitAfterCommandFinishesExecution, opCtx, "waitAfterCommandFinishesExecution");
         }
     }
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index be0946a5058..62a60c2bad6 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -492,5 +492,12 @@ void ReplicationCoordinatorEmbedded::attemptToAdvanceStableTimestamp() {
     UASSERT_NOT_IMPLEMENTED;
 }
 
+void ReplicationCoordinatorEmbedded::updateAndLogStateTransitionMetrics(
+    const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+    const size_t numOpsKilled,
+    const size_t numOpsRunning) const {
+    UASSERT_NOT_IMPLEMENTED;
+}
+
 }  // namespace embedded
 }  // namespace mongo
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index 8d7788a0f41..8b208e09f4d 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -263,6 +263,11 @@ public:
 
     void attemptToAdvanceStableTimestamp() override;
 
+    void updateAndLogStateTransitionMetrics(
+        const ReplicationCoordinator::OpsKillingStateTransitionEnum stateTransition,
+        const size_t numOpsKilled,
+        const size_t numOpsRunning) const override;
+
 private:
     // Back pointer to the ServiceContext that has started the instance.
     ServiceContext* const _service;
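
The reworked waitAfterCommandFinishesExecution fail point above reads optional "ns" and "commands" fields from its data document. A hedged sketch of driving it from the mongo shell via the standard configureFailPoint admin command (the namespace and command names below are illustrative, not taken from the patch):

    // Block only "find" and "insert" commands against test.coll. Omitting "ns"
    // or "commands" widens the match, per the comment in the new code.
    db.adminCommand({
        configureFailPoint: "waitAfterCommandFinishesExecution",
        mode: "alwaysOn",
        data: {ns: "test.coll", commands: ["find", "insert"]}
    });

    // ...exercise the workload, then disable the fail point:
    db.adminCommand({configureFailPoint: "waitAfterCommandFinishesExecution", mode: "off"});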