SERVER-38162 Acquire RSTL on shutdown in mode X

author: Samy Lanka <samy.lanka@mongodb.com> 2018-12-13 16:38:27 -0500
committer: Samy Lanka <samy.lanka@mongodb.com> 2019-01-17 17:38:32 -0500
commit: 5918fda8a354db2e3ecc95ac0c384b412bfe0684 (patch)
tree: 10a698246928b82ceffca68c086f352a59fb13e4 /src/mongo/db
parent: 95ff8eff9c4641240c6158d1b449f1fbabea6a8e (diff)
download: mongo-5918fda8a354db2e3ecc95ac0c384b412bfe0684.tar.gz
4 files changed, 28 insertions, 11 deletions
diff --git a/src/mongo/db/concurrency/replication_state_transition_lock_guard.cpp b/src/mongo/db/concurrency/replication_state_transition_lock_guard.cpp
index 54b6e3696e8..0260275ba24 100644
--- a/src/mongo/db/concurrency/replication_state_transition_lock_guard.cpp
+++ b/src/mongo/db/concurrency/replication_state_transition_lock_guard.cpp
@@ -99,6 +99,5 @@ void ReplicationStateTransitionLockGuard::_unlock() {
     _result = LOCK_INVALID;
 }
 
-
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index 8d07347ca5b..eefd845af57 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -65,6 +65,7 @@
 #include "mongo/db/commands/feature_compatibility_version_gen.h"
 #include "mongo/db/concurrency/d_concurrency.h"
 #include "mongo/db/concurrency/lock_state.h"
+#include "mongo/db/concurrency/replication_state_transition_lock_guard.h"
 #include "mongo/db/concurrency/write_conflict_exception.h"
 #include "mongo/db/db_raii.h"
 #include "mongo/db/dbdirectclient.h"
@@ -910,14 +911,33 @@ void shutdownTask() {
             opCtx = uniqueOpCtx.get();
         }
 
-        // This can wait a long time while we drain the secondary's apply queue, especially if it
-        // is building an index.
+        // This can wait a long time while we drain the secondary's apply queue, especially if
+        // it is building an index.
         repl::ReplicationCoordinator::get(serviceContext)->shutdown(opCtx);
 
         ShardingInitializationMongoD::get(serviceContext)->shutDown(opCtx);
 
-        // Destroy all stashed transaction resources, in order to release locks.
-        killSessionsLocalShutdownAllTransactions(opCtx);
+        // Acquire the RSTL in mode X. First we enqueue the lock request, then kill all operations,
+        // destroy all stashed transaction resources in order to release locks, and finally wait
+        // until the lock request is granted.
+        repl::ReplicationStateTransitionLockGuard rstl(
+            opCtx, repl::ReplicationStateTransitionLockGuard::EnqueueOnly());
+
+        // Kill all operations. After this point, the opCtx will have been marked as killed and will
+        // not be usable other than to kill all transactions and wait for the RSTL directly below.
+        serviceContext->setKillAllOperations();
+
+        {
+            // Make this scope uninterruptible so that we can still abort all transactions even
+            // though the opCtx has been killed. While we don't currently check for an interrupt
+            // before checking out a session, we want to make sure that this completes.
+            UninterruptibleLockGuard noInterrupt(opCtx->lockState());
+
+            // Destroy all stashed transaction resources, in order to release locks.
+            killSessionsLocalShutdownAllTransactions(opCtx);
+
+            rstl.waitForLockUntil(Date_t::max());
+        }
 
         // Interrupts all index builds, leaving the state intact to be recovered when the server
         // restarts. This should be done after replication oplog application finishes, so foreground
@@ -925,8 +945,6 @@ void shutdownTask() {
         IndexBuildsCoordinator::get(serviceContext)->shutdown();
     }
 
-    serviceContext->setKillAllOperations();
-
     ReplicaSetMonitor::shutdown();
 
     if (auto sr = Grid::get(serviceContext)->shardRegistry()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index c739b696577..b7949947cdd 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1763,8 +1763,8 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     ReplicationStateTransitionLockGuard rstlLock(
         opCtx, ReplicationStateTransitionLockGuard::EnqueueOnly());
 
-    // Kill all user operations to help us get the global lock faster, as well as to ensure that
-    // operations that are no longer safe to run (like writes) get killed.
+    // Since we are in stepdown, after enqueueing the RSTL we need to kill all user operations to
+    // ensure that operations that are no longer safe to run (like writes) get killed.
     _killOperationsOnStepDown(opCtx);
 
     // Using 'force' sets the default for the wait time to zero, which means the stepdown will
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index dafe36af09c..fa53cec63ba 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -384,8 +384,8 @@ void ReplicationCoordinatorImpl::_stepDownFinish(
 
     ReplicationStateTransitionLockGuard rstlLock(
         opCtx.get(), ReplicationStateTransitionLockGuard::EnqueueOnly());
-    // Kill all user operations to help us get the global lock faster, as well as to ensure that
-    // operations that are no longer safe to run (like writes) get killed.
+    // Since we are in stepdown, after enqueueing the RSTL we need to kill all user operations to
+    // ensure that operations that are no longer safe to run (like writes) get killed.
     _killOperationsOnStepDown(opCtx.get());
     rstlLock.waitForLockUntil(Date_t::max());
author	Samy Lanka <samy.lanka@mongodb.com>	2018-12-13 16:38:27 -0500
committer	Samy Lanka <samy.lanka@mongodb.com>	2019-01-17 17:38:32 -0500
commit	5918fda8a354db2e3ecc95ac0c384b412bfe0684 (patch)
tree	10a698246928b82ceffca68c086f352a59fb13e4 /src/mongo/db
parent	95ff8eff9c4641240c6158d1b449f1fbabea6a8e (diff)
download	mongo-5918fda8a354db2e3ecc95ac0c384b412bfe0684.tar.gz