SERVER-33785 Prevent interrupt in clientcursormon thread

author: Charlie Swanson <charlie.swanson@mongodb.com> 2018-03-12 17:32:59 -0400
committer: Charlie Swanson <charlie.swanson@mongodb.com> 2018-03-14 14:02:00 -0400
commit: b5b4975daf0a327cc2ee2756b6771ca4c9fabfed (patch)
tree: 058ac35a0058e99c2d0f58e689ae4bd1b6c6027f /src
parent: da9c4e4ef0e9b5539fee9265b89e1d28f844c547 (diff)
download: mongo-b5b4975daf0a327cc2ee2756b6771ca4c9fabfed.tar.gz
2 files changed, 26 insertions, 20 deletions
diff --git a/src/mongo/db/cursor_manager.cpp b/src/mongo/db/cursor_manager.cpp
index 5617369698b..08653cfda22 100644
--- a/src/mongo/db/cursor_manager.cpp
+++ b/src/mongo/db/cursor_manager.cpp
@@ -285,18 +285,25 @@ std::size_t GlobalCursorIdCache::timeoutCursors(OperationContext* opCtx, Date_t
     // For each collection, time out its cursors under the collection lock (to prevent the
     // collection from going away during the erase).
     for (const auto& nsTodo : todo) {
-        // Note that we specify 'kViewsPermitted' here, even though we don't expect 'nsTodo' to be a
-        // view. Because we are not holding the mutex anymore, it is possible that the collection we
-        // are trying to access has since been destroyed and a view of the same name has been
-        // created in its place. Without 'kViewsPermitted' here, that would result in a uassert that
-        // would crash the cursor cleaner background thread.
-        AutoGetCollectionForReadCommand ctx(
-            opCtx, nsTodo, AutoGetCollection::ViewMode::kViewsPermitted);
-        if (!ctx.getDb()) {
+        // We need to be careful to not use an AutoGet* helper, since we only need the lock to
+        // protect potential access to the Collection's CursorManager, and those helpers may
+        // do things we don't want here, like check the shard version or throw an exception if this
+        // namespace has since turned into a view. Using Database::getCollection() will simply
+        // return nullptr if the collection has since turned into a view. In this case, the cursors
+        // will already have been cleaned up when the collection was dropped, so there will be none
+        // left to time out.
+        //
+        // Additionally, we need to use the UninterruptibleLockGuard to ensure the lock acquisition
+        // will not throw due to an interrupt. This method can be called from a background thread so
+        // we do not want to throw any exceptions.
+        UninterruptibleLockGuard noInterrupt(opCtx->lockState());
+        AutoGetDb dbLock(opCtx, nsTodo.db(), MODE_IS);
+        Lock::CollectionLock collLock(opCtx->lockState(), nsTodo.ns(), MODE_IS);
+        if (!dbLock.getDb()) {
             continue;
         }
 
-        Collection* const collection = ctx.getCollection();
+        Collection* const collection = dbLock.getDb()->getCollection(opCtx, nsTodo);
         if (!collection) {
             // The 'nsTodo' collection has been dropped since we held _mutex. We can safely skip it.
             continue;
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index e503c2e071d..0ce0b29a459 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -66,11 +66,12 @@ MONGO_FP_DECLARE(maxTimeNeverTimeOut);
 //
 // {configureFailPoint: "checkForInterruptFail",
 //  mode: "alwaysOn",
-//  data: {conn: 17, chance: .01}}
+//  data: {threadName: "threadName", chance: .01}}
 //
-// Both data fields must be specified.  In the above example, all interrupt points on connection 17
-// will generate a kill on the current operation with probability p(.01), including interrupt points
-// of nested operations.  "chance" must be a double between 0 and 1, inclusive.
+// Both data fields must be specified. In the above example, all interrupt points on the thread with
+// name 'threadName' will generate a kill on the current operation with probability p(.01),
+// including interrupt points of nested operations. "chance" must be a double between 0 and 1,
+// inclusive.
 MONGO_FP_DECLARE(checkForInterruptFail);
 
 }  // namespace
@@ -168,17 +169,15 @@ namespace {
 // Helper function for checkForInterrupt fail point.  Decides whether the operation currently
 // being run by the given Client meet the (probabilistic) conditions for interruption as
 // specified in the fail point info.
-bool opShouldFail(const OperationContext* opCtx, const BSONObj& failPointInfo) {
+bool opShouldFail(Client* client, const BSONObj& failPointInfo) {
     // Only target the client with the specified connection number.
-    if (opCtx->getClient()->getConnectionId() != failPointInfo["conn"].safeNumberLong()) {
+    if (client->desc() != failPointInfo["threadName"].valuestrsafe()) {
         return false;
     }
 
     // Return true with (approx) probability p = "chance".  Recall: 0 <= chance <= 1.
-    double next = static_cast<double>(std::abs(opCtx->getClient()->getPrng().nextInt64()));
-    double upperBound =
-        std::numeric_limits<int64_t>::max() * failPointInfo["chance"].numberDouble();
-    if (next > upperBound) {
+    double next = client->getPrng().nextCanonicalDouble();
+    if (next > failPointInfo["chance"].numberDouble()) {
         return false;
     }
     return true;
@@ -200,7 +199,7 @@ Status OperationContext::checkForInterruptNoAssert() {
     }
 
     MONGO_FAIL_POINT_BLOCK(checkForInterruptFail, scopedFailPoint) {
-        if (opShouldFail(this, scopedFailPoint.getData())) {
+        if (opShouldFail(getClient(), scopedFailPoint.getData())) {
             log() << "set pending kill on op " << getOpID() << ", for checkForInterruptFail";
             markKilled();
         }
author	Charlie Swanson <charlie.swanson@mongodb.com>	2018-03-12 17:32:59 -0400
committer	Charlie Swanson <charlie.swanson@mongodb.com>	2018-03-14 14:02:00 -0400
commit	b5b4975daf0a327cc2ee2756b6771ca4c9fabfed (patch)
tree	058ac35a0058e99c2d0f58e689ae4bd1b6c6027f /src
parent	da9c4e4ef0e9b5539fee9265b89e1d28f844c547 (diff)
download	mongo-b5b4975daf0a327cc2ee2756b6771ca4c9fabfed.tar.gz