diff options
author | Charlie Swanson <charlie.swanson@mongodb.com> | 2018-03-12 17:32:59 -0400 |
---|---|---|
committer | Charlie Swanson <charlie.swanson@mongodb.com> | 2018-03-14 14:02:00 -0400 |
commit | b5b4975daf0a327cc2ee2756b6771ca4c9fabfed (patch) | |
tree | 058ac35a0058e99c2d0f58e689ae4bd1b6c6027f /src | |
parent | da9c4e4ef0e9b5539fee9265b89e1d28f844c547 (diff) | |
download | mongo-b5b4975daf0a327cc2ee2756b6771ca4c9fabfed.tar.gz |
SERVER-33785 Prevent interrupt in clientcursormon thread
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/cursor_manager.cpp | 25 | ||||
-rw-r--r-- | src/mongo/db/operation_context.cpp | 21 |
2 files changed, 26 insertions, 20 deletions
diff --git a/src/mongo/db/cursor_manager.cpp b/src/mongo/db/cursor_manager.cpp index 5617369698b..08653cfda22 100644 --- a/src/mongo/db/cursor_manager.cpp +++ b/src/mongo/db/cursor_manager.cpp @@ -285,18 +285,25 @@ std::size_t GlobalCursorIdCache::timeoutCursors(OperationContext* opCtx, Date_t // For each collection, time out its cursors under the collection lock (to prevent the // collection from going away during the erase). for (const auto& nsTodo : todo) { - // Note that we specify 'kViewsPermitted' here, even though we don't expect 'nsTodo' to be a - // view. Because we are not holding the mutex anymore, it is possible that the collection we - // are trying to access has since been destroyed and a view of the same name has been - // created in its place. Without 'kViewsPermitted' here, that would result in a uassert that - // would crash the cursor cleaner background thread. - AutoGetCollectionForReadCommand ctx( - opCtx, nsTodo, AutoGetCollection::ViewMode::kViewsPermitted); - if (!ctx.getDb()) { + // We need to be careful to not use an AutoGet* helper, since we only need the lock to + // protect potential access to the Collection's CursorManager, and those helpers may + // do things we don't want here, like check the shard version or throw an exception if this + // namespace has since turned into a view. Using Database::getCollection() will simply + // return nullptr if the collection has since turned into a view. In this case, the cursors + // will already have been cleaned up when the collection was dropped, so there will be none + // left to time out. + // + // Additionally, we need to use the UninterruptibleLockGuard to ensure the lock acquisition + // will not throw due to an interrupt. This method can be called from a background thread so + // we do not want to throw any exceptions. + UninterruptibleLockGuard noInterrupt(opCtx->lockState()); + AutoGetDb dbLock(opCtx, nsTodo.db(), MODE_IS); + Lock::CollectionLock collLock(opCtx->lockState(), nsTodo.ns(), MODE_IS); + if (!dbLock.getDb()) { continue; } - Collection* const collection = ctx.getCollection(); + Collection* const collection = dbLock.getDb()->getCollection(opCtx, nsTodo); if (!collection) { // The 'nsTodo' collection has been dropped since we held _mutex. We can safely skip it. continue; diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp index e503c2e071d..0ce0b29a459 100644 --- a/src/mongo/db/operation_context.cpp +++ b/src/mongo/db/operation_context.cpp @@ -66,11 +66,12 @@ MONGO_FP_DECLARE(maxTimeNeverTimeOut); // // {configureFailPoint: "checkForInterruptFail", // mode: "alwaysOn", -// data: {conn: 17, chance: .01}} +// data: {threadName: "threadName", chance: .01}} // -// Both data fields must be specified. In the above example, all interrupt points on connection 17 -// will generate a kill on the current operation with probability p(.01), including interrupt points -// of nested operations. "chance" must be a double between 0 and 1, inclusive. +// Both data fields must be specified. In the above example, all interrupt points on the thread with +// name 'threadName' will generate a kill on the current operation with probability p(.01), +// including interrupt points of nested operations. "chance" must be a double between 0 and 1, +// inclusive. MONGO_FP_DECLARE(checkForInterruptFail); } // namespace @@ -168,17 +169,15 @@ namespace { // Helper function for checkForInterrupt fail point. Decides whether the operation currently // being run by the given Client meet the (probabilistic) conditions for interruption as // specified in the fail point info. -bool opShouldFail(const OperationContext* opCtx, const BSONObj& failPointInfo) { +bool opShouldFail(Client* client, const BSONObj& failPointInfo) { // Only target the client with the specified connection number. - if (opCtx->getClient()->getConnectionId() != failPointInfo["conn"].safeNumberLong()) { + if (client->desc() != failPointInfo["threadName"].valuestrsafe()) { return false; } // Return true with (approx) probability p = "chance". Recall: 0 <= chance <= 1. - double next = static_cast<double>(std::abs(opCtx->getClient()->getPrng().nextInt64())); - double upperBound = - std::numeric_limits<int64_t>::max() * failPointInfo["chance"].numberDouble(); - if (next > upperBound) { + double next = client->getPrng().nextCanonicalDouble(); + if (next > failPointInfo["chance"].numberDouble()) { return false; } return true; @@ -200,7 +199,7 @@ Status OperationContext::checkForInterruptNoAssert() { } MONGO_FAIL_POINT_BLOCK(checkForInterruptFail, scopedFailPoint) { - if (opShouldFail(this, scopedFailPoint.getData())) { + if (opShouldFail(getClient(), scopedFailPoint.getData())) { log() << "set pending kill on op " << getOpID() << ", for checkForInterruptFail"; markKilled(); } |