diff options
author | Charlie Swanson <charlie.swanson@mongodb.com> | 2018-04-30 12:54:34 -0400 |
---|---|---|
committer | Charlie Swanson <charlie.swanson@mongodb.com> | 2018-05-15 09:22:58 -0400 |
commit | fa9278c892a6833694983191a5726b20f6a187a0 (patch) | |
tree | 3ef2ffc47c32a962d04507c127d2487485317041 /src/mongo/db | |
parent | a9ba9539c5042911f030a1d4b3e4ce8ff0a449fc (diff) | |
download | mongo-fa9278c892a6833694983191a5726b20f6a187a0.tar.gz |
SERVER-33959 Avoid deadlock during global cursor registration
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/clientcursor.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/commands/list_collections.cpp | 142 | ||||
-rw-r--r-- | src/mongo/db/commands/list_indexes.cpp | 140 | ||||
-rw-r--r-- | src/mongo/db/cursor_manager.cpp | 103 | ||||
-rw-r--r-- | src/mongo/db/cursor_manager.h | 7 |
5 files changed, 217 insertions, 178 deletions
diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp index f960afc60d7..200a128b664 100644 --- a/src/mongo/db/clientcursor.cpp +++ b/src/mongo/db/clientcursor.cpp @@ -204,7 +204,8 @@ void ClientCursorPin::release() { deleteUnderlying(); } else { // Unpin the cursor under the collection cursor manager lock. - _cursor->_cursorManager->unpin(_opCtx, _cursor); + _cursor->_cursorManager->unpin( + _opCtx, std::unique_ptr<ClientCursor, ClientCursor::Deleter>(_cursor)); cursorStatsOpenPinned.decrement(); } diff --git a/src/mongo/db/commands/list_collections.cpp b/src/mongo/db/commands/list_collections.cpp index dce39421301..b59d8ff6669 100644 --- a/src/mongo/db/commands/list_collections.cpp +++ b/src/mongo/db/commands/list_collections.cpp @@ -277,89 +277,91 @@ public: jsobj, "includePendingDrops", false, &includePendingDrops); uassertStatusOK(status); - AutoGetDb autoDb(opCtx, dbname, MODE_IS); - - Database* db = autoDb.getDb(); - - auto ws = make_unique<WorkingSet>(); - auto root = make_unique<QueuedDataStage>(opCtx, ws.get()); - - if (db) { - if (auto collNames = _getExactNameMatches(matcher.get())) { - for (auto&& collName : *collNames) { - auto nss = NamespaceString(db->name(), collName); - Collection* collection = db->getCollection(opCtx, nss); - BSONObj collBson = - buildCollectionBson(opCtx, collection, includePendingDrops, nameOnly); - if (!collBson.isEmpty()) { - _addWorkingSetMember(opCtx, collBson, matcher.get(), ws.get(), root.get()); + const NamespaceString cursorNss = NamespaceString::makeListCollectionsNSS(dbname); + std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec; + BSONArrayBuilder firstBatch; + { + AutoGetDb autoDb(opCtx, dbname, MODE_IS); + Database* db = autoDb.getDb(); + + auto ws = make_unique<WorkingSet>(); + auto root = make_unique<QueuedDataStage>(opCtx, ws.get()); + + if (db) { + if (auto collNames = _getExactNameMatches(matcher.get())) { + for (auto&& collName : *collNames) { + auto nss = NamespaceString(db->name(), collName); + Collection* collection = db->getCollection(opCtx, nss); + BSONObj collBson = + buildCollectionBson(opCtx, collection, includePendingDrops, nameOnly); + if (!collBson.isEmpty()) { + _addWorkingSetMember( + opCtx, collBson, matcher.get(), ws.get(), root.get()); + } } - } - } else { - for (auto&& collection : *db) { - BSONObj collBson = - buildCollectionBson(opCtx, collection, includePendingDrops, nameOnly); - if (!collBson.isEmpty()) { - _addWorkingSetMember(opCtx, collBson, matcher.get(), ws.get(), root.get()); + } else { + for (auto&& collection : *db) { + BSONObj collBson = + buildCollectionBson(opCtx, collection, includePendingDrops, nameOnly); + if (!collBson.isEmpty()) { + _addWorkingSetMember( + opCtx, collBson, matcher.get(), ws.get(), root.get()); + } } } - } - // Skipping views is only necessary for internal cloning operations. - bool skipViews = filterElt.type() == mongo::Object && - SimpleBSONObjComparator::kInstance.evaluate( - filterElt.Obj() == ListCollectionsFilter::makeTypeCollectionFilter()); - if (!skipViews) { - db->getViewCatalog()->iterate(opCtx, [&](const ViewDefinition& view) { - BSONObj viewBson = buildViewBson(view, nameOnly); - if (!viewBson.isEmpty()) { - _addWorkingSetMember(opCtx, viewBson, matcher.get(), ws.get(), root.get()); - } - }); + // Skipping views is only necessary for internal cloning operations. + bool skipViews = filterElt.type() == mongo::Object && + SimpleBSONObjComparator::kInstance.evaluate( + filterElt.Obj() == ListCollectionsFilter::makeTypeCollectionFilter()); + if (!skipViews) { + db->getViewCatalog()->iterate(opCtx, [&](const ViewDefinition& view) { + BSONObj viewBson = buildViewBson(view, nameOnly); + if (!viewBson.isEmpty()) { + _addWorkingSetMember( + opCtx, viewBson, matcher.get(), ws.get(), root.get()); + } + }); + } } - } - const NamespaceString cursorNss = NamespaceString::makeListCollectionsNSS(dbname); + exec = uassertStatusOK(PlanExecutor::make( + opCtx, std::move(ws), std::move(root), cursorNss, PlanExecutor::NO_YIELD)); - auto statusWithPlanExecutor = PlanExecutor::make( - opCtx, std::move(ws), std::move(root), cursorNss, PlanExecutor::NO_YIELD); - uassertStatusOK(statusWithPlanExecutor.getStatus()); - auto exec = std::move(statusWithPlanExecutor.getValue()); + for (long long objCount = 0; objCount < batchSize; objCount++) { + BSONObj next; + PlanExecutor::ExecState state = exec->getNext(&next, NULL); + if (state == PlanExecutor::IS_EOF) { + break; + } + invariant(state == PlanExecutor::ADVANCED); - BSONArrayBuilder firstBatch; + // If we can't fit this result inside the current batch, then we stash it for later. + if (!FindCommon::haveSpaceForNext(next, objCount, firstBatch.len())) { + exec->enqueue(next); + break; + } - for (long long objCount = 0; objCount < batchSize; objCount++) { - BSONObj next; - PlanExecutor::ExecState state = exec->getNext(&next, NULL); - if (state == PlanExecutor::IS_EOF) { - break; + firstBatch.append(next); } - invariant(state == PlanExecutor::ADVANCED); - - // If we can't fit this result inside the current batch, then we stash it for later. - if (!FindCommon::haveSpaceForNext(next, objCount, firstBatch.len())) { - exec->enqueue(next); - break; + if (exec->isEOF()) { + appendCursorResponseObject(0LL, cursorNss.ns(), firstBatch.arr(), &result); + return true; } - - firstBatch.append(next); - } - - CursorId cursorId = 0LL; - if (!exec->isEOF()) { exec->saveState(); exec->detachFromOperationContext(); - auto pinnedCursor = CursorManager::getGlobalCursorManager()->registerCursor( - opCtx, - {std::move(exec), - cursorNss, - AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), - opCtx->recoveryUnit()->getReadConcernLevel(), - jsobj}); - cursorId = pinnedCursor.getCursor()->cursorid(); - } - - appendCursorResponseObject(cursorId, cursorNss.ns(), firstBatch.arr(), &result); + } // Drop db lock. Global cursor registration must be done without holding any locks. + + auto pinnedCursor = CursorManager::getGlobalCursorManager()->registerCursor( + opCtx, + {std::move(exec), + cursorNss, + AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), + opCtx->recoveryUnit()->getReadConcernLevel(), + jsobj}); + + appendCursorResponseObject( + pinnedCursor.getCursor()->cursorid(), cursorNss.ns(), firstBatch.arr(), &result); return true; } diff --git a/src/mongo/db/commands/list_indexes.cpp b/src/mongo/db/commands/list_indexes.cpp index dce38f51fa5..a3565df8e9e 100644 --- a/src/mongo/db/commands/list_indexes.cpp +++ b/src/mongo/db/commands/list_indexes.cpp @@ -125,83 +125,89 @@ public: uassertStatusOK( CursorRequest::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize)); - AutoGetCollectionForReadCommand ctx(opCtx, CommandHelpers::parseNsOrUUID(dbname, cmdObj)); - Collection* collection = ctx.getCollection(); - uassert(ErrorCodes::NamespaceNotFound, - str::stream() << "ns does not exist: " << ctx.getNss().ns(), - collection); - - const CollectionCatalogEntry* cce = collection->getCatalogEntry(); - invariant(cce); - - const auto nss = ctx.getNss(); - - vector<string> indexNames; - writeConflictRetry(opCtx, "listIndexes", nss.ns(), [&indexNames, &cce, &opCtx] { - indexNames.clear(); - cce->getReadyIndexes(opCtx, &indexNames); - }); - - auto ws = make_unique<WorkingSet>(); - auto root = make_unique<QueuedDataStage>(opCtx, ws.get()); - - for (size_t i = 0; i < indexNames.size(); i++) { - BSONObj indexSpec = - writeConflictRetry(opCtx, "listIndexes", nss.ns(), [&cce, &opCtx, &indexNames, i] { - return cce->getIndexSpec(opCtx, indexNames[i]); - }); - - WorkingSetID id = ws->allocate(); - WorkingSetMember* member = ws->get(id); - member->keyData.clear(); - member->recordId = RecordId(); - member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); - member->transitionToOwnedObj(); - root->pushBack(id); - } + std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec; + NamespaceString cursorNss; + BSONArrayBuilder firstBatch; + { + AutoGetCollectionForReadCommand ctx(opCtx, + CommandHelpers::parseNsOrUUID(dbname, cmdObj)); + Collection* collection = ctx.getCollection(); + uassert(ErrorCodes::NamespaceNotFound, + str::stream() << "ns does not exist: " << ctx.getNss().ns(), + collection); + + const CollectionCatalogEntry* cce = collection->getCatalogEntry(); + invariant(cce); + + const auto nss = ctx.getNss(); + + vector<string> indexNames; + writeConflictRetry(opCtx, "listIndexes", nss.ns(), [&indexNames, &cce, &opCtx] { + indexNames.clear(); + cce->getReadyIndexes(opCtx, &indexNames); + }); + + auto ws = make_unique<WorkingSet>(); + auto root = make_unique<QueuedDataStage>(opCtx, ws.get()); + + for (size_t i = 0; i < indexNames.size(); i++) { + BSONObj indexSpec = writeConflictRetry( + opCtx, "listIndexes", nss.ns(), [&cce, &opCtx, &indexNames, i] { + return cce->getIndexSpec(opCtx, indexNames[i]); + }); + + WorkingSetID id = ws->allocate(); + WorkingSetMember* member = ws->get(id); + member->keyData.clear(); + member->recordId = RecordId(); + member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); + member->transitionToOwnedObj(); + root->pushBack(id); + } - const NamespaceString cursorNss = NamespaceString::makeListIndexesNSS(dbname, nss.coll()); - invariant(nss == cursorNss.getTargetNSForListIndexes()); + cursorNss = NamespaceString::makeListIndexesNSS(dbname, nss.coll()); + invariant(nss == cursorNss.getTargetNSForListIndexes()); - auto statusWithPlanExecutor = PlanExecutor::make( - opCtx, std::move(ws), std::move(root), cursorNss, PlanExecutor::NO_YIELD); - uassertStatusOK(statusWithPlanExecutor.getStatus()); - auto exec = std::move(statusWithPlanExecutor.getValue()); + exec = uassertStatusOK(PlanExecutor::make( + opCtx, std::move(ws), std::move(root), cursorNss, PlanExecutor::NO_YIELD)); - BSONArrayBuilder firstBatch; + for (long long objCount = 0; objCount < batchSize; objCount++) { + BSONObj next; + PlanExecutor::ExecState state = exec->getNext(&next, NULL); + if (state == PlanExecutor::IS_EOF) { + break; + } + invariant(state == PlanExecutor::ADVANCED); - for (long long objCount = 0; objCount < batchSize; objCount++) { - BSONObj next; - PlanExecutor::ExecState state = exec->getNext(&next, NULL); - if (state == PlanExecutor::IS_EOF) { - break; - } - invariant(state == PlanExecutor::ADVANCED); + // If we can't fit this result inside the current batch, then we stash it for later. + if (!FindCommon::haveSpaceForNext(next, objCount, firstBatch.len())) { + exec->enqueue(next); + break; + } - // If we can't fit this result inside the current batch, then we stash it for later. - if (!FindCommon::haveSpaceForNext(next, objCount, firstBatch.len())) { - exec->enqueue(next); - break; + firstBatch.append(next); } - firstBatch.append(next); - } + if (exec->isEOF()) { + appendCursorResponseObject(0LL, cursorNss.ns(), firstBatch.arr(), &result); + return true; + } - CursorId cursorId = 0LL; - if (!exec->isEOF()) { exec->saveState(); exec->detachFromOperationContext(); - auto pinnedCursor = CursorManager::getGlobalCursorManager()->registerCursor( - opCtx, - {std::move(exec), - cursorNss, - AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), - opCtx->recoveryUnit()->getReadConcernLevel(), - cmdObj}); - cursorId = pinnedCursor.getCursor()->cursorid(); - } - - appendCursorResponseObject(cursorId, cursorNss.ns(), firstBatch.arr(), &result); + } // Drop collection lock. Global cursor registration must be done without holding any + // locks. + + const auto pinnedCursor = CursorManager::getGlobalCursorManager()->registerCursor( + opCtx, + {std::move(exec), + cursorNss, + AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), + opCtx->recoveryUnit()->getReadConcernLevel(), + cmdObj}); + + appendCursorResponseObject( + pinnedCursor.getCursor()->cursorid(), cursorNss.ns(), firstBatch.arr(), &result); return true; } diff --git a/src/mongo/db/cursor_manager.cpp b/src/mongo/db/cursor_manager.cpp index d6c00d996d2..d58e25bbbc7 100644 --- a/src/mongo/db/cursor_manager.cpp +++ b/src/mongo/db/cursor_manager.cpp @@ -573,32 +573,40 @@ void CursorManager::invalidateAll(OperationContext* opCtx, // Mark all cursors as killed, but keep around those we can in order to provide a useful error // message to the user when they attempt to use it next time. - auto allCurrentPartitions = _cursorMap->lockAllPartitions(); - for (auto&& partition : allCurrentPartitions) { - for (auto it = partition.begin(); it != partition.end();) { - auto* cursor = it->second; - cursor->markAsKilled({ErrorCodes::QueryPlanKilled, reason}); - - // If there's an operation actively using the cursor, then that operation is now - // responsible for cleaning it up. Otherwise we can immediately dispose of it. - if (cursor->_operationUsingCursor) { - it = partition.erase(it); - removeTransactionCursorReference(cursor); - continue; - } + std::vector<std::unique_ptr<ClientCursor, ClientCursor::Deleter>> toDisposeWithoutMutex; + { + auto allCurrentPartitions = _cursorMap->lockAllPartitions(); + for (auto&& partition : allCurrentPartitions) { + for (auto it = partition.begin(); it != partition.end();) { + auto* cursor = it->second; + cursor->markAsKilled({ErrorCodes::QueryPlanKilled, reason}); + + // If there's an operation actively using the cursor, then that operation is now + // responsible for cleaning it up. Otherwise we can immediately dispose of it. + if (cursor->_operationUsingCursor) { + it = partition.erase(it); + removeTransactionCursorReference(cursor); + continue; + } - if (!collectionGoingAway) { - // We keep around unpinned cursors so that future attempts to use the cursor will - // result in a useful error message. - ++it; - } else { - removeTransactionCursorReference(cursor); - cursor->dispose(opCtx); - delete cursor; - it = partition.erase(it); + if (!collectionGoingAway) { + // We keep around unpinned cursors so that future attempts to use the cursor + // will result in a useful error message. + ++it; + } else { + removeTransactionCursorReference(cursor); + toDisposeWithoutMutex.emplace_back(cursor); + it = partition.erase(it); + } } } } + + // Dispose of the cursors we can now delete. This might involve lock acquisitions for safe + // cleanup, so avoid doing it while holding mutexes. + for (auto&& cursor : toDisposeWithoutMutex) { + cursor->dispose(opCtx); + } } void CursorManager::invalidateDocument(OperationContext* opCtx, @@ -636,17 +644,15 @@ bool CursorManager::cursorShouldTimeout_inlock(const ClientCursor* cursor, Date_ } std::size_t CursorManager::timeoutCursors(OperationContext* opCtx, Date_t now) { - std::vector<std::unique_ptr<ClientCursor, ClientCursor::Deleter>> toDelete; + std::vector<std::unique_ptr<ClientCursor, ClientCursor::Deleter>> toDisposeWithoutMutex; for (size_t partitionId = 0; partitionId < kNumPartitions; ++partitionId) { auto lockedPartition = _cursorMap->lockOnePartitionById(partitionId); for (auto it = lockedPartition->begin(); it != lockedPartition->end();) { auto* cursor = it->second; if (cursorShouldTimeout_inlock(cursor, now)) { - // Dispose of the cursor and remove it from the partition. removeTransactionCursorReference(cursor); - cursor->dispose(opCtx); - toDelete.push_back(std::unique_ptr<ClientCursor, ClientCursor::Deleter>{cursor}); + toDisposeWithoutMutex.emplace_back(cursor); it = lockedPartition->erase(it); } else { ++it; @@ -654,7 +660,11 @@ std::size_t CursorManager::timeoutCursors(OperationContext* opCtx, Date_t now) { } } - return toDelete.size(); + // Be careful not to dispose of cursors while holding the partition lock. + for (auto&& cursor : toDisposeWithoutMutex) { + cursor->dispose(opCtx); + } + return toDisposeWithoutMutex.size(); } namespace { @@ -691,10 +701,9 @@ StatusWith<ClientCursorPin> CursorManager::pinCursor(OperationContext* opCtx, if (cursor->getExecutor()->isMarkedAsKilled()) { // This cursor was killed while it was idle. Status error = cursor->getExecutor()->getKillStatus(); - lockedPartition->erase(cursor->cursorid()); - removeTransactionCursorReference(cursor); - cursor->dispose(opCtx); - delete cursor; + deregisterAndDestroyCursor(std::move(lockedPartition), + opCtx, + std::unique_ptr<ClientCursor, ClientCursor::Deleter>(cursor)); return error; } @@ -716,7 +725,8 @@ StatusWith<ClientCursorPin> CursorManager::pinCursor(OperationContext* opCtx, return ClientCursorPin(opCtx, cursor); } -void CursorManager::unpin(OperationContext* opCtx, ClientCursor* cursor) { +void CursorManager::unpin(OperationContext* opCtx, + std::unique_ptr<ClientCursor, ClientCursor::Deleter> cursor) { // Avoid computing the current time within the critical section. auto now = opCtx->getServiceContext()->getPreciseClockSource()->now(); @@ -737,13 +747,14 @@ void CursorManager::unpin(OperationContext* opCtx, ClientCursor* cursor) { if (interruptStatus == ErrorCodes::Interrupted || interruptStatus == ErrorCodes::CursorKilled) { LOG(0) << "removing cursor " << cursor->cursorid() << " after completing batch: " << interruptStatus; - partition->erase(cursor->cursorid()); - removeTransactionCursorReference(cursor); - cursor->dispose(opCtx); - delete cursor; + return deregisterAndDestroyCursor(std::move(partition), opCtx, std::move(cursor)); } else if (!interruptStatus.isOK()) { cursor->markAsKilled(interruptStatus); } + + // The cursor will stay around in '_cursorMap', so release the unique pointer to avoid deleting + // it. + cursor.release(); } void CursorManager::getCursorIds(std::set<CursorId>* openCursors) const { @@ -863,6 +874,22 @@ void CursorManager::deregisterCursor(ClientCursor* cursor) { removeTransactionCursorReference(cursor); } +void CursorManager::deregisterAndDestroyCursor( + Partitioned<stdx::unordered_map<CursorId, ClientCursor*>, kNumPartitions>::OnePartition&& lk, + OperationContext* opCtx, + std::unique_ptr<ClientCursor, ClientCursor::Deleter> cursor) { + { + auto lockWithRestrictedScope = std::move(lk); + lockWithRestrictedScope->erase(cursor->cursorid()); + removeTransactionCursorReference(cursor.get()); + } + // Dispose of the cursor without holding any cursor manager mutexes. Disposal of a cursor can + // require taking lock manager locks, which we want to avoid while holding a mutex. If we did + // so, any caller of a CursorManager method which already held a lock manager lock could induce + // a deadlock when trying to acquire a CursorManager lock. + cursor->dispose(opCtx); +} + Status CursorManager::killCursor(OperationContext* opCtx, CursorId id, bool shouldAudit, @@ -911,9 +938,7 @@ Status CursorManager::killCursor(OperationContext* opCtx, audit::logKillCursorsAuthzCheck(opCtx->getClient(), _nss, id, ErrorCodes::OK); } - lockedPartition->erase(ownedCursor->cursorid()); - cursor->_cursorManager->removeTransactionCursorReference(cursor); - ownedCursor->dispose(opCtx); + deregisterAndDestroyCursor(std::move(lockedPartition), opCtx, std::move(ownedCursor)); return Status::OK(); } diff --git a/src/mongo/db/cursor_manager.h b/src/mongo/db/cursor_manager.h index 2104a558bae..6817455ec0d 100644 --- a/src/mongo/db/cursor_manager.h +++ b/src/mongo/db/cursor_manager.h @@ -292,8 +292,13 @@ private: OperationContext* opCtx, std::unique_ptr<ClientCursor, ClientCursor::Deleter> clientCursor); void deregisterCursor(ClientCursor* cursor); + void deregisterAndDestroyCursor( + Partitioned<stdx::unordered_map<CursorId, ClientCursor*>, kNumPartitions>::OnePartition&&, + OperationContext* opCtx, + std::unique_ptr<ClientCursor, ClientCursor::Deleter> cursor); - void unpin(OperationContext* opCtx, ClientCursor* cursor); + void unpin(OperationContext* opCtx, + std::unique_ptr<ClientCursor, ClientCursor::Deleter> cursor); bool cursorShouldTimeout_inlock(const ClientCursor* cursor, Date_t now); |