From 40747744d90190e44986561119b35d6e791c12dd Mon Sep 17 00:00:00 2001
From: Tommaso Tocci
Date: Tue, 29 Nov 2022 18:02:19 +0000
Subject: SERVER-70323 converts many sharding invariants to tassert

---
 src/mongo/db/s/collection_sharding_runtime.cpp     |  27 ++--
 .../db/s/shard_server_catalog_cache_loader.cpp     |  46 ++++---
 src/mongo/db/s/sharding_recovery_service.cpp       | 144 +++++++++++++--------
 src/mongo/s/catalog_cache.cpp                      |  69 +++++-----
 src/mongo/s/client/shard_registry.cpp              |   4 +-
 5 files changed, 177 insertions(+), 113 deletions(-)

diff --git a/src/mongo/db/s/collection_sharding_runtime.cpp b/src/mongo/db/s/collection_sharding_runtime.cpp
index 51a4f6ac5a6..35e65b1dc8b 100644
--- a/src/mongo/db/s/collection_sharding_runtime.cpp
+++ b/src/mongo/db/s/collection_sharding_runtime.cpp
@@ -110,8 +110,9 @@ ScopedCollectionFilter CollectionShardingRuntime::getOwnershipFilter(
     if (!supportNonVersionedOperations) {
         optReceivedShardVersion = getOperationReceivedVersion(opCtx, _nss);
         // No operations should be calling getOwnershipFilter without a shard version
-        invariant(optReceivedShardVersion,
-                  "getOwnershipFilter called by operation that doesn't specify shard version");
+        tassert(7032300,
+                "getOwnershipFilter called by operation that doesn't specify shard version",
+                optReceivedShardVersion);
     }
 
     auto metadata =
@@ -120,10 +121,11 @@ ScopedCollectionFilter CollectionShardingRuntime::getOwnershipFilter(
                            supportNonVersionedOperations);
 
     if (!supportNonVersionedOperations) {
-        invariant(!ShardVersion::isIgnoredVersion(*optReceivedShardVersion) ||
-                      !metadata->get().allowMigrations() || !metadata->get().isSharded(),
-                  "For sharded collections getOwnershipFilter cannot be relied on without a valid "
-                  "shard version");
+        tassert(7032301,
+                "For sharded collections getOwnershipFilter cannot be relied on without a valid "
+                "shard version",
+                !ShardVersion::isIgnoredVersion(*optReceivedShardVersion) ||
+                    !metadata->get().allowMigrations() || !metadata->get().isSharded());
     }
 
     return {std::move(metadata)};
@@ -200,8 +202,9 @@ boost::optional<SharedSemiFuture<void>> CollectionShardingRuntime::getCriticalSe
 
 void CollectionShardingRuntime::setFilteringMetadata(OperationContext* opCtx,
                                                      CollectionMetadata newMetadata) {
-    invariant(!newMetadata.isSharded() || !_nss.isNamespaceAlwaysUnsharded(),
-              str::stream() << "Namespace " << _nss.ns() << " must never be sharded.");
+    tassert(7032302,
+            str::stream() << "Namespace " << _nss.ns() << " must never be sharded.",
+            !newMetadata.isSharded() || !_nss.isNamespaceAlwaysUnsharded());
 
     stdx::lock_guard lk(_metadataManagerLock);
 
@@ -572,7 +575,9 @@ CollectionCriticalSection::CollectionCriticalSection(OperationContext* opCtx,
                              Milliseconds(migrationLockAcquisitionMaxWaitMS.load())));
     auto scopedCsr = CollectionShardingRuntime::assertCollectionLockedAndAcquire(
         _opCtx, _nss, CSRAcquisitionMode::kExclusive);
-    invariant(scopedCsr->getCurrentMetadataIfKnown());
+    tassert(7032305,
+            "Collection metadata unknown when entering critical section",
+            scopedCsr->getCurrentMetadataIfKnown());
     scopedCsr->enterCriticalSectionCatchUpPhase(_reason);
 }
 
@@ -594,7 +599,9 @@ void CollectionCriticalSection::enterCommitPhase() {
                              Milliseconds(migrationLockAcquisitionMaxWaitMS.load())));
     auto scopedCsr = CollectionShardingRuntime::assertCollectionLockedAndAcquire(
         _opCtx, _nss, CSRAcquisitionMode::kExclusive);
-    invariant(scopedCsr->getCurrentMetadataIfKnown());
+    tassert(7032304,
+            "Collection metadata unknown when entering critical section commit phase",
+            scopedCsr->getCurrentMetadataIfKnown());
     scopedCsr->enterCriticalSectionCommitPhase(_reason);
 }
 
diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
index cbc44c21b03..6264ee66cde 100644
--- a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
+++ b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
@@ -29,6 +29,8 @@
 
 #include "mongo/db/s/shard_server_catalog_cache_loader.h"
 
+#include <fmt/format.h>
+
 #include "mongo/db/catalog/rename_collection.h"
 #include "mongo/db/client.h"
 #include "mongo/db/db_raii.h"
@@ -760,9 +762,10 @@ ShardServerCatalogCacheLoader::_schedulePrimaryGetChunksSince(
         }
 
         // After finding metadata remotely, we must have found metadata locally.
-        invariant(!collAndChunks.changedChunks.empty(),
-                  str::stream() << "No chunks metadata found for collection '" << nss
-                                << "' despite the config server returned actual information");
+        tassert(7032350,
+                str::stream() << "No chunks metadata found for collection '" << nss
+                              << "' despite the config server having returned actual information",
+                !collAndChunks.changedChunks.empty());
 
         return swCollectionAndChangedChunks;
     };
@@ -1168,7 +1171,9 @@ void ShardServerCatalogCacheLoader::_updatePersistedCollAndChunksMetadata(
 
     stdx::unique_lock lock(_mutex);
 
     const CollAndChunkTask& task = _collAndChunkTaskLists[nss].front();
-    invariant(task.dropped || !task.collectionAndChangedChunks->changedChunks.empty());
+    tassert(7032351,
+            "Invalid CollAndChunkTask state",
+            task.dropped || !task.collectionAndChangedChunks->changedChunks.empty());
 
     // If this task is from an old term and no longer valid, do not execute and return true so that
     // the task gets removed from the task list
@@ -1291,10 +1296,15 @@ ShardServerCatalogCacheLoader::CollAndChunkTask::CollAndChunkTask(
       termCreated(currentTerm) {
     if (statusWithCollectionAndChangedChunks.isOK()) {
         collectionAndChangedChunks = std::move(statusWithCollectionAndChangedChunks.getValue());
-        invariant(!collectionAndChangedChunks->changedChunks.empty());
+        tassert(7032354,
+                "Found no chunks in retrieved collection metadata",
+                !collectionAndChangedChunks->changedChunks.empty());
         maxQueryVersion = collectionAndChangedChunks->changedChunks.back().getVersion();
     } else {
-        invariant(statusWithCollectionAndChangedChunks == ErrorCodes::NamespaceNotFound);
+        tassert(7032358,
+                fmt::format("Encountered unexpected error while fetching collection metadata: {}",
+                            statusWithCollectionAndChangedChunks.getStatus().toString()),
+                statusWithCollectionAndChangedChunks == ErrorCodes::NamespaceNotFound);
         dropped = true;
         maxQueryVersion = ChunkVersion::UNSHARDED();
     }
@@ -1306,7 +1316,10 @@ ShardServerCatalogCacheLoader::DBTask::DBTask(StatusWith<DatabaseType> swDatabas
     if (swDatabaseType.isOK()) {
         dbType = std::move(swDatabaseType.getValue());
     } else {
-        invariant(swDatabaseType == ErrorCodes::NamespaceNotFound);
+        tassert(7032355,
+                fmt::format("Encountered unexpected error while fetching database metadata: {}",
+                            swDatabaseType.getStatus().toString()),
+                swDatabaseType == ErrorCodes::NamespaceNotFound);
     }
 }
 
@@ -1329,10 +1342,11 @@ void ShardServerCatalogCacheLoader::CollAndChunkTaskList::addTask(CollAndChunkTa
     }
 
     if (task.dropped) {
-        invariant(lastTask.maxQueryVersion == task.minQueryVersion,
-                  str::stream() << "The version of the added task is not contiguous with that of "
-                                << "the previous one: LastTask {" << lastTask.toString() << "}, "
-                                << "AddedTask {" << task.toString() << "}");
+        tassert(7032356,
+                str::stream() << "The version of the added task is not contiguous with that of "
+                              << "the previous one: LastTask {" << lastTask.toString() << "}, "
+                              << "AddedTask {" << task.toString() << "}",
+                lastTask.maxQueryVersion == task.minQueryVersion);
 
         // As an optimization, on collection drop, clear any pending tasks in order to prevent any
         // throw-away work from executing. Because we have no way to differentiate whether the
@@ -1346,11 +1360,11 @@ void ShardServerCatalogCacheLoader::CollAndChunkTaskList::addTask(CollAndChunkTa
         }
     } else {
         // Tasks must have contiguous versions, unless a complete reload occurs.
-        invariant(lastTask.maxQueryVersion == task.minQueryVersion || !task.minQueryVersion.isSet(),
-                  str::stream() << "The added task is not the first and its version is not "
-                                << "contiguous with that of the previous one: LastTask {"
-                                << lastTask.toString() << "}, AddedTask {" << task.toString()
-                                << "}");
+        tassert(7032357,
+                str::stream() << "The added task is not the first and its version is not "
+                              << "contiguous with that of the previous one: LastTask {"
+                              << lastTask.toString() << "}, AddedTask {" << task.toString() << "}",
+                lastTask.maxQueryVersion == task.minQueryVersion || !task.minQueryVersion.isSet());
 
         _tasks.emplace_back(std::move(task));
     }
diff --git a/src/mongo/db/s/sharding_recovery_service.cpp b/src/mongo/db/s/sharding_recovery_service.cpp
index 10505a5997b..6c940009b4b 100644
--- a/src/mongo/db/s/sharding_recovery_service.cpp
+++ b/src/mongo/db/s/sharding_recovery_service.cpp
@@ -27,9 +27,9 @@
  * it in the license file.
  */
 
-#include 
-#include "mongo/platform/basic.h"
+#include <fmt/format.h>
+#include 
 
 #include "mongo/db/s/sharding_recovery_service.h"
 
@@ -149,7 +149,12 @@ void ShardingRecoveryService::acquireRecoverableCriticalSectionBlockWrites(
                 "reason"_attr = reason,
                 "writeConcern"_attr = writeConcern);
 
-    invariant(!opCtx->lockState()->isLocked());
+    tassert(7032360,
+            fmt::format("Can't acquire recoverable critical section for collection '{}' with "
+                        "reason '{}' while holding locks",
+                        nss.toString(),
+                        reason.toString()),
+            !opCtx->lockState()->isLocked());
 
     {
         Lock::GlobalLock lk(opCtx, MODE_IX);
@@ -169,25 +174,29 @@ void ShardingRecoveryService::acquireRecoverableCriticalSectionBlockWrites(
             BSON(CollectionCriticalSectionDocument::kNssFieldName << nss.toString()));
         auto cursor = dbClient.find(std::move(findRequest));
 
-        // if there is a doc with the same nss -> in order to not fail it must have the same reason
+        // if there is a doc with the same nss -> in order to not fail it must have the same
+        // reason
         if (cursor->more()) {
             const auto bsonObj = cursor->next();
             const auto collCSDoc = CollectionCriticalSectionDocument::parse(
                 IDLParserContext("AcquireRecoverableCSBW"), bsonObj);
 
-            invariant(collCSDoc.getReason().woCompare(reason) == 0,
-                      str::stream()
-                          << "Trying to acquire a critical section blocking writes for namespace "
-                          << nss << " and reason " << reason << " but it is already taken by "
-                          << "another operation with different reason " << collCSDoc.getReason());
+            tassert(7032368,
+                    fmt::format("Trying to acquire a critical section blocking writes for "
+                                "namespace '{}' and reason '{}' but it is already taken by another "
+                                "operation with different reason '{}'",
+                                nss.toString(),
+                                reason.toString(),
+                                collCSDoc.getReason().toString()),
+                    collCSDoc.getReason().woCompare(reason) == 0);
 
-            LOGV2_DEBUG(
-                5656601,
-                3,
-                "The recoverable critical section was already acquired to block writes, do nothing",
-                "namespace"_attr = nss,
-                "reason"_attr = reason,
-                "writeConcern"_attr = writeConcern);
+            LOGV2_DEBUG(5656601,
+                        3,
+                        "The recoverable critical section was already acquired to block "
"writes, do nothing", + "namespace"_attr = nss, + "reason"_attr = reason, + "writeConcern"_attr = writeConcern); return; } @@ -195,8 +204,10 @@ void ShardingRecoveryService::acquireRecoverableCriticalSectionBlockWrites( // The collection critical section is not taken, try to acquire it. // The following code will try to add a doc to config.criticalCollectionSections: - // - If everything goes well, the shard server op observer will acquire the in-memory CS. - // - Otherwise this call will fail and the CS won't be taken (neither persisted nor in-mem) + // - If everything goes well, the shard server op observer will acquire the in-memory + // CS. + // - Otherwise this call will fail and the CS won't be taken (neither persisted nor + // in-mem) CollectionCriticalSectionDocument newDoc(nss, reason, false /* blockReads */); newDoc.setAdditionalInfo(additionalInfo); @@ -213,10 +224,13 @@ void ShardingRecoveryService::acquireRecoverableCriticalSectionBlockWrites( BatchedCommandResponse batchedResponse; std::string unusedErrmsg; batchedResponse.parseBSON(commandReply, &unusedErrmsg); - invariant(batchedResponse.getN() > 0, - str::stream() << "Insert did not add any doc to collection " - << NamespaceString::kCollectionCriticalSectionsNamespace - << " for namespace " << nss << " and reason " << reason); + tassert(7032369, + fmt::format("Insert did not add any doc to collection '{}' for namespace '{}' " + "and reason '{}'", + nss.toString(), + reason.toString(), + NamespaceString::kCollectionCriticalSectionsNamespace.toString()), + batchedResponse.getN() > 0); } WriteConcernResult ignoreResult; @@ -243,7 +257,12 @@ void ShardingRecoveryService::promoteRecoverableCriticalSectionToBlockAlsoReads( "reason"_attr = reason, "writeConcern"_attr = writeConcern); - invariant(!opCtx->lockState()->isLocked()); + tassert(7032364, + fmt::format("Can't promote recoverable critical section for collection '{}' with " + "reason '{}' while holding locks", + nss.toString(), + reason.toString()), + !opCtx->lockState()->isLocked()); { boost::optional dbLock; @@ -262,24 +281,29 @@ void ShardingRecoveryService::promoteRecoverableCriticalSectionToBlockAlsoReads( BSON(CollectionCriticalSectionDocument::kNssFieldName << nss.toString())); auto cursor = dbClient.find(std::move(findRequest)); - invariant( - cursor->more(), - str::stream() << "Trying to acquire a critical section blocking reads for namespace " - << nss << " and reason " << reason - << " but the critical section wasn't acquired first blocking writers."); + tassert(7032361, + fmt::format( + "Trying to acquire a critical section blocking reads for namespace '{}' and " + "reason '{}' but the critical section wasn't acquired first blocking writers.", + nss.toString(), + reason.toString()), + cursor->more()); BSONObj bsonObj = cursor->next(); const auto collCSDoc = CollectionCriticalSectionDocument::parse( IDLParserContext("AcquireRecoverableCSBR"), bsonObj); - invariant( - collCSDoc.getReason().woCompare(reason) == 0, - str::stream() << "Trying to acquire a critical section blocking reads for namespace " - << nss << " and reason " << reason - << " but it is already taken by another operation with different reason " - << collCSDoc.getReason()); - - // if there is a document with the same nss, reason and blocking reads -> do nothing, the CS - // is already taken! 
+        tassert(7032362,
+                fmt::format(
+                    "Trying to acquire a critical section blocking reads for namespace '{}' and "
+                    "reason '{}' but it is already taken by another operation with different "
+                    "reason '{}'",
+                    nss.toString(),
+                    reason.toString(),
+                    collCSDoc.getReason().toString()),
+                collCSDoc.getReason().woCompare(reason) == 0);
+
+        // if there is a document with the same nss, reason and blocking reads -> do nothing,
+        // the CS is already taken!
         if (collCSDoc.getBlockReads()) {
             LOGV2_DEBUG(5656604,
                         3,
                         "The recoverable critical section was already promoted to block reads, "
                         "do nothing",
                         "namespace"_attr = nss,
                         "reason"_attr = reason,
                         "writeConcern"_attr = writeConcern);
 
             return;
         }
 
         // The CS is in the catch-up phase, try to advance it to the commit phase.
 
         // The following code will try to update a doc from config.criticalCollectionSections:
-        // - If everything goes well, the shard server op observer will advance the in-memory CS to
-        // the
+        // - If everything goes well, the shard server op observer will advance the in-memory CS
+        // to the
         //   commit phase (blocking readers).
         // - Otherwise this call will fail and the CS won't be advanced (neither persisted nor
         //   in-mem)
@@ -321,10 +345,13 @@ void ShardingRecoveryService::promoteRecoverableCriticalSectionToBlockAlsoReads(
         BatchedCommandResponse batchedResponse;
         std::string unusedErrmsg;
         batchedResponse.parseBSON(commandReply, &unusedErrmsg);
-        invariant(batchedResponse.getNModified() > 0,
-                  str::stream() << "Update did not modify any doc from collection "
-                                << NamespaceString::kCollectionCriticalSectionsNamespace
-                                << " for namespace " << nss << " and reason " << reason);
+        tassert(7032363,
+                fmt::format("Update did not modify any doc from collection '{}' for namespace '{}' "
+                            "and reason '{}'",
+                            NamespaceString::kCollectionCriticalSectionsNamespace.toString(),
+                            nss.toString(),
+                            reason.toString()),
+                batchedResponse.getNModified() > 0);
     }
 
     WriteConcernResult ignoreResult;
@@ -352,7 +379,12 @@ void ShardingRecoveryService::releaseRecoverableCriticalSection(
                 "reason"_attr = reason,
                 "writeConcern"_attr = writeConcern);
 
-    invariant(!opCtx->lockState()->isLocked());
+    tassert(7032365,
+            fmt::format("Can't release recoverable critical section for collection '{}' with "
+                        "reason '{}' while holding locks",
+                        nss.toString(),
+                        reason.toString()),
+            !opCtx->lockState()->isLocked());
 
     {
         boost::optional dbLock;
@@ -401,12 +433,13 @@ void ShardingRecoveryService::releaseRecoverableCriticalSection(
             return;
         }
 
-        invariant(
-            !isDifferentReason,
-            str::stream() << "Trying to release a critical for namespace " << nss << " and reason "
-                          << reason
-                          << " but it is already taken by another operation with different reason "
-                          << collCSDoc.getReason());
+        tassert(7032366,
+                fmt::format("Trying to release a critical section for namespace '{}' and reason "
+                            "'{}' but it is already taken by another operation with different reason '{}'",
+                            nss.toString(),
+                            reason.toString(),
+                            collCSDoc.getReason().toString()),
+                !isDifferentReason);
 
         // The collection critical section is taken (in any phase), try to release it.
@@ -436,10 +469,13 @@ void ShardingRecoveryService::releaseRecoverableCriticalSection(
         BatchedCommandResponse batchedResponse;
         std::string unusedErrmsg;
         batchedResponse.parseBSON(commandReply, &unusedErrmsg);
-        invariant(batchedResponse.getN() > 0,
-                  str::stream() << "Delete did not remove any doc from collection "
-                                << NamespaceString::kCollectionCriticalSectionsNamespace
-                                << " for namespace " << nss << " and reason " << reason);
+        tassert(7032367,
+                fmt::format("Delete did not remove any doc from collection '{}' for namespace '{}' "
+                            "and reason '{}'",
+                            NamespaceString::kCollectionCriticalSectionsNamespace.toString(),
+                            nss.toString(),
+                            reason.toString()),
+                batchedResponse.getN() > 0);
     }
 
     WriteConcernResult ignoreResult;
diff --git a/src/mongo/s/catalog_cache.cpp b/src/mongo/s/catalog_cache.cpp
index ed5ab9d5e31..0bfb5078009 100644
--- a/src/mongo/s/catalog_cache.cpp
+++ b/src/mongo/s/catalog_cache.cpp
@@ -29,6 +29,8 @@
 
 #include "mongo/s/catalog_cache.h"
 
+#include <fmt/format.h>
+
 #include "mongo/bson/bsonobjbuilder.h"
 #include "mongo/db/curop.h"
 #include "mongo/db/query/collation/collator_factory_interface.h"
@@ -79,28 +81,31 @@ std::shared_ptr<RoutingTableHistory> createUpdatedRoutingTableHistory(
     if (isIncremental && collectionAndChunks.changedChunks.size() == 1 &&
         collectionAndChunks.changedChunks[0].getVersion() == existingHistory->optRt->getVersion()) {
 
-        invariant(collectionAndChunks.allowMigrations == existingHistory->optRt->allowMigrations(),
-                  str::stream() << "allowMigrations field of " << nss
-                                << " collection changed without changing the collection version "
-                                << existingHistory->optRt->getVersion().toString()
-                                << ". Old value: " << existingHistory->optRt->allowMigrations()
-                                << ", new value: " << collectionAndChunks.allowMigrations);
+        tassert(7032310,
+                fmt::format("allowMigrations field of collection '{}' changed without changing the "
+                            "collection version {}. Old value: {}, new value: {}",
+                            nss.toString(),
+                            existingHistory->optRt->getVersion().toString(),
+                            existingHistory->optRt->allowMigrations(),
+                            collectionAndChunks.allowMigrations),
+                collectionAndChunks.allowMigrations == existingHistory->optRt->allowMigrations());
 
         const auto& oldReshardingFields = existingHistory->optRt->getReshardingFields();
         const auto& newReshardingFields = collectionAndChunks.reshardingFields;
-        invariant(
-            [&] {
-                if (oldReshardingFields && newReshardingFields)
-                    return oldReshardingFields->toBSON().woCompare(newReshardingFields->toBSON()) ==
-                        0;
-                else
-                    return !oldReshardingFields && !newReshardingFields;
-            }(),
-            str::stream() << "reshardingFields field of " << nss
-                          << " collection changed without changing the collection version "
-                          << existingHistory->optRt->getVersion().toString()
-                          << ". Old value: " << oldReshardingFields->toBSON()
-                          << ", new value: " << newReshardingFields->toBSON());
+        tassert(7032311,
+                fmt::format("reshardingFields field of collection '{}' changed without changing "
+                            "the collection version {}. Old value: {}, new value: {}",
Old value: {}, new value: {}", + nss.toString(), + existingHistory->optRt->getVersion().toString(), + oldReshardingFields->toBSON().toString(), + newReshardingFields->toBSON().toString()), + [&] { + if (oldReshardingFields && newReshardingFields) + return oldReshardingFields->toBSON().woCompare( + newReshardingFields->toBSON()) == 0; + else + return !oldReshardingFields && !newReshardingFields; + }()); return existingHistory->optRt; } @@ -112,7 +117,11 @@ std::shared_ptr createUpdatedRoutingTableHistory( return 0; } if (collectionAndChunks.maxChunkSizeBytes) { - invariant(collectionAndChunks.maxChunkSizeBytes.value() > 0); + tassert(7032312, + fmt::format("Invalid maxChunkSizeBytes value {} for collection '{}'", + nss.toString(), + collectionAndChunks.maxChunkSizeBytes.value()), + collectionAndChunks.maxChunkSizeBytes.value() > 0); return uint64_t(*collectionAndChunks.maxChunkSizeBytes); } return boost::none; @@ -262,13 +271,11 @@ CatalogCache::~CatalogCache() { StatusWith CatalogCache::getDatabase(OperationContext* opCtx, StringData dbName, bool allowLocks) { - if (!allowLocks) { - invariant( - !opCtx->lockState() || !opCtx->lockState()->isLocked(), + tassert(7032313, "Do not hold a lock while refreshing the catalog cache. Doing so would potentially " "hold the lock during a network call, and can lead to a deadlock as described in " - "SERVER-37398."); - } + "SERVER-37398.", + allowLocks || !opCtx->lockState() || !opCtx->lockState()->isLocked()); Timer t{}; ScopeGuard finishTiming([&] { @@ -292,13 +299,11 @@ StatusWith CatalogCache::_getCollectionPlacementInfoAt( const NamespaceString& nss, boost::optional atClusterTime, bool allowLocks) { - if (!allowLocks) { - invariant(!opCtx->lockState() || !opCtx->lockState()->isLocked(), - "Do not hold a lock while refreshing the catalog cache. Doing so would " - "potentially hold " - "the lock during a network call, and can lead to a deadlock as described in " - "SERVER-37398."); - } + tassert(7032314, + "Do not hold a lock while refreshing the catalog cache. Doing so would potentially " + "hold the lock during a network call, and can lead to a deadlock as described in " + "SERVER-37398.", + allowLocks || !opCtx->lockState() || !opCtx->lockState()->isLocked()); try { const auto swDbInfo = getDatabase(opCtx, nss.db(), allowLocks); diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp index 3ae0325c02f..b48aaf941ea 100644 --- a/src/mongo/s/client/shard_registry.cpp +++ b/src/mongo/s/client/shard_registry.cpp @@ -126,7 +126,9 @@ ShardRegistry::Cache::LookupResult ShardRegistry::_lookup(OperationContext* opCt invariant(key == _kSingleton); // This function can potentially block for a long time on network activity, so holding of locks // is disallowed. - invariant(!opCtx->lockState() || !opCtx->lockState()->isLocked()); + tassert(7032320, + "Can't perform ShardRegistry lookup while holding locks", + !opCtx->lockState() || !opCtx->lockState()->isLocked()); auto lastForcedReloadIncrement = _forceReloadIncrement.load(); -- cgit v1.2.1