diff options
author | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2022-03-11 13:46:49 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-03-11 14:21:00 +0000 |
commit | 8ff255e3b85a0f4c74c2fa3842e292076926c6ff (patch) | |
tree | 695ca135ad5041761960255f7547368953ca45db | |
parent | f690e6856e9cbc7ee1bc3bcf20dfa4eed6621e2a (diff) | |
download | mongo-8ff255e3b85a0f4c74c2fa3842e292076926c6ff.tar.gz |
SERVER-63327 Remove usages of StaleShardVersion from the migrations path
-rw-r--r-- | src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/s/collection_metadata.cpp | 24 | ||||
-rw-r--r-- | src/mongo/db/s/collection_metadata.h | 5 | ||||
-rw-r--r-- | src/mongo/db/s/collection_sharding_runtime.cpp | 15 | ||||
-rw-r--r-- | src/mongo/db/s/collection_sharding_state.h | 6 | ||||
-rw-r--r-- | src/mongo/db/s/migration_destination_manager_legacy_commands.cpp | 14 | ||||
-rw-r--r-- | src/mongo/db/s/migration_source_manager.cpp | 62 | ||||
-rw-r--r-- | src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp | 7 | ||||
-rw-r--r-- | src/mongo/db/service_entry_point_common.cpp | 8 | ||||
-rw-r--r-- | src/mongo/executor/network_interface_tl.cpp | 5 | ||||
-rw-r--r-- | src/mongo/s/transaction_router.h | 4 | ||||
-rw-r--r-- | src/mongo/s/write_ops/batch_write_exec.h | 2 | ||||
-rw-r--r-- | src/mongo/s/write_ops/batch_write_op.cpp | 4 |
13 files changed, 100 insertions, 74 deletions
diff --git a/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp b/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp index 0b1d26b0296..bb891bcff60 100644 --- a/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp +++ b/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp @@ -90,9 +90,21 @@ ZoneInfo getCollectionZones(OperationContext* opCtx, const CollectionType& coll) return zones; } -bool isRetriableForDefragmentation(const Status& error) { - return (ErrorCodes::isA<ErrorCategory::RetriableError>(error) || - error == ErrorCodes::StaleConfig); +bool isRetriableForDefragmentation(const Status& status) { + if (ErrorCodes::isA<ErrorCategory::RetriableError>(status)) + return true; + + if (status == ErrorCodes::StaleConfig) { + if (auto staleInfo = status.extraInfo<StaleConfigInfo>()) { + // If the staleInfo error contains a "wanted" version, this means the donor shard which + // returned this error has its versioning information up-to-date (as opposed to UNKNOWN) + // and it couldn't find the chunk that the defragmenter expected. Such a situation can + // only arise as a result of manual split/merge/move concurrently with the defragmenter. + return !staleInfo->getVersionWanted(); + } + } + + return false; } void handleActionResult(OperationContext* opCtx, diff --git a/src/mongo/db/s/collection_metadata.cpp b/src/mongo/db/s/collection_metadata.cpp index 39933678d66..f68c39ef0ce 100644 --- a/src/mongo/db/s/collection_metadata.cpp +++ b/src/mongo/db/s/collection_metadata.cpp @@ -181,30 +181,6 @@ bool CollectionMetadata::getNextChunk(const BSONObj& lookupKey, ChunkType* chunk return true; } -Status CollectionMetadata::checkChunkIsValid(const ChunkType& chunk) const { - invariant(isSharded()); - - ChunkType existingChunk; - - if (!getNextChunk(chunk.getMin(), &existingChunk)) { - return {ErrorCodes::StaleShardVersion, - str::stream() << "Chunk with bounds " - << ChunkRange(chunk.getMin(), chunk.getMax()).toString() - << " is not owned by this shard."}; - } - - if (existingChunk.getMin().woCompare(chunk.getMin()) || - existingChunk.getMax().woCompare(chunk.getMax())) { - return {ErrorCodes::StaleShardVersion, - str::stream() << "Unable to find chunk with the exact bounds " - << chunk.getRange().toString() << " at collection version " - << getCollVersion().toString() - << " found existing chunk: " << existingChunk.toString()}; - } - - return Status::OK(); -} - bool CollectionMetadata::currentShardHasAnyChunks() const { invariant(isSharded()); std::set<ShardId> shards; diff --git a/src/mongo/db/s/collection_metadata.h b/src/mongo/db/s/collection_metadata.h index 4cf162648b5..84ff44a6c90 100644 --- a/src/mongo/db/s/collection_metadata.h +++ b/src/mongo/db/s/collection_metadata.h @@ -204,11 +204,6 @@ public: bool getNextChunk(const BSONObj& lookupKey, ChunkType* chunk) const; /** - * Validates that the passed-in chunk's bounds exactly match a chunk in the metadata cache. - */ - Status checkChunkIsValid(const ChunkType& chunk) const; - - /** * Returns true if the argument range overlaps any chunk. */ bool rangeOverlapsChunk(const ChunkRange& range) const { diff --git a/src/mongo/db/s/collection_sharding_runtime.cpp b/src/mongo/db/s/collection_sharding_runtime.cpp index 1f40b068e86..fa4e68cc9af 100644 --- a/src/mongo/db/s/collection_sharding_runtime.cpp +++ b/src/mongo/db/s/collection_sharding_runtime.cpp @@ -29,8 +29,6 @@ #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding -#include "mongo/platform/basic.h" - #include "mongo/db/s/collection_sharding_runtime.h" #include "mongo/base/checked_cast.h" @@ -136,8 +134,8 @@ ScopedCollectionDescription CollectionShardingRuntime::getCollectionDescription( auto optMetadata = _getCurrentMetadataIfKnown(boost::none); uassert( StaleConfigInfo(_nss, - ChunkVersion::UNSHARDED(), - boost::none, + ChunkVersion::IGNORED() /* receivedVersion */, + boost::none /* wantedVersion */, ShardingState::get(_serviceContext)->shardId()), str::stream() << "sharding status of collection " << _nss.ns() << " is not currently available for description and needs to be recovered " @@ -347,9 +345,10 @@ CollectionShardingRuntime::_getMetadataWithVersionCheckAt( auto csrLock = CSRLock::lockShared(opCtx, this); auto optCurrentMetadata = _getCurrentMetadataIfKnown(atClusterTime); - - uassert(StaleConfigInfo( - _nss, receivedShardVersion, boost::none, ShardingState::get(opCtx)->shardId()), + uassert(StaleConfigInfo(_nss, + receivedShardVersion, + boost::none /* wantedVersion */, + ShardingState::get(opCtx)->shardId()), str::stream() << "sharding status of collection " << _nss.ns() << " is not currently known and needs to be recovered", optCurrentMetadata); @@ -362,7 +361,7 @@ CollectionShardingRuntime::_getMetadataWithVersionCheckAt( : ShardingMigrationCriticalSection::kRead); uassert(StaleConfigInfo(_nss, receivedShardVersion, - boost::none, + boost::none /* wantedVersion */, ShardingState::get(opCtx)->shardId(), std::move(criticalSectionSignal)), str::stream() << "migration commit in progress for " << _nss.ns(), diff --git a/src/mongo/db/s/collection_sharding_state.h b/src/mongo/db/s/collection_sharding_state.h index 53adbe24fde..575ccb85f2f 100644 --- a/src/mongo/db/s/collection_sharding_state.h +++ b/src/mongo/db/s/collection_sharding_state.h @@ -97,8 +97,8 @@ public: static void appendInfoForServerStatus(OperationContext* opCtx, BSONObjBuilder* builder); /** - * If the shard currently doesn't know whether the collection is sharded or not, it will throw - * StaleShardVersion. + * If the shard currently doesn't know whether the collection is sharded or not, it will throw a + * StaleConfig exception. * * If the request doesn't have a shard version all collections will be treated as UNSHARDED. * @@ -112,7 +112,7 @@ public: * * If the shard currently doesn't know whether the collection is sharded or not, or if the * expected shard version doesn't match with the one in the OperationShardingState, it will - * throw StaleShardVersion. + * throw a StaleConfig exception. * * If the operation context contains an 'atClusterTime', the returned filtering object will be * tied to a specific point in time. Otherwise, it will reference the latest cluster time diff --git a/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp b/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp index 7f90fe3a6d6..8233113e409 100644 --- a/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp +++ b/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp @@ -29,8 +29,6 @@ #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding -#include "mongo/platform/basic.h" - #include "mongo/db/auth/action_set.h" #include "mongo/db/auth/action_type.h" #include "mongo/db/auth/authorization_session.h" @@ -116,15 +114,19 @@ public: // We force a refresh immediately after registering this migration to guarantee that this // shard will not receive a chunk after refreshing. onShardVersionMismatch(opCtx, nss, boost::none); + const auto shardId = ShardingState::get(opCtx)->shardId(); const auto collectionEpoch = [&] { AutoGetCollection autoColl(opCtx, nss, MODE_IS); auto const optMetadata = CollectionShardingRuntime::get(opCtx, nss)->getCurrentMetadataIfKnown(); - uassert( - ErrorCodes::StaleShardVersion, - "Collection's metadata have been found UNKNOWN after a refresh on the recipient", - optMetadata); + uassert(StaleConfigInfo(nss, + ChunkVersion::IGNORED() /* receivedVersion */, + boost::none /* wantedVersion */, + shardId, + boost::none), + "The collection's sharding state was cleared by a concurrent operation", + optMetadata); return optMetadata->getShardVersion().epoch(); }(); diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index 52735e850c5..84598e4e1a2 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -29,8 +29,6 @@ #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kShardingMigration -#include "mongo/platform/basic.h" - #include "mongo/db/s/migration_source_manager.h" #include "mongo/bson/bsonobjbuilder.h" @@ -175,6 +173,7 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, // Make sure the latest shard version is recovered as of the time of the invocation of the // command. onShardVersionMismatch(_opCtx, _args.getNss(), boost::none); + const auto shardId = ShardingState::get(opCtx)->shardId(); // Complete any unfinished migration pending recovery { @@ -202,12 +201,20 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, const auto csrLock = CollectionShardingRuntime::CSRLock::lockExclusive(_opCtx, csr); auto optMetadata = csr->getCurrentMetadataIfKnown(); - uassert(ErrorCodes::ConflictingOperationInProgress, + uassert(StaleConfigInfo(_args.getNss(), + ChunkVersion::IGNORED() /* receivedVersion */, + boost::none /* wantedVersion */, + shardId, + boost::none), "The collection's sharding state was cleared by a concurrent operation", optMetadata); auto& metadata = *optMetadata; - uassert(ErrorCodes::IncompatibleShardingMetadata, + uassert(StaleConfigInfo(_args.getNss(), + ChunkVersion::IGNORED() /* receivedVersion */, + ChunkVersion::UNSHARDED() /* wantedVersion */, + shardId, + boost::none), "Cannot move chunks for an unsharded collection", metadata.isSharded()); @@ -227,26 +234,47 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, const auto collectionVersion = collectionMetadata.getCollVersion(); const auto shardVersion = collectionMetadata.getShardVersion(); - // If the shard major version is zero, this means we do not have any chunks locally to migrate - uassert(ErrorCodes::IncompatibleShardingMetadata, - str::stream() << "cannot move chunk " << _args.toString() - << " because the shard doesn't contain any chunks", - shardVersion.majorVersion() > 0); - - uassert(ErrorCodes::StaleEpoch, + uassert(StaleConfigInfo(_args.getNss(), + ChunkVersion::IGNORED() /* receivedVersion */, + shardVersion /* wantedVersion */, + shardId, + boost::none), str::stream() << "cannot move chunk " << _args.toString() << " because collection may have been dropped. " << "current epoch: " << collectionVersion.epoch() << ", cmd epoch: " << _args.getVersionEpoch(), _args.getVersionEpoch() == collectionVersion.epoch()); - ChunkType chunkToMove; - chunkToMove.setMin(_args.getMinKey()); - chunkToMove.setMax(_args.getMaxKey()); + uassert(StaleConfigInfo(_args.getNss(), + ChunkVersion::IGNORED() /* receivedVersion */, + shardVersion /* wantedVersion */, + shardId, + boost::none), + str::stream() << "cannot move chunk " << _args.toString() + << " because the shard doesn't contain any chunks", + shardVersion.majorVersion() > 0); - uassertStatusOKWithContext(collectionMetadata.checkChunkIsValid(chunkToMove), - str::stream() << "Unable to move chunk with arguments '" - << redact(_args.toString())); + ChunkType existingChunk; + uassert(StaleConfigInfo(_args.getNss(), + ChunkVersion::IGNORED() /* receivedVersion */, + shardVersion /* wantedVersion */, + shardId, + boost::none), + str::stream() << "Chunk with bounds " + << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() + << " is not owned by this shard.", + collectionMetadata.getNextChunk(_args.getMinKey(), &existingChunk)); + uassert(StaleConfigInfo(_args.getNss(), + ChunkVersion::IGNORED() /* receivedVersion */, + shardVersion /* wantedVersion */, + shardId, + boost::none), + str::stream() << "Chunk with bounds " + << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() + << " does not exist. The closest owned chunk is " + << ChunkRange(existingChunk.getMin(), existingChunk.getMax()).toString(), + existingChunk.getMin().woCompare(_args.getMinKey()) == 0 && + existingChunk.getMax().woCompare(_args.getMaxKey()) == 0); _collectionEpoch = collectionVersion.epoch(); _collectionUUID = collectionUUID; diff --git a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp index d7253272b62..7e3d89d58f0 100644 --- a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp +++ b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp @@ -80,6 +80,13 @@ ScopedOperationCompletionShardingActions::~ScopedOperationCompletionShardingActi } if (staleInfo->getVersionWanted() && + ChunkVersion::isIgnoredVersion(staleInfo->getVersionReceived())) { + // Shard is recovered, but the router didn't sent a shard version, therefore we just + // need to tell the router how much it needs to advance to (getVersionWanted). + return; + } + + if (staleInfo->getVersionWanted() && staleInfo->getVersionReceived().isOlderThan(*staleInfo->getVersionWanted())) { // Shard is recovered and the router is staler than the shard return; diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index 7dd97367749..ec7eccf6c9d 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -1698,6 +1698,14 @@ Future<void> ExecCommandDatabase::_commandExec() { } if (sce->getVersionWanted() && + ChunkVersion::isIgnoredVersion(sce->getVersionReceived())) { + // Shard is recovered, but the router didn't sent a shard version, therefore + // we just need to tell the router how much it needs to advance to + // (getVersionWanted). + return s; + } + + if (sce->getVersionWanted() && sce->getVersionReceived().isOlderThan(*sce->getVersionWanted())) { // Shard is recovered and the router is staler than the shard return s; diff --git a/src/mongo/executor/network_interface_tl.cpp b/src/mongo/executor/network_interface_tl.cpp index 1142ef6eec5..555b3677de4 100644 --- a/src/mongo/executor/network_interface_tl.cpp +++ b/src/mongo/executor/network_interface_tl.cpp @@ -893,9 +893,8 @@ void NetworkInterfaceTL::RequestState::resolve(Future<RemoteCommandResponse> fut const auto commandStatus = getStatusFromCommandResult(response.data); if (isHedge) { - // Ignore maxTimeMS expiration, StaleDbVersion or any error belonging to - // StaleShardVersionError - // error category for hedged reads without triggering the finish line. + // Ignore maxTimeMS expiration or any sharding "retargeting needed" error category + // for hedged reads without triggering the finish line. if (commandStatus == ErrorCodes::MaxTimeMSExpired || commandStatus == ErrorCodes::StaleDbVersion || ErrorCodes::isStaleShardVersionError(commandStatus)) { diff --git a/src/mongo/s/transaction_router.h b/src/mongo/s/transaction_router.h index 9da80bd9afd..276a473aead 100644 --- a/src/mongo/s/transaction_router.h +++ b/src/mongo/s/transaction_router.h @@ -269,8 +269,8 @@ public: * The first command in a transaction to target at least one shard must select a cluster time * timestamp before targeting, but may change the timestamp before contacting any shards to * allow optimizing the timestamp based on the targeted shards. If the first command encounters - * a retryable error, e.g. StaleShardVersion or SnapshotTooOld, the retry may also select a new - * timestamp. Once the first command has successfully completed, the timestamp cannot be + * a retryable error, e.g. "retargeting needed" or SnapshotTooOld, the retry may also select a + * new timestamp. Once the first command has successfully completed, the timestamp cannot be * changed. */ class AtClusterTime { diff --git a/src/mongo/s/write_ops/batch_write_exec.h b/src/mongo/s/write_ops/batch_write_exec.h index 86ce5d7508b..3bd4a135fe9 100644 --- a/src/mongo/s/write_ops/batch_write_exec.h +++ b/src/mongo/s/write_ops/batch_write_exec.h @@ -103,7 +103,7 @@ public: // Number of round trips required for the batch int numRounds; - // Number of stale batches due to StaleShardVersion + // Number of stale batches due to "retargeting needed" errors int numStaleShardBatches; // Number of stale batches due to StaleDbVersion int numStaleDbBatches; diff --git a/src/mongo/s/write_ops/batch_write_op.cpp b/src/mongo/s/write_ops/batch_write_op.cpp index 41b36adc7eb..c31b56992d6 100644 --- a/src/mongo/s/write_ops/batch_write_op.cpp +++ b/src/mongo/s/write_ops/batch_write_op.cpp @@ -472,8 +472,8 @@ Status BatchWriteOp::targetBatch( // For unordered writes, the router must return an entry for each failed write. This // constant is a pessimistic attempt to ensure that if a request to a shard hits - // StaleShardVersion and has to return number of errors equivalent to the number of writes - // in the batch, the response size will not exceed the max BSON size. + // "retargeting needed" error and has to return number of errors equivalent to the number of + // writes in the batch, the response size will not exceed the max BSON size. // // The constant of 272 is chosen as an approximation of the size of the BSON representation // of the StaleConfigInfo (which contains the shard id) and the adjacent error message. |