summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaloian Manassiev <kaloian.manassiev@mongodb.com>2022-03-11 13:46:49 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-03-11 14:21:00 +0000
commit8ff255e3b85a0f4c74c2fa3842e292076926c6ff (patch)
tree695ca135ad5041761960255f7547368953ca45db
parentf690e6856e9cbc7ee1bc3bcf20dfa4eed6621e2a (diff)
downloadmongo-8ff255e3b85a0f4c74c2fa3842e292076926c6ff.tar.gz
SERVER-63327 Remove usages of StaleShardVersion from the migrations path
-rw-r--r--src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp18
-rw-r--r--src/mongo/db/s/collection_metadata.cpp24
-rw-r--r--src/mongo/db/s/collection_metadata.h5
-rw-r--r--src/mongo/db/s/collection_sharding_runtime.cpp15
-rw-r--r--src/mongo/db/s/collection_sharding_state.h6
-rw-r--r--src/mongo/db/s/migration_destination_manager_legacy_commands.cpp14
-rw-r--r--src/mongo/db/s/migration_source_manager.cpp62
-rw-r--r--src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp7
-rw-r--r--src/mongo/db/service_entry_point_common.cpp8
-rw-r--r--src/mongo/executor/network_interface_tl.cpp5
-rw-r--r--src/mongo/s/transaction_router.h4
-rw-r--r--src/mongo/s/write_ops/batch_write_exec.h2
-rw-r--r--src/mongo/s/write_ops/batch_write_op.cpp4
13 files changed, 100 insertions, 74 deletions
diff --git a/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp b/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp
index 0b1d26b0296..bb891bcff60 100644
--- a/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp
+++ b/src/mongo/db/s/balancer/balancer_defragmentation_policy_impl.cpp
@@ -90,9 +90,21 @@ ZoneInfo getCollectionZones(OperationContext* opCtx, const CollectionType& coll)
return zones;
}
-bool isRetriableForDefragmentation(const Status& error) {
- return (ErrorCodes::isA<ErrorCategory::RetriableError>(error) ||
- error == ErrorCodes::StaleConfig);
+bool isRetriableForDefragmentation(const Status& status) {
+ if (ErrorCodes::isA<ErrorCategory::RetriableError>(status))
+ return true;
+
+ if (status == ErrorCodes::StaleConfig) {
+ if (auto staleInfo = status.extraInfo<StaleConfigInfo>()) {
+ // If the staleInfo error contains a "wanted" version, this means the donor shard which
+ // returned this error has its versioning information up-to-date (as opposed to UNKNOWN)
+ // and it couldn't find the chunk that the defragmenter expected. Such a situation can
+ // only arise as a result of manual split/merge/move concurrently with the defragmenter.
+ return !staleInfo->getVersionWanted();
+ }
+ }
+
+ return false;
}
void handleActionResult(OperationContext* opCtx,
diff --git a/src/mongo/db/s/collection_metadata.cpp b/src/mongo/db/s/collection_metadata.cpp
index 39933678d66..f68c39ef0ce 100644
--- a/src/mongo/db/s/collection_metadata.cpp
+++ b/src/mongo/db/s/collection_metadata.cpp
@@ -181,30 +181,6 @@ bool CollectionMetadata::getNextChunk(const BSONObj& lookupKey, ChunkType* chunk
return true;
}
-Status CollectionMetadata::checkChunkIsValid(const ChunkType& chunk) const {
- invariant(isSharded());
-
- ChunkType existingChunk;
-
- if (!getNextChunk(chunk.getMin(), &existingChunk)) {
- return {ErrorCodes::StaleShardVersion,
- str::stream() << "Chunk with bounds "
- << ChunkRange(chunk.getMin(), chunk.getMax()).toString()
- << " is not owned by this shard."};
- }
-
- if (existingChunk.getMin().woCompare(chunk.getMin()) ||
- existingChunk.getMax().woCompare(chunk.getMax())) {
- return {ErrorCodes::StaleShardVersion,
- str::stream() << "Unable to find chunk with the exact bounds "
- << chunk.getRange().toString() << " at collection version "
- << getCollVersion().toString()
- << " found existing chunk: " << existingChunk.toString()};
- }
-
- return Status::OK();
-}
-
bool CollectionMetadata::currentShardHasAnyChunks() const {
invariant(isSharded());
std::set<ShardId> shards;
diff --git a/src/mongo/db/s/collection_metadata.h b/src/mongo/db/s/collection_metadata.h
index 4cf162648b5..84ff44a6c90 100644
--- a/src/mongo/db/s/collection_metadata.h
+++ b/src/mongo/db/s/collection_metadata.h
@@ -204,11 +204,6 @@ public:
bool getNextChunk(const BSONObj& lookupKey, ChunkType* chunk) const;
/**
- * Validates that the passed-in chunk's bounds exactly match a chunk in the metadata cache.
- */
- Status checkChunkIsValid(const ChunkType& chunk) const;
-
- /**
* Returns true if the argument range overlaps any chunk.
*/
bool rangeOverlapsChunk(const ChunkRange& range) const {
diff --git a/src/mongo/db/s/collection_sharding_runtime.cpp b/src/mongo/db/s/collection_sharding_runtime.cpp
index 1f40b068e86..fa4e68cc9af 100644
--- a/src/mongo/db/s/collection_sharding_runtime.cpp
+++ b/src/mongo/db/s/collection_sharding_runtime.cpp
@@ -29,8 +29,6 @@
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding
-#include "mongo/platform/basic.h"
-
#include "mongo/db/s/collection_sharding_runtime.h"
#include "mongo/base/checked_cast.h"
@@ -136,8 +134,8 @@ ScopedCollectionDescription CollectionShardingRuntime::getCollectionDescription(
auto optMetadata = _getCurrentMetadataIfKnown(boost::none);
uassert(
StaleConfigInfo(_nss,
- ChunkVersion::UNSHARDED(),
- boost::none,
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ boost::none /* wantedVersion */,
ShardingState::get(_serviceContext)->shardId()),
str::stream() << "sharding status of collection " << _nss.ns()
<< " is not currently available for description and needs to be recovered "
@@ -347,9 +345,10 @@ CollectionShardingRuntime::_getMetadataWithVersionCheckAt(
auto csrLock = CSRLock::lockShared(opCtx, this);
auto optCurrentMetadata = _getCurrentMetadataIfKnown(atClusterTime);
-
- uassert(StaleConfigInfo(
- _nss, receivedShardVersion, boost::none, ShardingState::get(opCtx)->shardId()),
+ uassert(StaleConfigInfo(_nss,
+ receivedShardVersion,
+ boost::none /* wantedVersion */,
+ ShardingState::get(opCtx)->shardId()),
str::stream() << "sharding status of collection " << _nss.ns()
<< " is not currently known and needs to be recovered",
optCurrentMetadata);
@@ -362,7 +361,7 @@ CollectionShardingRuntime::_getMetadataWithVersionCheckAt(
: ShardingMigrationCriticalSection::kRead);
uassert(StaleConfigInfo(_nss,
receivedShardVersion,
- boost::none,
+ boost::none /* wantedVersion */,
ShardingState::get(opCtx)->shardId(),
std::move(criticalSectionSignal)),
str::stream() << "migration commit in progress for " << _nss.ns(),
diff --git a/src/mongo/db/s/collection_sharding_state.h b/src/mongo/db/s/collection_sharding_state.h
index 53adbe24fde..575ccb85f2f 100644
--- a/src/mongo/db/s/collection_sharding_state.h
+++ b/src/mongo/db/s/collection_sharding_state.h
@@ -97,8 +97,8 @@ public:
static void appendInfoForServerStatus(OperationContext* opCtx, BSONObjBuilder* builder);
/**
- * If the shard currently doesn't know whether the collection is sharded or not, it will throw
- * StaleShardVersion.
+ * If the shard currently doesn't know whether the collection is sharded or not, it will throw a
+ * StaleConfig exception.
*
* If the request doesn't have a shard version all collections will be treated as UNSHARDED.
*
@@ -112,7 +112,7 @@ public:
*
* If the shard currently doesn't know whether the collection is sharded or not, or if the
* expected shard version doesn't match with the one in the OperationShardingState, it will
- * throw StaleShardVersion.
+ * throw a StaleConfig exception.
*
* If the operation context contains an 'atClusterTime', the returned filtering object will be
* tied to a specific point in time. Otherwise, it will reference the latest cluster time
diff --git a/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp b/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp
index 7f90fe3a6d6..8233113e409 100644
--- a/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp
+++ b/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp
@@ -29,8 +29,6 @@
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding
-#include "mongo/platform/basic.h"
-
#include "mongo/db/auth/action_set.h"
#include "mongo/db/auth/action_type.h"
#include "mongo/db/auth/authorization_session.h"
@@ -116,15 +114,19 @@ public:
// We force a refresh immediately after registering this migration to guarantee that this
// shard will not receive a chunk after refreshing.
onShardVersionMismatch(opCtx, nss, boost::none);
+ const auto shardId = ShardingState::get(opCtx)->shardId();
const auto collectionEpoch = [&] {
AutoGetCollection autoColl(opCtx, nss, MODE_IS);
auto const optMetadata =
CollectionShardingRuntime::get(opCtx, nss)->getCurrentMetadataIfKnown();
- uassert(
- ErrorCodes::StaleShardVersion,
- "Collection's metadata have been found UNKNOWN after a refresh on the recipient",
- optMetadata);
+ uassert(StaleConfigInfo(nss,
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ boost::none /* wantedVersion */,
+ shardId,
+ boost::none),
+ "The collection's sharding state was cleared by a concurrent operation",
+ optMetadata);
return optMetadata->getShardVersion().epoch();
}();
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index 52735e850c5..84598e4e1a2 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -29,8 +29,6 @@
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kShardingMigration
-#include "mongo/platform/basic.h"
-
#include "mongo/db/s/migration_source_manager.h"
#include "mongo/bson/bsonobjbuilder.h"
@@ -175,6 +173,7 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
// Make sure the latest shard version is recovered as of the time of the invocation of the
// command.
onShardVersionMismatch(_opCtx, _args.getNss(), boost::none);
+ const auto shardId = ShardingState::get(opCtx)->shardId();
// Complete any unfinished migration pending recovery
{
@@ -202,12 +201,20 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
const auto csrLock = CollectionShardingRuntime::CSRLock::lockExclusive(_opCtx, csr);
auto optMetadata = csr->getCurrentMetadataIfKnown();
- uassert(ErrorCodes::ConflictingOperationInProgress,
+ uassert(StaleConfigInfo(_args.getNss(),
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ boost::none /* wantedVersion */,
+ shardId,
+ boost::none),
"The collection's sharding state was cleared by a concurrent operation",
optMetadata);
auto& metadata = *optMetadata;
- uassert(ErrorCodes::IncompatibleShardingMetadata,
+ uassert(StaleConfigInfo(_args.getNss(),
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ ChunkVersion::UNSHARDED() /* wantedVersion */,
+ shardId,
+ boost::none),
"Cannot move chunks for an unsharded collection",
metadata.isSharded());
@@ -227,26 +234,47 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
const auto collectionVersion = collectionMetadata.getCollVersion();
const auto shardVersion = collectionMetadata.getShardVersion();
- // If the shard major version is zero, this means we do not have any chunks locally to migrate
- uassert(ErrorCodes::IncompatibleShardingMetadata,
- str::stream() << "cannot move chunk " << _args.toString()
- << " because the shard doesn't contain any chunks",
- shardVersion.majorVersion() > 0);
-
- uassert(ErrorCodes::StaleEpoch,
+ uassert(StaleConfigInfo(_args.getNss(),
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ shardVersion /* wantedVersion */,
+ shardId,
+ boost::none),
str::stream() << "cannot move chunk " << _args.toString()
<< " because collection may have been dropped. "
<< "current epoch: " << collectionVersion.epoch()
<< ", cmd epoch: " << _args.getVersionEpoch(),
_args.getVersionEpoch() == collectionVersion.epoch());
- ChunkType chunkToMove;
- chunkToMove.setMin(_args.getMinKey());
- chunkToMove.setMax(_args.getMaxKey());
+ uassert(StaleConfigInfo(_args.getNss(),
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ shardVersion /* wantedVersion */,
+ shardId,
+ boost::none),
+ str::stream() << "cannot move chunk " << _args.toString()
+ << " because the shard doesn't contain any chunks",
+ shardVersion.majorVersion() > 0);
- uassertStatusOKWithContext(collectionMetadata.checkChunkIsValid(chunkToMove),
- str::stream() << "Unable to move chunk with arguments '"
- << redact(_args.toString()));
+ ChunkType existingChunk;
+ uassert(StaleConfigInfo(_args.getNss(),
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ shardVersion /* wantedVersion */,
+ shardId,
+ boost::none),
+ str::stream() << "Chunk with bounds "
+ << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
+ << " is not owned by this shard.",
+ collectionMetadata.getNextChunk(_args.getMinKey(), &existingChunk));
+ uassert(StaleConfigInfo(_args.getNss(),
+ ChunkVersion::IGNORED() /* receivedVersion */,
+ shardVersion /* wantedVersion */,
+ shardId,
+ boost::none),
+ str::stream() << "Chunk with bounds "
+ << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
+ << " does not exist. The closest owned chunk is "
+ << ChunkRange(existingChunk.getMin(), existingChunk.getMax()).toString(),
+ existingChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
+ existingChunk.getMax().woCompare(_args.getMaxKey()) == 0);
_collectionEpoch = collectionVersion.epoch();
_collectionUUID = collectionUUID;
diff --git a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp
index d7253272b62..7e3d89d58f0 100644
--- a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp
+++ b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp
@@ -80,6 +80,13 @@ ScopedOperationCompletionShardingActions::~ScopedOperationCompletionShardingActi
}
if (staleInfo->getVersionWanted() &&
+ ChunkVersion::isIgnoredVersion(staleInfo->getVersionReceived())) {
+ // Shard is recovered, but the router didn't sent a shard version, therefore we just
+ // need to tell the router how much it needs to advance to (getVersionWanted).
+ return;
+ }
+
+ if (staleInfo->getVersionWanted() &&
staleInfo->getVersionReceived().isOlderThan(*staleInfo->getVersionWanted())) {
// Shard is recovered and the router is staler than the shard
return;
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index 7dd97367749..ec7eccf6c9d 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -1698,6 +1698,14 @@ Future<void> ExecCommandDatabase::_commandExec() {
}
if (sce->getVersionWanted() &&
+ ChunkVersion::isIgnoredVersion(sce->getVersionReceived())) {
+ // Shard is recovered, but the router didn't sent a shard version, therefore
+ // we just need to tell the router how much it needs to advance to
+ // (getVersionWanted).
+ return s;
+ }
+
+ if (sce->getVersionWanted() &&
sce->getVersionReceived().isOlderThan(*sce->getVersionWanted())) {
// Shard is recovered and the router is staler than the shard
return s;
diff --git a/src/mongo/executor/network_interface_tl.cpp b/src/mongo/executor/network_interface_tl.cpp
index 1142ef6eec5..555b3677de4 100644
--- a/src/mongo/executor/network_interface_tl.cpp
+++ b/src/mongo/executor/network_interface_tl.cpp
@@ -893,9 +893,8 @@ void NetworkInterfaceTL::RequestState::resolve(Future<RemoteCommandResponse> fut
const auto commandStatus = getStatusFromCommandResult(response.data);
if (isHedge) {
- // Ignore maxTimeMS expiration, StaleDbVersion or any error belonging to
- // StaleShardVersionError
- // error category for hedged reads without triggering the finish line.
+ // Ignore maxTimeMS expiration or any sharding "retargeting needed" error category
+ // for hedged reads without triggering the finish line.
if (commandStatus == ErrorCodes::MaxTimeMSExpired ||
commandStatus == ErrorCodes::StaleDbVersion ||
ErrorCodes::isStaleShardVersionError(commandStatus)) {
diff --git a/src/mongo/s/transaction_router.h b/src/mongo/s/transaction_router.h
index 9da80bd9afd..276a473aead 100644
--- a/src/mongo/s/transaction_router.h
+++ b/src/mongo/s/transaction_router.h
@@ -269,8 +269,8 @@ public:
* The first command in a transaction to target at least one shard must select a cluster time
* timestamp before targeting, but may change the timestamp before contacting any shards to
* allow optimizing the timestamp based on the targeted shards. If the first command encounters
- * a retryable error, e.g. StaleShardVersion or SnapshotTooOld, the retry may also select a new
- * timestamp. Once the first command has successfully completed, the timestamp cannot be
+ * a retryable error, e.g. "retargeting needed" or SnapshotTooOld, the retry may also select a
+ * new timestamp. Once the first command has successfully completed, the timestamp cannot be
* changed.
*/
class AtClusterTime {
diff --git a/src/mongo/s/write_ops/batch_write_exec.h b/src/mongo/s/write_ops/batch_write_exec.h
index 86ce5d7508b..3bd4a135fe9 100644
--- a/src/mongo/s/write_ops/batch_write_exec.h
+++ b/src/mongo/s/write_ops/batch_write_exec.h
@@ -103,7 +103,7 @@ public:
// Number of round trips required for the batch
int numRounds;
- // Number of stale batches due to StaleShardVersion
+ // Number of stale batches due to "retargeting needed" errors
int numStaleShardBatches;
// Number of stale batches due to StaleDbVersion
int numStaleDbBatches;
diff --git a/src/mongo/s/write_ops/batch_write_op.cpp b/src/mongo/s/write_ops/batch_write_op.cpp
index 41b36adc7eb..c31b56992d6 100644
--- a/src/mongo/s/write_ops/batch_write_op.cpp
+++ b/src/mongo/s/write_ops/batch_write_op.cpp
@@ -472,8 +472,8 @@ Status BatchWriteOp::targetBatch(
// For unordered writes, the router must return an entry for each failed write. This
// constant is a pessimistic attempt to ensure that if a request to a shard hits
- // StaleShardVersion and has to return number of errors equivalent to the number of writes
- // in the batch, the response size will not exceed the max BSON size.
+ // "retargeting needed" error and has to return number of errors equivalent to the number of
+ // writes in the batch, the response size will not exceed the max BSON size.
//
// The constant of 272 is chosen as an approximation of the size of the BSON representation
// of the StaleConfigInfo (which contains the shard id) and the adjacent error message.