author     Allison Easton <allison.easton@mongodb.com>        2023-01-31 10:45:02 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>   2023-01-31 11:21:41 +0000
commit     f5e0d58f5f7ae5971ec7be2646cb497793a4ff0e (patch)
tree       4e7ffdca8fa54fe6aaf4f50c54532d520e22ff60
parent     61c3feb5d0da0695dbff46554ad3e1dc68a59352 (diff)
download   mongo-f5e0d58f5f7ae5971ec7be2646cb497793a4ff0e.tar.gz
SERVER-70321 Collmod coordinator must not resume migrations on retriable errors
-rw-r--r--  src/mongo/db/s/collmod_coordinator.cpp       | 165
-rw-r--r--  src/mongo/db/s/sharding_ddl_coordinator.cpp  |  23
-rw-r--r--  src/mongo/db/s/sharding_ddl_coordinator.h    |   5
-rw-r--r--  src/mongo/db/s/sharding_ddl_util.cpp         |  20
-rw-r--r--  src/mongo/db/s/sharding_ddl_util.h           |   6
5 files changed, 136 insertions(+), 83 deletions(-)
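
This commit changes when the collMod coordinator resumes migrations on the target collection: migrations are resumed only after the sharded collMod completes, or when it fails with a non-retriable error. On retriable errors (stepdowns, shutdowns, lock busy, and similar) the coordinator will re-execute, so migrations must stay blocked until the retry finishes. A minimal sketch of the resulting pattern follows; doShardedCollMod(), nss and collUUID are placeholders for illustration, not the coordinator's real members.

    // Hedged sketch of the error-handling pattern in the collmod coordinator diff below.
    try {
        doShardedCollMod(opCtx);  // placeholder for sending collMod to all shards owning chunks
        // Success: the command finished on every shard, so migrations can be unblocked.
        sharding_ddl_util::resumeMigrations(opCtx, nss, collUUID);
    } catch (DBException& ex) {
        if (!_isRetriableErrorForDDLCoordinator(ex.toStatus())) {
            // The coordinator will not re-execute, so undo the side effect before surfacing
            // the error to the caller.
            sharding_ddl_util::resumeMigrations(opCtx, nss, collUUID);
        }
        // Retriable errors propagate with migrations still blocked; the retried execution is
        // responsible for resuming them.
        throw;
    }
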
diff --git a/src/mongo/db/s/collmod_coordinator.cpp b/src/mongo/db/s/collmod_coordinator.cpp
index 3f4a44475d8..c1abec270a5 100644
--- a/src/mongo/db/s/collmod_coordinator.cpp
+++ b/src/mongo/db/s/collmod_coordinator.cpp
@@ -135,7 +135,13 @@ void CollModCoordinator::_enterPhase(Phase newPhase) {
if (_doc.getPhase() == Phase::kUnset) {
newDoc = _insertStateDocument(std::move(newDoc));
} else {
- newDoc = _updateStateDocument(cc().makeOperationContext().get(), std::move(newDoc));
+ ServiceContext::UniqueOperationContext uniqueOpCtx;
+ auto opCtx = cc().getOperationContext();
+ if (!opCtx) {
+ uniqueOpCtx = cc().makeOperationContext();
+ opCtx = uniqueOpCtx.get();
+ }
+ newDoc = _updateStateDocument(opCtx, std::move(newDoc));
}
{
@@ -304,74 +310,99 @@ ExecutorFuture<void> CollModCoordinator::_runImpl(
_saveShardingInfoOnCoordinatorIfNecessary(opCtx);
if (_collInfo->isSharded) {
- ShardsvrCollModParticipant request(nss(), _request);
- bool needsUnblock =
- _collInfo->timeSeriesOptions && hasTimeSeriesGranularityUpdate(_request);
- request.setNeedsUnblock(needsUnblock);
-
- std::vector<AsyncRequestsSender::Response> responses;
- auto shardsOwningChunks = _shardingInfo->shardsOwningChunks;
- auto primaryShardOwningChunk = std::find(shardsOwningChunks.begin(),
- shardsOwningChunks.end(),
- _shardingInfo->primaryShard);
-
- // If trying to convert an index to unique, executes a dryRun first to find any
- // duplicates without actually changing the indexes to avoid inconsistent index
- // specs on different shards.
- // Example:
- // Shard0: {_id: 0, a: 1}
- // Shard1: {_id: 1, a: 2}, {_id: 2, a: 2}
- // When trying to convert index {a: 1} to unique, the dry run will return the
- // duplicate errors to the user without converting the indexes.
- if (isCollModIndexUniqueConversion(_request)) {
- // The 'dryRun' option only works with 'unique' index option. We need to
- // strip out other incompatible options.
- auto dryRunRequest =
- ShardsvrCollModParticipant{nss(), makeCollModDryRunRequest(_request)};
- sharding_ddl_util::sendAuthenticatedCommandToShards(
- opCtx,
- nss().db(),
- CommandHelpers::appendMajorityWriteConcern(dryRunRequest.toBSON({})),
- shardsOwningChunks,
- **executor);
- }
-
- // A view definition will only be present on the primary shard. So we pass an
- // addition 'performViewChange' flag only to the primary shard.
- if (primaryShardOwningChunk != shardsOwningChunks.end()) {
- request.setPerformViewChange(true);
- const auto& primaryResponse =
+ try {
+ if (!_firstExecution) {
+ bool allowMigrations = sharding_ddl_util::checkAllowMigrations(
+ opCtx, _collInfo->nsForTargeting);
+ if (_result.is_initialized() && allowMigrations) {
+ // The command finished and we have the response. Return it.
+ return;
+ } else if (allowMigrations) {
+ // Previous run on a different node completed, but we lost the
+ // result in the stepdown. Restart from the stage in which we disallow
+ // migrations.
+ _enterPhase(Phase::kBlockShards);
+ uasserted(ErrorCodes::Interrupted,
+ "Retriable error to move to previous stage");
+ }
+ }
+
+ ShardsvrCollModParticipant request(nss(), _request);
+ bool needsUnblock = _collInfo->timeSeriesOptions &&
+ hasTimeSeriesGranularityUpdate(_request);
+ request.setNeedsUnblock(needsUnblock);
+
+ std::vector<AsyncRequestsSender::Response> responses;
+ auto shardsOwningChunks = _shardingInfo->shardsOwningChunks;
+ auto primaryShardOwningChunk = std::find(shardsOwningChunks.begin(),
+ shardsOwningChunks.end(),
+ _shardingInfo->primaryShard);
+
+ // If trying to convert an index to unique, executes a dryRun first to find
+ // any duplicates without actually changing the indexes to avoid
+ // inconsistent index specs on different shards. Example:
+ // Shard0: {_id: 0, a: 1}
+ // Shard1: {_id: 1, a: 2}, {_id: 2, a: 2}
+ // When trying to convert index {a: 1} to unique, the dry run will return
+ // the duplicate errors to the user without converting the indexes.
+ if (isCollModIndexUniqueConversion(_request)) {
+ // The 'dryRun' option only works with 'unique' index option. We need to
+ // strip out other incompatible options.
+ auto dryRunRequest = ShardsvrCollModParticipant{
+ nss(), makeCollModDryRunRequest(_request)};
+ sharding_ddl_util::sendAuthenticatedCommandToShards(
+ opCtx,
+ nss().db(),
+ CommandHelpers::appendMajorityWriteConcern(
+ dryRunRequest.toBSON({})),
+ shardsOwningChunks,
+ **executor);
+ }
+
+ // A view definition will only be present on the primary shard. So we pass
+ // an additional 'performViewChange' flag only to the primary shard.
+ if (primaryShardOwningChunk != shardsOwningChunks.end()) {
+ request.setPerformViewChange(true);
+ const auto& primaryResponse =
+ sharding_ddl_util::sendAuthenticatedCommandToShards(
+ opCtx,
+ nss().db(),
+ CommandHelpers::appendMajorityWriteConcern(request.toBSON({})),
+ {_shardingInfo->primaryShard},
+ **executor);
+ responses.insert(
+ responses.end(), primaryResponse.begin(), primaryResponse.end());
+ shardsOwningChunks.erase(primaryShardOwningChunk);
+ }
+
+ request.setPerformViewChange(false);
+ const auto& secondaryResponses =
sharding_ddl_util::sendAuthenticatedCommandToShards(
opCtx,
nss().db(),
CommandHelpers::appendMajorityWriteConcern(request.toBSON({})),
- {_shardingInfo->primaryShard},
+ shardsOwningChunks,
**executor);
responses.insert(
- responses.end(), primaryResponse.begin(), primaryResponse.end());
- shardsOwningChunks.erase(primaryShardOwningChunk);
- }
-
- request.setPerformViewChange(false);
- const auto& secondaryResponses =
- sharding_ddl_util::sendAuthenticatedCommandToShards(
- opCtx,
- nss().db(),
- CommandHelpers::appendMajorityWriteConcern(request.toBSON({})),
- shardsOwningChunks,
- **executor);
- responses.insert(
- responses.end(), secondaryResponses.begin(), secondaryResponses.end());
-
- BSONObjBuilder builder;
- std::string errmsg;
- auto ok = appendRawResponses(opCtx, &errmsg, &builder, responses).responseOK;
- if (!errmsg.empty()) {
- CommandHelpers::appendSimpleCommandStatus(builder, ok, errmsg);
+ responses.end(), secondaryResponses.begin(), secondaryResponses.end());
+
+ BSONObjBuilder builder;
+ std::string errmsg;
+ auto ok =
+ appendRawResponses(opCtx, &errmsg, &builder, responses).responseOK;
+ if (!errmsg.empty()) {
+ CommandHelpers::appendSimpleCommandStatus(builder, ok, errmsg);
+ }
+ _result = builder.obj();
+ sharding_ddl_util::resumeMigrations(
+ opCtx, _collInfo->nsForTargeting, _doc.getCollUUID());
+ } catch (DBException& ex) {
+ if (!_isRetriableErrorForDDLCoordinator(ex.toStatus())) {
+ sharding_ddl_util::resumeMigrations(
+ opCtx, _collInfo->nsForTargeting, _doc.getCollUUID());
+ }
+ throw;
}
- _result = builder.obj();
- sharding_ddl_util::resumeMigrations(
- opCtx, _collInfo->nsForTargeting, _doc.getCollUUID());
} else {
CollMod cmd(nss());
cmd.setCollModRequest(_request);
@@ -399,16 +430,6 @@ ExecutorFuture<void> CollModCoordinator::_runImpl(
"Error running collMod",
"namespace"_attr = nss(),
"error"_attr = redact(status));
- // If we have the collection UUID set, this error happened in a sharded collection,
- // we should restore the migrations.
- if (_doc.getCollUUID()) {
- auto opCtxHolder = cc().makeOperationContext();
- auto* opCtx = opCtxHolder.get();
- getForwardableOpMetadata().setOn(opCtx);
-
- sharding_ddl_util::resumeMigrations(
- opCtx, _collInfo->nsForTargeting, _doc.getCollUUID());
- }
}
return status;
});
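
Because a retry may run on a different node after a stepdown, the coordinator can no longer assume it knows whether a previous attempt already resumed migrations. The !_firstExecution branch added above asks the config server: if allowMigrations is already true and the cached result is still present, the work is done; if allowMigrations is true but the result was lost with the stepdown, the coordinator moves back to the phase that blocks migrations and redoes the work. A commented sketch of that decision, reusing the names from the hunk above:

    // Sketch only; mirrors the recovery logic added to CollModCoordinator::_runImpl above.
    if (!_firstExecution) {
        const bool allowMigrations =
            sharding_ddl_util::checkAllowMigrations(opCtx, _collInfo->nsForTargeting);
        if (_result.is_initialized() && allowMigrations) {
            return;  // A previous attempt finished and its response survived; reuse it.
        }
        if (allowMigrations) {
            // A previous attempt finished on another node, but its response was lost in the
            // stepdown. Re-enter the phase that blocks migrations and redo the sharded collMod.
            _enterPhase(Phase::kBlockShards);
            uasserted(ErrorCodes::Interrupted, "Retriable error to move to previous stage");
        }
    }
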
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp
index 190e4717666..ddbba410072 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.cpp
+++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp
@@ -57,7 +57,7 @@ namespace {
const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
-}
+} // namespace
ShardingDDLCoordinatorMetadata extractShardingDDLCoordinatorMetadata(const BSONObj& coorDoc) {
return ShardingDDLCoordinatorMetadata::parse(
@@ -291,16 +291,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
// If the token is not cancelled we retry because it could have been generated
// by a remote node.
if (!status.isOK() && !_completeOnError &&
- (_mustAlwaysMakeProgress() ||
- status.isA<ErrorCategory::CursorInvalidatedError>() ||
- status.isA<ErrorCategory::ShutdownError>() ||
- status.isA<ErrorCategory::RetriableError>() ||
- status.isA<ErrorCategory::CancellationError>() ||
- status.isA<ErrorCategory::ExceededTimeLimitError>() ||
- status.isA<ErrorCategory::WriteConcernError>() ||
- status == ErrorCodes::FailedToSatisfyReadPreference ||
- status == ErrorCodes::Interrupted || status == ErrorCodes::LockBusy ||
- status == ErrorCodes::CommandNotFound) &&
+ (_mustAlwaysMakeProgress() || _isRetriableErrorForDDLCoordinator(status)) &&
!token.isCanceled()) {
LOGV2_INFO(5656000,
"Re-executing sharding DDL coordinator",
@@ -414,4 +405,14 @@ void ShardingDDLCoordinator::_performNoopRetryableWriteOnAllShardsAndConfigsvr(
sharding_ddl_util::performNoopRetryableWriteOnShards(opCtx, shardsAndConfigsvr, osi, executor);
}
+bool ShardingDDLCoordinator::_isRetriableErrorForDDLCoordinator(const Status& status) {
+ return status.isA<ErrorCategory::CursorInvalidatedError>() ||
+ status.isA<ErrorCategory::ShutdownError>() || status.isA<ErrorCategory::RetriableError>() ||
+ status.isA<ErrorCategory::CancellationError>() ||
+ status.isA<ErrorCategory::ExceededTimeLimitError>() ||
+ status.isA<ErrorCategory::WriteConcernError>() ||
+ status == ErrorCodes::FailedToSatisfyReadPreference || status == ErrorCodes::Interrupted ||
+ status == ErrorCodes::LockBusy || status == ErrorCodes::CommandNotFound;
+}
+
} // namespace mongo
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.h b/src/mongo/db/s/sharding_ddl_coordinator.h
index 5972c7ce9e6..f18b3b0ff5d 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.h
+++ b/src/mongo/db/s/sharding_ddl_coordinator.h
@@ -203,6 +203,11 @@ protected:
return false;
};
+ /*
+ * Specifies whether the given error will be retried by the DDL coordinator infrastructure.
+ */
+ bool _isRetriableErrorForDDLCoordinator(const Status& status);
+
ShardingDDLCoordinatorService* _service;
const ShardingDDLCoordinatorId _coordId;
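
The list of error categories and codes that run() previously checked inline is now the protected helper _isRetriableErrorForDDLCoordinator(), so subclasses such as the collMod coordinator apply the same definition of "retriable" when deciding whether to run compensating cleanup. An illustrative check follows, assuming it executes inside a coordinator method and that InterruptedDueToReplStateChange carries the RetriableError category:

    // Illustrative only: how the shared predicate classifies two representative statuses.
    const Status stepdown{ErrorCodes::InterruptedDueToReplStateChange, "node stepped down"};
    const Status badOptions{ErrorCodes::InvalidOptions, "invalid collMod options"};
    invariant(_isRetriableErrorForDDLCoordinator(stepdown));     // coordinator re-executes
    invariant(!_isRetriableErrorForDDLCoordinator(badOptions));  // error surfaces to the caller
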
diff --git a/src/mongo/db/s/sharding_ddl_util.cpp b/src/mongo/db/s/sharding_ddl_util.cpp
index 777cf40c858..47b6eb84732 100644
--- a/src/mongo/db/s/sharding_ddl_util.cpp
+++ b/src/mongo/db/s/sharding_ddl_util.cpp
@@ -548,6 +548,26 @@ void resumeMigrations(OperationContext* opCtx,
setAllowMigrations(opCtx, nss, expectedCollectionUUID, true);
}
+bool checkAllowMigrations(OperationContext* opCtx, const NamespaceString& nss) {
+ auto collDoc =
+ uassertStatusOK(Grid::get(opCtx)->shardRegistry()->getConfigShard()->exhaustiveFindOnConfig(
+ opCtx,
+ ReadPreferenceSetting(ReadPreference::PrimaryOnly, TagSet{}),
+ repl::ReadConcernLevel::kMajorityReadConcern,
+ CollectionType::ConfigNS,
+ BSON(CollectionType::kNssFieldName << nss.ns()),
+ BSONObj(),
+ 1))
+ .docs;
+
+ uassert(ErrorCodes::NamespaceNotFound,
+ str::stream() << "collection " << nss.ns() << " not found",
+ !collDoc.empty());
+
+ auto coll = CollectionType(collDoc[0]);
+ return coll.getAllowMigrations();
+}
+
boost::optional<UUID> getCollectionUUID(OperationContext* opCtx,
const NamespaceString& nss,
bool allowViews) {
diff --git a/src/mongo/db/s/sharding_ddl_util.h b/src/mongo/db/s/sharding_ddl_util.h
index dd02fe5035a..cbb1cc67b54 100644
--- a/src/mongo/db/s/sharding_ddl_util.h
+++ b/src/mongo/db/s/sharding_ddl_util.h
@@ -166,6 +166,12 @@ void resumeMigrations(OperationContext* opCtx,
const NamespaceString& nss,
const boost::optional<UUID>& expectedCollectionUUID);
+/**
+ * Queries the config server primary for the collection document of the given nss.
+ * Returns the value of the allowMigrations flag on the collection document.
+ */
+bool checkAllowMigrations(OperationContext* opCtx, const NamespaceString& nss);
+
/*
* Returns the UUID of the collection (if exists) using the catalog. It does not provide any locking
* guarantees after the call.
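
The new sharding_ddl_util::checkAllowMigrations() helper reads the collection document from config.collections on the config server primary with majority read concern and returns its allowMigrations flag, throwing NamespaceNotFound if no document exists. A hedged usage sketch, assuming opCtx is in scope and using an illustrative namespace:

    // Sketch: deciding whether a previous coordinator attempt already resumed migrations.
    const NamespaceString nss("test", "coll");  // illustrative namespace
    try {
        if (sharding_ddl_util::checkAllowMigrations(opCtx, nss)) {
            // Migrations are allowed again, so a previous attempt already ran its cleanup.
        }
    } catch (const ExceptionFor<ErrorCodes::NamespaceNotFound>&) {
        // The collection document is gone from config.collections (e.g. the collection
        // was dropped), so there is nothing to check.
    }
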