diff options
author | Tommaso Tocci <tommaso.tocci@mongodb.com> | 2021-04-30 20:10:58 +0200 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-05-03 13:11:20 +0000 |
commit | 85dad3bf26622966b7c144751e14d0d6983bc75d (patch) | |
tree | 3f1f741fa941f037a4a803e878c01b9915ed0290 | |
parent | 6e3c93bd5b41a69edfaf55da395fd6672423ad7a (diff) | |
download | mongo-85dad3bf26622966b7c144751e14d0d6983bc75d.tar.gz |
SERVER-56560 Avoid thread scheduling deadlock in create collection coordinator
-rw-r--r-- | src/mongo/db/s/create_collection_coordinator.cpp | 21 | ||||
-rw-r--r-- | src/mongo/db/s/create_collection_coordinator.h | 7 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_ddl_coordinator.cpp | 31 |
3 files changed, 32 insertions, 27 deletions
diff --git a/src/mongo/db/s/create_collection_coordinator.cpp b/src/mongo/db/s/create_collection_coordinator.cpp index 351ac3df87c..27e26e51012 100644 --- a/src/mongo/db/s/create_collection_coordinator.cpp +++ b/src/mongo/db/s/create_collection_coordinator.cpp @@ -49,13 +49,10 @@ #include "mongo/s/cluster_write.h" #include "mongo/s/grid.h" #include "mongo/s/request_types/shard_collection_gen.h" -#include "mongo/util/future_util.h" namespace mongo { namespace { -const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max()); - struct OptionsAndIndexes { BSONObj options; std::vector<BSONObj> indexSpecs; @@ -471,14 +468,14 @@ ExecutorFuture<void> CreateCollectionCoordinator::_runImpl( opCtx, nss(), _critSecReason, ShardingCatalogClient::kMajorityWriteConcern); _createCollectionOnNonPrimaryShards(opCtx); - _commitWithRetries(executor, token).get(opCtx); + _commit(opCtx); } sharding_ddl_util::releaseRecoverableCriticalSection( opCtx, nss(), _critSecReason, ShardingCatalogClient::kMajorityWriteConcern); if (!_splitPolicy->isOptimized()) { - _commitWithRetries(executor, token).get(opCtx); + _commit(opCtx); } _finalize(opCtx); @@ -741,20 +738,6 @@ void CreateCollectionCoordinator::_commit(OperationContext* opCtx) { updateCatalogEntry(opCtx, nss(), coll); } -ExecutorFuture<void> CreateCollectionCoordinator::_commitWithRetries( - std::shared_ptr<executor::ScopedTaskExecutor> executor, const CancellationToken& token) { - return AsyncTry([this] { - auto opCtxHolder = cc().makeOperationContext(); - auto* opCtx = opCtxHolder.get(); - getForwardableOpMetadata().setOn(opCtx); - - _commit(opCtx); - }) - .until([token](Status status) { return status.isOK() || token.isCanceled(); }) - .withBackoffBetweenIterations(kExponentialBackoff) - .on(**executor, CancellationToken::uncancelable()); -} - void CreateCollectionCoordinator::_finalize(OperationContext* opCtx) noexcept { LOGV2_DEBUG(5277907, 2, "Create collection _finalize", "namespace"_attr = nss()); diff --git 
a/src/mongo/db/s/create_collection_coordinator.h b/src/mongo/db/s/create_collection_coordinator.h index 9171e2a6b80..dc8c81b1b86 100644 --- a/src/mongo/db/s/create_collection_coordinator.h +++ b/src/mongo/db/s/create_collection_coordinator.h @@ -125,13 +125,6 @@ private: */ void _finalize(OperationContext* opCtx) noexcept; - /** - * Executes _commit with an exponential backoff and retries if the commit failed due to a - * stepdown error. - */ - ExecutorFuture<void> _commitWithRetries(std::shared_ptr<executor::ScopedTaskExecutor> executor, - const CancellationToken& token); - CreateCollectionCoordinatorDocument _doc; const BSONObj _critSecReason; diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp index 5e05dae0a09..a8179143790 100644 --- a/src/mongo/db/s/sharding_ddl_coordinator.cpp +++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp @@ -42,8 +42,14 @@ #include "mongo/logv2/log.h" #include "mongo/s/grid.h" #include "mongo/s/write_ops/batched_command_response.h" +#include "mongo/util/future_util.h" namespace mongo { +namespace { + +const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max()); + +} ShardingDDLCoordinatorMetadata extractShardingDDLCoordinatorMetadata(const BSONObj& coorDoc) { return ShardingDDLCoordinatorMetadata::parse( @@ -170,7 +176,30 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas return status; }) .then([this, executor, token, anchor = shared_from_this()] { - return _runImpl(executor, token); + return AsyncTry([this, executor, token] { return _runImpl(executor, token); }) + .until([this, token](Status status) { + // Retry until either: + // - The coordinator succeeds + // - The coordinator fails with a non-retryable error + // - The node is stepping/shutting down + // + // If the token is not cancelled we retry stepdown errors because it could have + // been generated by a remote node. 
+ if (!status.isOK() && + (status.isA<ErrorCategory::NotPrimaryError>() || + status.isA<ErrorCategory::ShutdownError>()) && + !token.isCanceled()) { + LOGV2_DEBUG(5656000, + 1, + "Re-executing sharding DDL coordinator", + "coordinatorId"_attr = _coorMetadata.getId(), + "reason"_attr = redact(status)); + return false; + } + return true; + }) + .withBackoffBetweenIterations(kExponentialBackoff) + .on(**executor, CancellationToken::uncancelable()); }) .onCompletion([this, anchor = shared_from_this()](const Status& status) { auto opCtxHolder = cc().makeOperationContext(); |