diff options
Diffstat (limited to 'src/mongo/db/s/sharding_ddl_coordinator.cpp')
-rw-r--r-- | src/mongo/db/s/sharding_ddl_coordinator.cpp | 47 |
1 files changed, 35 insertions, 12 deletions
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp index 95bee8bcfc0..88d8ec01246 100644 --- a/src/mongo/db/s/sharding_ddl_coordinator.cpp +++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp @@ -213,7 +213,17 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas hangBeforeRunningCoordinatorInstance.pauseWhileSet(); }) - .onError([this, anchor = shared_from_this()](const Status& status) { + .onError([this, token, anchor = shared_from_this()](const Status& status) { + // The construction of a DDL coordinator recovered from disk can only fail due to + // stepdown/shutdown. + dassert(!_recoveredFromDisk || + (token.isCanceled() && + (status.isA<ErrorCategory::CancellationError>() || + status.isA<ErrorCategory::NotPrimaryError>()))); + + // Ensure coordinator cleanup if the document has not been saved. + _completeOnError = !_recoveredFromDisk; + static constexpr auto& errorMsg = "Failed to complete construction of sharding DDL coordinator"; LOGV2_ERROR( @@ -264,20 +274,30 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas auto opCtxHolder = cc().makeOperationContext(); auto* opCtx = opCtxHolder.get(); - auto completionStatus = status; + // If we are stepping down the token MUST be cancelled. Each implementation of the + // coordinator must retry remote stepping down errors, unless, we allow finalizing the + // coordinator in the presence of errors. + dassert(!(status.isA<ErrorCategory::NotPrimaryError>() || + status.isA<ErrorCategory::ShutdownError>()) || + token.isCanceled() || _completeOnError); - bool isSteppingDown = status.isA<ErrorCategory::NotPrimaryError>() || - status.isA<ErrorCategory::ShutdownError>(); + auto completionStatus = status; - // Release the coordinator only in case the node is not stepping down or in case of - // acceptable error - if (!isSteppingDown || (!status.isOK() && _completeOnError)) { - LOGV2( - 5565601, "Releasing sharding DDL coordinator", "coordinatorId"_attr = _coordId); + bool isSteppingDown = token.isCanceled(); - auto session = metadata().getSession(); + // Remove the ddl coordinator and release locks if the execution was successfull or if + // there was any error and we have the _completeOnError flag set or if we are not + // stepping down. + auto cleanup = [&]() { + return completionStatus.isOK() || _completeOnError || !isSteppingDown; + }; + if (cleanup()) { try { + LOGV2(5565601, + "Releasing sharding DDL coordinator", + "coordinatorId"_attr = _coordId); + // We need to execute this in another executor to ensure the remove work is // done. const auto docWasRemoved = _removeDocumentUntillSuccessOrStepdown( @@ -291,6 +311,8 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas Status::OK()); } + auto session = metadata().getSession(); + if (status.isOK() && session) { // Return lsid to the SessionCache. If status is not OK, let the lsid be // discarded. @@ -299,6 +321,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas } } catch (const DBException& ex) { completionStatus = ex.toStatus(); + // Ensure the only possible error is that we're stepping down. isSteppingDown = completionStatus.isA<ErrorCategory::NotPrimaryError>() || completionStatus.isA<ErrorCategory::ShutdownError>() || completionStatus.isA<ErrorCategory::CancellationError>(); @@ -306,7 +329,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas } } - if (isSteppingDown) { + if (!cleanup()) { LOGV2(5950000, "Not releasing distributed locks because the node is stepping down or " "shutting down", @@ -315,7 +338,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas } while (!_scopedLocks.empty()) { - if (!isSteppingDown) { + if (cleanup()) { // (SERVER-59500) Only release the remote locks in case of no stepdown/shutdown const auto& resource = _scopedLocks.top().getNs(); DistLockManager::get(opCtx)->unlock(opCtx, resource); |