author | Marcos José Grillo Ramirez <marcos.grillo@mongodb.com> | 2022-03-02 15:01:37 +0100
---|---|---
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-03-08 15:44:20 +0000
commit | ebb15b33a2524d5999de8b71f252ee34714fbc46 (patch) | |
tree | 30d418041a1bd4b590bd2e1447414d28876db7ef /src/mongo | |
parent | 8af92ee09c50dc6f3009f78dc322dc16e4ef53f1 (diff) | |
download | mongo-ebb15b33a2524d5999de8b71f252ee34714fbc46.tar.gz | |
SERVER-62521 Ensure distributed locks are being released even if a remote stepdown error occurs
(cherry picked from commit c9a74181ade07e824a0b6bec6913d27c56e8bf21)
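The substance of the change: whether the node is stepping down is now derived from the local cancellation token (`token.isCanceled()`) rather than from the error category of the completion status, so a NotPrimary/Shutdown error returned by a *remote* node no longer suppresses the release of the coordinator's distributed locks. Below is a minimal, self-contained sketch of the resulting cleanup decision; `Outcome`, `Context`, and `shouldCleanup` are hypothetical stand-ins, not MongoDB's actual `Status`/`CancellationToken` types.

```cpp
#include <cassert>
#include <iostream>

// Hypothetical stand-ins for mongo::Status and the cancellation token.
enum class Outcome { kOK, kNotPrimaryError, kShutdownError, kOtherError };

struct Context {
    Outcome completionStatus;  // result of the coordinator's work
    bool completeOnError;      // coordinator may finalize despite errors
    bool tokenCanceled;        // set on a *local* stepdown/shutdown
};

// Mirrors the patch's cleanup() predicate: remove the coordinator document
// and release its distributed locks when the run succeeded, when errors are
// acceptable, or when the node is not itself stepping down.
bool shouldCleanup(const Context& ctx) {
    const bool isSteppingDown = ctx.tokenCanceled;
    return ctx.completionStatus == Outcome::kOK || ctx.completeOnError || !isSteppingDown;
}

int main() {
    // A remote stepdown error with a live local token: before the patch this
    // skipped lock release; with the token-based check the locks are freed.
    assert(shouldCleanup({Outcome::kNotPrimaryError, false, false}));

    // A genuine local stepdown: locks are deliberately kept so the
    // coordinator, recoverable from disk, does not lose them.
    assert(!shouldCleanup({Outcome::kOtherError, false, true}));

    std::cout << "decision table matches the patched cleanup() predicate\n";
}
```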
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/s/sharding_ddl_coordinator.cpp | 47 |
1 file changed, 35 insertions, 12 deletions
```diff
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp
index e6da6be570a..ffc0e4c7741 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.cpp
+++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp
@@ -226,7 +226,17 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
             hangBeforeRunningCoordinatorInstance.pauseWhileSet();
         })
-        .onError([this, anchor = shared_from_this()](const Status& status) {
+        .onError([this, token, anchor = shared_from_this()](const Status& status) {
+            // The construction of a DDL coordinator recovered from disk can only fail due to
+            // stepdown/shutdown.
+            dassert(!_recoveredFromDisk ||
+                    (token.isCanceled() &&
+                     (status.isA<ErrorCategory::CancellationError>() ||
+                      status.isA<ErrorCategory::NotPrimaryError>())));
+
+            // Ensure coordinator cleanup if the document has not been saved.
+            _completeOnError = !_recoveredFromDisk;
+
             static constexpr auto& errorMsg =
                 "Failed to complete construction of sharding DDL coordinator";
             LOGV2_ERROR(
@@ -277,20 +287,30 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
             auto opCtxHolder = cc().makeOperationContext();
             auto* opCtx = opCtxHolder.get();
 
-            auto completionStatus = status;
+            // If we are stepping down the token MUST be cancelled. Each implementation of the
+            // coordinator must retry remote stepping down errors, unless, we allow finalizing the
+            // coordinator in the presence of errors.
+            dassert(!(status.isA<ErrorCategory::NotPrimaryError>() ||
+                      status.isA<ErrorCategory::ShutdownError>()) ||
+                    token.isCanceled() || _completeOnError);
 
-            bool isSteppingDown = status.isA<ErrorCategory::NotPrimaryError>() ||
-                status.isA<ErrorCategory::ShutdownError>();
+            auto completionStatus = status;
 
-            // Release the coordinator only in case the node is not stepping down or in case of
-            // acceptable error
-            if (!isSteppingDown || (!status.isOK() && _completeOnError)) {
-                LOGV2(
-                    5565601, "Releasing sharding DDL coordinator", "coordinatorId"_attr = _coordId);
+            bool isSteppingDown = token.isCanceled();
 
-                auto session = metadata().getSession();
+            // Remove the ddl coordinator and release locks if the execution was successfull or if
+            // there was any error and we have the _completeOnError flag set or if we are not
+            // stepping down.
+            auto cleanup = [&]() {
+                return completionStatus.isOK() || _completeOnError || !isSteppingDown;
+            };
+            if (cleanup()) {
 
                 try {
+                    LOGV2(5565601,
+                          "Releasing sharding DDL coordinator",
+                          "coordinatorId"_attr = _coordId);
+
                     // We need to execute this in another executor to ensure the remove work is
                     // done.
                     const auto docWasRemoved = _removeDocumentUntillSuccessOrStepdown(
@@ -304,6 +324,8 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
                         Status::OK());
                 }
 
+                auto session = metadata().getSession();
+
                 if (status.isOK() && session) {
                     // Return lsid to the SessionCache. If status is not OK, let the lsid be
                     // discarded.
@@ -312,6 +334,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
                     }
                 } catch (const DBException& ex) {
                     completionStatus = ex.toStatus();
+                    // Ensure the only possible error is that we're stepping down.
                     isSteppingDown = completionStatus.isA<ErrorCategory::NotPrimaryError>() ||
                         completionStatus.isA<ErrorCategory::ShutdownError>() ||
                         completionStatus.isA<ErrorCategory::CancellationError>();
@@ -319,7 +342,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
                 }
             }
 
-            if (isSteppingDown) {
+            if (!cleanup()) {
                 LOGV2(5950000,
                       "Not releasing distributed locks because the node is stepping down or "
                       "shutting down",
@@ -328,7 +351,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
             }
 
             while (!_scopedLocks.empty()) {
-                if (!isSteppingDown) {
+                if (cleanup()) {
                     // (SERVER-59500) Only release the remote locks in case of no stepdown/shutdown
                     const auto& resource = _scopedLocks.top().getNs();
                     DistLockManager::get(opCtx)->unlock(opCtx, resource);
```
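The patch also adds two debug-build invariants: a coordinator recovered from disk may only fail construction on stepdown/shutdown, and a NotPrimary/Shutdown completion status must coincide with a canceled token unless `_completeOnError` is set, since each coordinator implementation is expected to retry remote stepdown errors. A hedged sketch of the second invariant as a plain predicate (`stepdownStatusIsConsistent` is a hypothetical name, not the actual `dassert` expression):

```cpp
#include <cassert>

// Hypothetical predicate mirroring the patch's second dassert: a
// NotPrimary/Shutdown completion status is only legal when the local token
// was canceled or the coordinator is allowed to finalize on error.
bool stepdownStatusIsConsistent(bool statusIsStepdownError,
                                bool tokenCanceled,
                                bool completeOnError) {
    return !statusIsStepdownError || tokenCanceled || completeOnError;
}

int main() {
    // A remote stepdown error with a live local token and no _completeOnError
    // would trip the dassert; implementations must retry such errors instead.
    assert(!stepdownStatusIsConsistent(true, false, false));
    assert(stepdownStatusIsConsistent(true, true, false));    // local stepdown
    assert(stepdownStatusIsConsistent(false, false, false));  // normal completion
}
```

Expressing the release condition as the `cleanup()` lambda, evaluated again after the document-removal attempt, keeps the document removal and the `DistLockManager` unlock decisions consistent even when `completionStatus` is overwritten inside the `catch` block.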