summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarcos José Grillo Ramirez <marcos.grillo@mongodb.com>2022-03-02 15:01:37 +0100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-03-08 15:50:55 +0000
commitf7dfccadbb6f19018765b96ca746dcdeb74cf1ec (patch)
treeb3a3a1f23b4119f5849015f46f9a5d8649477dbe
parent0586ec6fb0d046c48ef8fad2a2846007120bbcfa (diff)
downloadmongo-f7dfccadbb6f19018765b96ca746dcdeb74cf1ec.tar.gz
SERVER-62521 Ensure distributed locks are being released even if a remote stepdown error occurs
(cherry picked from commit c9a74181ade07e824a0b6bec6913d27c56e8bf21)
-rw-r--r--src/mongo/db/s/sharding_ddl_coordinator.cpp47
1 files changed, 35 insertions, 12 deletions
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp
index 95bee8bcfc0..88d8ec01246 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.cpp
+++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp
@@ -213,7 +213,17 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
hangBeforeRunningCoordinatorInstance.pauseWhileSet();
})
- .onError([this, anchor = shared_from_this()](const Status& status) {
+ .onError([this, token, anchor = shared_from_this()](const Status& status) {
+ // The construction of a DDL coordinator recovered from disk can only fail due to
+ // stepdown/shutdown.
+ dassert(!_recoveredFromDisk ||
+ (token.isCanceled() &&
+ (status.isA<ErrorCategory::CancellationError>() ||
+ status.isA<ErrorCategory::NotPrimaryError>())));
+
+ // Ensure coordinator cleanup if the document has not been saved.
+ _completeOnError = !_recoveredFromDisk;
+
static constexpr auto& errorMsg =
"Failed to complete construction of sharding DDL coordinator";
LOGV2_ERROR(
@@ -264,20 +274,30 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
auto opCtxHolder = cc().makeOperationContext();
auto* opCtx = opCtxHolder.get();
- auto completionStatus = status;
+ // If we are stepping down the token MUST be cancelled. Each implementation of the
+ // coordinator must retry remote stepdown errors, unless we allow finalizing the
+ // coordinator in the presence of errors.
+ dassert(!(status.isA<ErrorCategory::NotPrimaryError>() ||
+ status.isA<ErrorCategory::ShutdownError>()) ||
+ token.isCanceled() || _completeOnError);
- bool isSteppingDown = status.isA<ErrorCategory::NotPrimaryError>() ||
- status.isA<ErrorCategory::ShutdownError>();
+ auto completionStatus = status;
- // Release the coordinator only in case the node is not stepping down or in case of
- // acceptable error
- if (!isSteppingDown || (!status.isOK() && _completeOnError)) {
- LOGV2(
- 5565601, "Releasing sharding DDL coordinator", "coordinatorId"_attr = _coordId);
+ bool isSteppingDown = token.isCanceled();
- auto session = metadata().getSession();
+ // Remove the ddl coordinator and release locks if the execution was successful or if
+ // there was any error and we have the _completeOnError flag set or if we are not
+ // stepping down.
+ auto cleanup = [&]() {
+ return completionStatus.isOK() || _completeOnError || !isSteppingDown;
+ };
+ if (cleanup()) {
try {
+ LOGV2(5565601,
+ "Releasing sharding DDL coordinator",
+ "coordinatorId"_attr = _coordId);
+
// We need to execute this in another executor to ensure the remove work is
// done.
const auto docWasRemoved = _removeDocumentUntillSuccessOrStepdown(
@@ -291,6 +311,8 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
Status::OK());
}
+ auto session = metadata().getSession();
+
if (status.isOK() && session) {
// Return lsid to the SessionCache. If status is not OK, let the lsid be
// discarded.
@@ -299,6 +321,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
}
} catch (const DBException& ex) {
completionStatus = ex.toStatus();
+ // Ensure the only possible error is that we're stepping down.
isSteppingDown = completionStatus.isA<ErrorCategory::NotPrimaryError>() ||
completionStatus.isA<ErrorCategory::ShutdownError>() ||
completionStatus.isA<ErrorCategory::CancellationError>();
@@ -306,7 +329,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
}
}
- if (isSteppingDown) {
+ if (!cleanup()) {
LOGV2(5950000,
"Not releasing distributed locks because the node is stepping down or "
"shutting down",
@@ -315,7 +338,7 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
}
while (!_scopedLocks.empty()) {
- if (!isSteppingDown) {
+ if (cleanup()) {
// (SERVER-59500) Only release the remote locks in case of no stepdown/shutdown
const auto& resource = _scopedLocks.top().getNs();
DistLockManager::get(opCtx)->unlock(opCtx, resource);