author    Tommaso Tocci <tommaso.tocci@mongodb.com>  2021-04-30 20:10:58 +0200
committer Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-05-03 13:11:20 +0000
commit    85dad3bf26622966b7c144751e14d0d6983bc75d (patch)
tree      3f1f741fa941f037a4a803e878c01b9915ed0290
parent    6e3c93bd5b41a69edfaf55da395fd6672423ad7a (diff)
download  mongo-85dad3bf26622966b7c144751e14d0d6983bc75d.tar.gz
SERVER-56560 Avoid thread scheduling deadlock in create collection coordinator
-rw-r--r--  src/mongo/db/s/create_collection_coordinator.cpp  21
-rw-r--r--  src/mongo/db/s/create_collection_coordinator.h     7
-rw-r--r--  src/mongo/db/s/sharding_ddl_coordinator.cpp       31
3 files changed, 32 insertions(+), 27 deletions(-)
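
Before this patch, CreateCollectionCoordinator::_runImpl blocked on _commitWithRetries(executor, token).get(opCtx): an executor thread waited on retry work that had to be scheduled on the same ScopedTaskExecutor, which can deadlock once every executor thread is parked in such a wait. The toy program below (plain C++ with an illustrative single-thread executor, not MongoDB's executor API) shows the hazard and the shape of the fix:

// Toy single-thread executor (illustrative only, not MongoDB's API) showing
// why blocking an executor thread on work queued to the same executor hangs.
#include <condition_variable>
#include <functional>
#include <future>  // for the std::promise in the commented-out hazard
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

class ToyExecutor {
public:
    ToyExecutor() : _worker([this] { _run(); }) {}

    ~ToyExecutor() {
        {
            std::lock_guard<std::mutex> lk(_mutex);
            _done = true;
        }
        _cv.notify_all();
        _worker.join();
    }

    void schedule(std::function<void()> task) {
        {
            std::lock_guard<std::mutex> lk(_mutex);
            _tasks.push(std::move(task));
        }
        _cv.notify_one();
    }

private:
    void _run() {
        for (;;) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lk(_mutex);
                _cv.wait(lk, [this] { return _done || !_tasks.empty(); });
                if (_tasks.empty())
                    return;  // only reached once _done is set
                task = std::move(_tasks.front());
                _tasks.pop();
            }
            task();
        }
    }

    std::mutex _mutex;
    std::condition_variable _cv;
    std::queue<std::function<void()>> _tasks;
    bool _done = false;
    std::thread _worker;
};

int main() {
    ToyExecutor executor;
    executor.schedule([&] {
        // BAD (pre-patch shape): park this executor thread on work that must
        // run on the same executor. With one thread, get() never returns,
        // because the inner task is stuck in the queue behind this one:
        //
        //     std::promise<void> done;
        //     executor.schedule([&done] { done.set_value(); });
        //     done.get_future().get();  // deadlock
        //
        // GOOD (post-patch shape): do the work inline and let the caller
        // drive retries, instead of blocking inside an executor task.
        std::cout << "commit ran inline, no blocking wait\n";
    });
}

The commit applies the second shape: _commit now runs inline on the coordinator thread, and the retry loop is hoisted into the base ShardingDDLCoordinator (last file in the diff below).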
diff --git a/src/mongo/db/s/create_collection_coordinator.cpp b/src/mongo/db/s/create_collection_coordinator.cpp
index 351ac3df87c..27e26e51012 100644
--- a/src/mongo/db/s/create_collection_coordinator.cpp
+++ b/src/mongo/db/s/create_collection_coordinator.cpp
@@ -49,13 +49,10 @@
#include "mongo/s/cluster_write.h"
#include "mongo/s/grid.h"
#include "mongo/s/request_types/shard_collection_gen.h"
-#include "mongo/util/future_util.h"
namespace mongo {
namespace {
-const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
-
struct OptionsAndIndexes {
BSONObj options;
std::vector<BSONObj> indexSpecs;
@@ -471,14 +468,14 @@ ExecutorFuture<void> CreateCollectionCoordinator::_runImpl(
opCtx, nss(), _critSecReason, ShardingCatalogClient::kMajorityWriteConcern);
_createCollectionOnNonPrimaryShards(opCtx);
- _commitWithRetries(executor, token).get(opCtx);
+ _commit(opCtx);
}
sharding_ddl_util::releaseRecoverableCriticalSection(
opCtx, nss(), _critSecReason, ShardingCatalogClient::kMajorityWriteConcern);
if (!_splitPolicy->isOptimized()) {
- _commitWithRetries(executor, token).get(opCtx);
+ _commit(opCtx);
}
_finalize(opCtx);
@@ -741,20 +738,6 @@ void CreateCollectionCoordinator::_commit(OperationContext* opCtx) {
updateCatalogEntry(opCtx, nss(), coll);
}
-ExecutorFuture<void> CreateCollectionCoordinator::_commitWithRetries(
- std::shared_ptr<executor::ScopedTaskExecutor> executor, const CancellationToken& token) {
- return AsyncTry([this] {
- auto opCtxHolder = cc().makeOperationContext();
- auto* opCtx = opCtxHolder.get();
- getForwardableOpMetadata().setOn(opCtx);
-
- _commit(opCtx);
- })
- .until([token](Status status) { return status.isOK() || token.isCanceled(); })
- .withBackoffBetweenIterations(kExponentialBackoff)
- .on(**executor, CancellationToken::uncancelable());
-}
-
void CreateCollectionCoordinator::_finalize(OperationContext* opCtx) noexcept {
LOGV2_DEBUG(5277907, 2, "Create collection _finalize", "namespace"_attr = nss());
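
With the retry helper removed, both commit paths simply call _commit(opCtx) on the thread already running the coordinator. The toy restatement below (illustrative names only; the real code takes an OperationContext and issues sharded catalog writes) condenses the post-patch control flow of _runImpl from the hunks above:

// Toy restatement of the post-patch control flow (not compilable against
// MongoDB; the names mirror the identifiers in the diff).
#include <iostream>

struct ToyCreateCollectionCoordinator {
    bool optimizedSplitPolicy;

    void createOnNonPrimaryShards() { std::cout << "create on other shards\n"; }
    void commit() { std::cout << "commit (inline; retries live in the base class)\n"; }
    void releaseCriticalSection() { std::cout << "release critical section\n"; }
    void finalize() { std::cout << "finalize\n"; }

    void runImpl() {
        if (optimizedSplitPolicy) {
            // Critical section still held: create on the other shards, then
            // commit synchronously where _commitWithRetries(...).get(opCtx)
            // used to block an executor thread.
            createOnNonPrimaryShards();
            commit();
        }
        releaseCriticalSection();
        if (!optimizedSplitPolicy) {
            commit();
        }
        finalize();
    }
};

int main() {
    ToyCreateCollectionCoordinator{true}.runImpl();
}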
diff --git a/src/mongo/db/s/create_collection_coordinator.h b/src/mongo/db/s/create_collection_coordinator.h
index 9171e2a6b80..dc8c81b1b86 100644
--- a/src/mongo/db/s/create_collection_coordinator.h
+++ b/src/mongo/db/s/create_collection_coordinator.h
@@ -125,13 +125,6 @@ private:
*/
void _finalize(OperationContext* opCtx) noexcept;
- /**
- * Executes _commit with an exponential backoff and retries if the commit failed due to a
- * stepdown error.
- */
- ExecutorFuture<void> _commitWithRetries(std::shared_ptr<executor::ScopedTaskExecutor> executor,
- const CancellationToken& token);
-
CreateCollectionCoordinatorDocument _doc;
const BSONObj _critSecReason;
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp
index 5e05dae0a09..a8179143790 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.cpp
+++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp
@@ -42,8 +42,14 @@
#include "mongo/logv2/log.h"
#include "mongo/s/grid.h"
#include "mongo/s/write_ops/batched_command_response.h"
+#include "mongo/util/future_util.h"
namespace mongo {
+namespace {
+
+const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
+
+}  // namespace
ShardingDDLCoordinatorMetadata extractShardingDDLCoordinatorMetadata(const BSONObj& coorDoc) {
return ShardingDDLCoordinatorMetadata::parse(
@@ -170,7 +176,30 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
return status;
})
.then([this, executor, token, anchor = shared_from_this()] {
- return _runImpl(executor, token);
+ return AsyncTry([this, executor, token] { return _runImpl(executor, token); })
+ .until([this, token](Status status) {
+ // Retry until either:
+ // - The coordinator succeeds
+ // - The coordinator fails with a non-retryable error
+ // - The node is stepping/shutting down
+ //
+ // If the token is not canceled, we retry stepdown errors because they
+ // could have been generated by a remote node.
+ if (!status.isOK() &&
+ (status.isA<ErrorCategory::NotPrimaryError>() ||
+ status.isA<ErrorCategory::ShutdownError>()) &&
+ !token.isCanceled()) {
+ LOGV2_DEBUG(5656000,
+ 1,
+ "Re-executing sharding DDL coordinator",
+ "coordinatorId"_attr = _coorMetadata.getId(),
+ "reason"_attr = redact(status));
+ return false;
+ }
+ return true;
+ })
+ .withBackoffBetweenIterations(kExponentialBackoff)
+ .on(**executor, CancellationToken::uncancelable());
})
.onCompletion([this, anchor = shared_from_this()](const Status& status) {
auto opCtxHolder = cc().makeOperationContext();
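
The retry loop hoisted into ShardingDDLCoordinator::run wraps _runImpl in an AsyncTry/until chain with exponential backoff. The standalone sketch below mimics its semantics in plain C++, assuming a body callable in place of _runImpl and a tokenCanceled callable in place of the CancellationToken; isRetryable stands in for the NotPrimaryError/ShutdownError category checks, and the sleep schedule is illustrative rather than the real Backoff:

// Standalone sketch of the retry-until loop added above (plain C++, not the
// real AsyncTry/ExecutorFuture machinery; names are illustrative).
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

enum class Status { OK, NotPrimary, Shutdown, BadValue };

// Stands in for the NotPrimaryError/ShutdownError category checks: these are
// retried because a remote node, not this one, may have produced them.
bool isRetryable(Status s) {
    return s == Status::NotPrimary || s == Status::Shutdown;
}

Status runWithBackoff(const std::function<Status()>& body,
                      const std::function<bool()>& tokenCanceled) {
    auto delay = std::chrono::milliseconds(10);  // illustrative starting delay
    for (;;) {
        Status s = body();
        // Stop on success, on a non-retryable error, or once the local node is
        // stepping down / shutting down (the cancellation token fires).
        if (s == Status::OK || !isRetryable(s) || tokenCanceled())
            return s;
        std::cerr << "Re-executing sharding DDL coordinator body\n";
        std::this_thread::sleep_for(delay);
        delay = std::min(delay * 2, std::chrono::milliseconds(1000));  // grow, cap
    }
}

int main() {
    int attempts = 0;
    // First attempt fails with a retryable "not primary" error, second succeeds.
    Status result = runWithBackoff(
        [&] { return ++attempts < 2 ? Status::NotPrimary : Status::OK; },
        [] { return false; });  // the token never fires in this demo
    std::cout << "finished after " << attempts << " attempts, ok="
              << (result == Status::OK) << "\n";
}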