diff options
author | Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> | 2021-10-25 07:13:47 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-10-28 16:18:09 +0000 |
commit | fe4cbeb6d0fa079e80b1a300cd4ec8a56cffdd77 (patch) | |
tree | 6ca24f4a6a72f7ff44ad7ce6812c9fef0fda63e6 | |
parent | 0036d54b93c6703864c92181c9694605726f108b (diff) | |
download | mongo-fe4cbeb6d0fa079e80b1a300cd4ec8a56cffdd77.tar.gz |
SERVER-59965 Limit max time wait behind critical section during filtering metadata refresh in txn
(cherry picked from commit 02add56a2100bef135281938a0cadaf374279f03)
-rw-r--r-- | src/mongo/db/s/shard_filtering_metadata_refresh.cpp | 26 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_runtime_d_params.idl | 11 |
2 files changed, 33 insertions, 4 deletions
diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp index 968e89bb74d..665d5a8ed06 100644 --- a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp +++ b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp @@ -89,7 +89,8 @@ bool joinShardVersionOperation(OperationContext* opCtx, CollectionShardingRuntime* csr, boost::optional<Lock::DBLock>* dbLock, boost::optional<Lock::CollectionLock>* collLock, - boost::optional<CollectionShardingRuntime::CSRLock>* csrLock) { + boost::optional<CollectionShardingRuntime::CSRLock>* csrLock, + Milliseconds criticalSectionMaxWait) { invariant(collLock->has_value()); invariant(csrLock->has_value()); @@ -106,7 +107,11 @@ bool joinShardVersionOperation(OperationContext* opCtx, dbLock->reset(); if (critSecSignal) { - critSecSignal->get(opCtx); + const auto deadline = criticalSectionMaxWait == Milliseconds::max() + ? Date_t::max() + : opCtx->getServiceContext()->getFastClockSource()->now() + criticalSectionMaxWait; + opCtx->runWithDeadline( + deadline, ErrorCodes::ExceededTimeLimit, [&] { critSecSignal->get(opCtx); }); } else { inRecoverOrRefresh->get(opCtx); } @@ -212,6 +217,17 @@ void onShardVersionMismatch(OperationContext* opCtx, "namespace"_attr = nss, "shardVersionReceived"_attr = shardVersionReceived); + // If we are in a transaction, limit the time we can wait behind the critical section. This is + // needed in order to prevent distributed deadlocks in situations where a DDL operation needs to + // acquire the critical section on several shards. In that case, a shard running a transaction + // could be waiting for the critical section to be exited, while on another shard the + // transaction has already executed some statement and stashed locks which prevent the critical + // section from being acquired in that node. Limiting the wait behind the critical section will + // ensure that the transaction will eventually get aborted. + const auto criticalSectionMaxWait = opCtx->inMultiDocumentTransaction() + ? Milliseconds(metadataRefreshInTransactionMaxWaitBehindCritSecMS.load()) + : Milliseconds::max(); + boost::optional<SharedSemiFuture<void>> inRecoverOrRefresh; while (true) { boost::optional<Lock::DBLock> dbLock; @@ -223,7 +239,8 @@ void onShardVersionMismatch(OperationContext* opCtx, boost::optional<CollectionShardingRuntime::CSRLock> csrLock = CollectionShardingRuntime::CSRLock::lockShared(opCtx, csr); - if (joinShardVersionOperation(opCtx, csr, &dbLock, &collLock, &csrLock)) { + if (joinShardVersionOperation( + opCtx, csr, &dbLock, &collLock, &csrLock, criticalSectionMaxWait)) { continue; } @@ -245,7 +262,8 @@ void onShardVersionMismatch(OperationContext* opCtx, // If there is no ongoing shard version operation, initialize the RecoverRefreshThread // thread and associate it to the CSR. - if (!joinShardVersionOperation(opCtx, csr, &dbLock, &collLock, &csrLock)) { + if (!joinShardVersionOperation( + opCtx, csr, &dbLock, &collLock, &csrLock, criticalSectionMaxWait)) { // If the shard doesn't yet know its filtering metadata, recovery needs to be run const bool runRecover = metadata ? false : true; csr->setShardVersionRecoverRefreshFuture( diff --git a/src/mongo/db/s/sharding_runtime_d_params.idl b/src/mongo/db/s/sharding_runtime_d_params.idl index af7d732265c..cd5885bbe3f 100644 --- a/src/mongo/db/s/sharding_runtime_d_params.idl +++ b/src/mongo/db/s/sharding_runtime_d_params.idl @@ -141,3 +141,14 @@ server_parameters: cpp_vartype: int cpp_varname: shardedIndexConsistencyCheckIntervalMS default: 600000 + + metadataRefreshInTransactionMaxWaitBehindCritSecMS: + description: >- + Maximum time in milliseconds to wait behind the critical section when refreshing the + filtering metadata within a transaction. + set_at: [startup, runtime] + cpp_vartype: AtomicWord<int> + cpp_varname: metadataRefreshInTransactionMaxWaitBehindCritSecMS + validator: + gte: 0 + default: 500 |