summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> 2021-10-25 07:13:47 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-28 16:18:09 +0000
commit: fe4cbeb6d0fa079e80b1a300cd4ec8a56cffdd77 (patch)
tree: 6ca24f4a6a72f7ff44ad7ce6812c9fef0fda63e6
parent: 0036d54b93c6703864c92181c9694605726f108b (diff)
download: mongo-fe4cbeb6d0fa079e80b1a300cd4ec8a56cffdd77.tar.gz
SERVER-59965 Limit max time wait behind critical section during filtering metadata refresh in txn
(cherry picked from commit 02add56a2100bef135281938a0cadaf374279f03)
-rw-r--r-- src/mongo/db/s/shard_filtering_metadata_refresh.cpp | 26
-rw-r--r-- src/mongo/db/s/sharding_runtime_d_params.idl | 11
2 files changed, 33 insertions, 4 deletions
diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
index 968e89bb74d..665d5a8ed06 100644
--- a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
+++ b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
@@ -89,7 +89,8 @@ bool joinShardVersionOperation(OperationContext* opCtx,
CollectionShardingRuntime* csr,
boost::optional<Lock::DBLock>* dbLock,
boost::optional<Lock::CollectionLock>* collLock,
- boost::optional<CollectionShardingRuntime::CSRLock>* csrLock) {
+ boost::optional<CollectionShardingRuntime::CSRLock>* csrLock,
+ Milliseconds criticalSectionMaxWait) {
invariant(collLock->has_value());
invariant(csrLock->has_value());
@@ -106,7 +107,11 @@ bool joinShardVersionOperation(OperationContext* opCtx,
dbLock->reset();
if (critSecSignal) {
- critSecSignal->get(opCtx);
+ const auto deadline = criticalSectionMaxWait == Milliseconds::max()
+ ? Date_t::max()
+ : opCtx->getServiceContext()->getFastClockSource()->now() + criticalSectionMaxWait;
+ opCtx->runWithDeadline(
+ deadline, ErrorCodes::ExceededTimeLimit, [&] { critSecSignal->get(opCtx); });
} else {
inRecoverOrRefresh->get(opCtx);
}
@@ -212,6 +217,17 @@ void onShardVersionMismatch(OperationContext* opCtx,
"namespace"_attr = nss,
"shardVersionReceived"_attr = shardVersionReceived);
+ // If we are in a transaction, limit the time we can wait behind the critical section. This is
+ // needed in order to prevent distributed deadlocks in situations where a DDL operation needs to
+ // acquire the critical section on several shards. In that case, a shard running a transaction
+ // could be waiting for the critical section to be exited, while on another shard the
+ // transaction has already executed some statement and stashed locks which prevent the critical
+ // section from being acquired in that node. Limiting the wait behind the critical section will
+ // ensure that the transaction will eventually get aborted.
+ const auto criticalSectionMaxWait = opCtx->inMultiDocumentTransaction()
+ ? Milliseconds(metadataRefreshInTransactionMaxWaitBehindCritSecMS.load())
+ : Milliseconds::max();
+
boost::optional<SharedSemiFuture<void>> inRecoverOrRefresh;
while (true) {
boost::optional<Lock::DBLock> dbLock;
@@ -223,7 +239,8 @@ void onShardVersionMismatch(OperationContext* opCtx,
boost::optional<CollectionShardingRuntime::CSRLock> csrLock =
CollectionShardingRuntime::CSRLock::lockShared(opCtx, csr);
- if (joinShardVersionOperation(opCtx, csr, &dbLock, &collLock, &csrLock)) {
+ if (joinShardVersionOperation(
+ opCtx, csr, &dbLock, &collLock, &csrLock, criticalSectionMaxWait)) {
continue;
}
@@ -245,7 +262,8 @@ void onShardVersionMismatch(OperationContext* opCtx,
// If there is no ongoing shard version operation, initialize the RecoverRefreshThread
// thread and associate it to the CSR.
- if (!joinShardVersionOperation(opCtx, csr, &dbLock, &collLock, &csrLock)) {
+ if (!joinShardVersionOperation(
+ opCtx, csr, &dbLock, &collLock, &csrLock, criticalSectionMaxWait)) {
// If the shard doesn't yet know its filtering metadata, recovery needs to be run
const bool runRecover = metadata ? false : true;
csr->setShardVersionRecoverRefreshFuture(
diff --git a/src/mongo/db/s/sharding_runtime_d_params.idl b/src/mongo/db/s/sharding_runtime_d_params.idl
index af7d732265c..cd5885bbe3f 100644
--- a/src/mongo/db/s/sharding_runtime_d_params.idl
+++ b/src/mongo/db/s/sharding_runtime_d_params.idl
@@ -141,3 +141,14 @@ server_parameters:
cpp_vartype: int
cpp_varname: shardedIndexConsistencyCheckIntervalMS
default: 600000
+
+ metadataRefreshInTransactionMaxWaitBehindCritSecMS:
+ description: >-
+ Maximum time in milliseconds to wait behind the critical section when refreshing the
+ filtering metadata within a transaction.
+ set_at: [startup, runtime]
+ cpp_vartype: AtomicWord<int>
+ cpp_varname: metadataRefreshInTransactionMaxWaitBehindCritSecMS
+ validator:
+ gte: 0
+ default: 500