summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Brett Nawrocki <brett.nawrocki@mongodb.com> 2021-12-16 19:57:56 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-01-07 23:01:50 +0000
commit: 7318c3b4f85288322d5dcb7b670a692b94c7ccd4 (patch)
tree: bdfc0602730d7486587927c7f9a46b89969d17b1
parent: b6fc3fafff8cd4d9173d30e86600f3ff9d1efa68 (diff)
download: mongo-7318c3b4f85288322d5dcb7b670a692b94c7ccd4.tar.gz
SERVER-61976 Retry failed shard version refreshes on step up
On step-up, shards clear the filtering metadata and schedule a shard version refresh for both the source collection and the temporary resharding collection. The shard version refresh triggered through onShardVersionMismatch() can fail with an error and therefore not complete. This can leave a recipient shard waiting to learn that all donor shards are prepared to donate, or leave a donor shard waiting to learn that all recipient shards have finished cloning. Therefore, shards will now retry on errors, with a backoff between attempts, until the refresh completes successfully. (cherry picked from commit 70417bcbe6ca27b9e20455de5e77313ef68c648a) (cherry picked from commit 00591f7a441e452d70af288a4376272a52fcd638)
-rw-r--r-- src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp 35
1 files changed, 21 insertions, 14 deletions
diff --git a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
index 853f71024b6..5539d94634e 100644
--- a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
+++ b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
@@ -53,6 +53,8 @@ using RecipientStateMachine = ReshardingRecipientService::RecipientStateMachine;
namespace {
using namespace fmt::literals;
+const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
+
/*
* Creates a ReshardingStateMachine if this node is primary and the ReshardingStateMachine doesn't
* already exist.
@@ -328,22 +330,27 @@ void clearFilteringMetadata(OperationContext* opCtx, bool scheduleAsyncRefresh)
continue;
}
- ExecutorFuture<void>(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor())
- .then([svcCtx = opCtx->getServiceContext(), nss] {
- ThreadClient tc("TriggerReshardingRecovery", svcCtx);
- {
- stdx::lock_guard<Client> lk(*tc.get());
- tc->setSystemOperationKillableByStepdown(lk);
+ AsyncTry([svcCtx = opCtx->getServiceContext(), nss] {
+ ThreadClient tc("TriggerReshardingRecovery", svcCtx);
+ {
+ stdx::lock_guard<Client> lk(*tc.get());
+ tc->setSystemOperationKillableByStepdown(lk);
+ }
+
+ auto opCtx = tc->makeOperationContext();
+ onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
+ })
+ .until([](const Status& status) {
+ if (!status.isOK()) {
+ LOGV2_WARNING(5498101,
+ "Error on deferred shardVersion recovery execution",
+ "error"_attr = redact(status));
}
-
- auto opCtx = tc->makeOperationContext();
- onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
- })
- .onError([](const Status& status) {
- LOGV2_WARNING(5498101,
- "Error on deferred shardVersion recovery execution",
- "error"_attr = redact(status));
+ return status.isOK();
})
+ .withBackoffBetweenIterations(kExponentialBackoff)
+ .on(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
+ CancellationToken::uncancelable())
.getAsync([](auto) {});
}
}