summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Brett Nawrocki <brett.nawrocki@mongodb.com> 2021-12-16 19:57:56 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-20 19:16:51 +0000
commit: 70417bcbe6ca27b9e20455de5e77313ef68c648a (patch)
tree: 3efa32fc527a8a0837e143bdb9c064178b74649e
parent: 1f29a83d4cfd61e9d724633532d55b67bbc60148 (diff)
download: mongo-70417bcbe6ca27b9e20455de5e77313ef68c648a.tar.gz
SERVER-61976 Retry failed shard version refreshes on step up
On step-up, shards clear the filtering metadata and schedule a shard version refresh for the source collection and the temporary resharding collection. The shard version refresh triggered through onShardVersionMismatch() can fail with an error and therefore never complete. This can leave a recipient shard waiting to learn that all donor shards are prepared to donate, or leave a donor shard waiting to learn that all recipient shards have finished cloning. Therefore, shards now retry on errors until the refresh completes successfully.
-rw-r--r-- src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp 35
1 files changed, 21 insertions, 14 deletions
diff --git a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
index 6e8fa8e6649..530561b121b 100644
--- a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
+++ b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
@@ -53,6 +53,8 @@ using RecipientStateMachine = ReshardingRecipientService::RecipientStateMachine;
namespace {
using namespace fmt::literals;
+const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
+
/*
* Creates a ReshardingStateMachine if this node is primary and the ReshardingStateMachine doesn't
* already exist.
@@ -328,22 +330,27 @@ void clearFilteringMetadata(OperationContext* opCtx, bool scheduleAsyncRefresh)
continue;
}
- ExecutorFuture<void>(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor())
- .then([svcCtx = opCtx->getServiceContext(), nss] {
- ThreadClient tc("TriggerReshardingRecovery", svcCtx);
- {
- stdx::lock_guard<Client> lk(*tc.get());
- tc->setSystemOperationKillableByStepdown(lk);
+ AsyncTry([svcCtx = opCtx->getServiceContext(), nss] {
+ ThreadClient tc("TriggerReshardingRecovery", svcCtx);
+ {
+ stdx::lock_guard<Client> lk(*tc.get());
+ tc->setSystemOperationKillableByStepdown(lk);
+ }
+
+ auto opCtx = tc->makeOperationContext();
+ onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
+ })
+ .until([](const Status& status) {
+ if (!status.isOK()) {
+ LOGV2_WARNING(5498101,
+ "Error on deferred shardVersion recovery execution",
+ "error"_attr = redact(status));
}
-
- auto opCtx = tc->makeOperationContext();
- onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
- })
- .onError([](const Status& status) {
- LOGV2_WARNING(5498101,
- "Error on deferred shardVersion recovery execution",
- "error"_attr = redact(status));
+ return status.isOK();
})
+ .withBackoffBetweenIterations(kExponentialBackoff)
+ .on(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
+ opCtx->getCancellationToken())
.getAsync([](auto) {});
}
}