SERVER-61976 Retry failed shard version refreshes on step up

On step-up, shards clear the filtering metadata and schedule a shard version refresh for the source collection and the temporary resharding collection. The refresh triggered through onShardVersionMismatch() can fail with an error and therefore never complete. This can leave a recipient shard waiting to learn that all donor shards are prepared to donate, or leave a donor shard waiting to learn that all recipient shards have finished cloning. Therefore, shards will now retry on errors, with exponential backoff between attempts, until the refresh completes successfully.
author: Brett Nawrocki <brett.nawrocki@mongodb.com> 2021-12-16 19:57:56 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-20 19:16:51 +0000
commit: 70417bcbe6ca27b9e20455de5e77313ef68c648a (patch)
tree: 3efa32fc527a8a0837e143bdb9c064178b74649e
parent: 1f29a83d4cfd61e9d724633532d55b67bbc60148 (diff)
download: mongo-70417bcbe6ca27b9e20455de5e77313ef68c648a.tar.gz
1 files changed, 21 insertions, 14 deletions
diff --git a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
index 6e8fa8e6649..530561b121b 100644
--- a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
+++ b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
@@ -53,6 +53,8 @@ using RecipientStateMachine = ReshardingRecipientService::RecipientStateMachine;
 namespace {
 using namespace fmt::literals;
 
+const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
+
 /*
  * Creates a ReshardingStateMachine if this node is primary and the ReshardingStateMachine doesn't
  * already exist.
@@ -328,22 +330,27 @@ void clearFilteringMetadata(OperationContext* opCtx, bool scheduleAsyncRefresh)
             continue;
         }
 
-        ExecutorFuture<void>(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor())
-            .then([svcCtx = opCtx->getServiceContext(), nss] {
-                ThreadClient tc("TriggerReshardingRecovery", svcCtx);
-                {
-                    stdx::lock_guard<Client> lk(*tc.get());
-                    tc->setSystemOperationKillableByStepdown(lk);
+        AsyncTry([svcCtx = opCtx->getServiceContext(), nss] {
+            ThreadClient tc("TriggerReshardingRecovery", svcCtx);
+            {
+                stdx::lock_guard<Client> lk(*tc.get());
+                tc->setSystemOperationKillableByStepdown(lk);
+            }
+
+            auto opCtx = tc->makeOperationContext();
+            onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
+        })
+            .until([](const Status& status) {
+                if (!status.isOK()) {
+                    LOGV2_WARNING(5498101,
+                                  "Error on deferred shardVersion recovery execution",
+                                  "error"_attr = redact(status));
                 }
-
-                auto opCtx = tc->makeOperationContext();
-                onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
-            })
-            .onError([](const Status& status) {
-                LOGV2_WARNING(5498101,
-                              "Error on deferred shardVersion recovery execution",
-                              "error"_attr = redact(status));
+                return status.isOK();
             })
+            .withBackoffBetweenIterations(kExponentialBackoff)
+            .on(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
+                opCtx->getCancellationToken())
             .getAsync([](auto) {});
     }
 }
author	Brett Nawrocki <brett.nawrocki@mongodb.com>	2021-12-16 19:57:56 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-12-20 19:16:51 +0000
commit	70417bcbe6ca27b9e20455de5e77313ef68c648a (patch)
tree	3efa32fc527a8a0837e143bdb9c064178b74649e
parent	1f29a83d4cfd61e9d724633532d55b67bbc60148 (diff)
download	mongo-70417bcbe6ca27b9e20455de5e77313ef68c648a.tar.gz