From 7f6d0bad957c5b538538b41a41a16102ee71357d Mon Sep 17 00:00:00 2001
From: Brett Nawrocki <brett.nawrocki@mongodb.com>
Date: Thu, 16 Dec 2021 19:57:56 +0000
Subject: SERVER-61976 Retry failed shard version refreshes on step up

On step-up, shards will clear the filtering metadata and schedule a
shard version refresh for the source collection and the temporary
resharding collection. The shard version refresh triggered through
onShardVersionMismatch() can fail with an error and not complete. This
can leave a recipient shard waiting to learn that all donor shards are
prepared to donate, or can leave a donor shard waiting to learn that
all recipient shards have finished cloning.

Therefore, shards will now retry on errors until the refresh
completes successfully.

(cherry picked from commit 70417bcbe6ca27b9e20455de5e77313ef68c648a)
(cherry picked from commit 00591f7a441e452d70af288a4376272a52fcd638)
---
 .../resharding_donor_recipient_common.cpp          | 35 +++++++++++++---------
 1 file changed, 21 insertions(+), 14 deletions(-)
diff --git a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
index 2e9358925a7..a35828a2d44 100644
--- a/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
+++ b/src/mongo/db/s/resharding/resharding_donor_recipient_common.cpp
@@ -53,6 +53,8 @@ using RecipientStateMachine = ReshardingRecipientService::RecipientStateMachine;
 namespace {
 using namespace fmt::literals;
 
+const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
+
 /*
  * Creates a ReshardingStateMachine if this node is primary and the ReshardingStateMachine doesn't
  * already exist.
@@ -328,22 +330,27 @@ void clearFilteringMetadata(OperationContext* opCtx, bool scheduleAsyncRefresh)
             continue;
         }
 
-        ExecutorFuture<void>(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor())
-            .then([svcCtx = opCtx->getServiceContext(), nss] {
-                ThreadClient tc("TriggerReshardingRecovery", svcCtx);
-                {
-                    stdx::lock_guard<Client> lk(*tc.get());
-                    tc->setSystemOperationKillableByStepdown(lk);
+        AsyncTry([svcCtx = opCtx->getServiceContext(), nss] {
+            ThreadClient tc("TriggerReshardingRecovery", svcCtx);
+            {
+                stdx::lock_guard<Client> lk(*tc.get());
+                tc->setSystemOperationKillableByStepdown(lk);
+            }
+
+            auto opCtx = tc->makeOperationContext();
+            onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
+        })
+            .until([](const Status& status) {
+                if (!status.isOK()) {
+                    LOGV2_WARNING(5498101,
+                                  "Error on deferred shardVersion recovery execution",
+                                  "error"_attr = redact(status));
                 }
-
-                auto opCtx = tc->makeOperationContext();
-                onShardVersionMismatch(opCtx.get(), nss, boost::none /* shardVersionReceived */);
-            })
-            .onError([](const Status& status) {
-                LOGV2_WARNING(5498101,
-                              "Error on deferred shardVersion recovery execution",
-                              "error"_attr = redact(status));
+                return status.isOK();
             })
+            .withBackoffBetweenIterations(kExponentialBackoff)
+            .on(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
+                CancellationToken::uncancelable())
             .getAsync([](auto) {});
     }
 }
-- 
cgit v1.2.1