SERVER-44597 Make last oplog fetcher for stop timestamp respect initial sync retry strategy

author: Matthew Russotto <matthew.russotto@mongodb.com> 2020-01-09 22:33:39 +0000
committer: evergreen <evergreen@mongodb.com> 2020-01-09 22:33:39 +0000
commit: dd066d87217940cbca42815b119d29cd309f0fbe (patch)
tree: dda3fb437874ef36ab142b1813af1d53eb9c2ecc /src/mongo/db/repl/initial_syncer.cpp
parent: 9d23cfc9f3e3c34c3fd949122d0308204ddc728c (diff)
download: mongo-dd066d87217940cbca42815b119d29cd309f0fbe.tar.gz
1 files changed, 55 insertions, 6 deletions
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index b38e79a8751..14f624c99b7 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -691,13 +691,18 @@ void InitialSyncer::_rollbackCheckerResetCallback(
         return;
     }
 
+    // Since the beginFetchingOpTime is retrieved before significant work is done copying
+    // data from the sync source, we allow the OplogEntryFetcher to use its default retry strategy
+    // which retries up to 'numInitialSyncOplogFindAttempts' times.  This will fail relatively
+    // quickly in the presence of network errors, allowing us to choose a different sync source.
     status = _scheduleLastOplogEntryFetcher_inlock(
         [=](const StatusWith<mongo::Fetcher::QueryResponse>& response,
             mongo::Fetcher::NextAction*,
             mongo::BSONObjBuilder*) mutable {
             _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime(response,
                                                                         onCompletionGuard);
-        });
+        },
+        kFetcherHandlesRetries);
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -834,13 +839,18 @@ void InitialSyncer::_getBeginFetchingOpTimeCallback(
         initialSyncHangAfterGettingBeginFetchingTimestamp.pauseWhileSet();
     }
 
+    // Since the beginFetchingOpTime is retrieved before significant work is done copying
+    // data from the sync source, we allow the OplogEntryFetcher to use its default retry strategy
+    // which retries up to 'numInitialSyncOplogFindAttempts' times.  This will fail relatively
+    // quickly in the presence of network errors, allowing us to choose a different sync source.
     status = _scheduleLastOplogEntryFetcher_inlock(
         [=](const StatusWith<mongo::Fetcher::QueryResponse>& response,
             mongo::Fetcher::NextAction*,
             mongo::BSONObjBuilder*) mutable {
             _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp(
                 response, onCompletionGuard, beginFetchingOpTime);
-        });
+        },
+        kFetcherHandlesRetries);
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1145,12 +1155,18 @@ void InitialSyncer::_allDatabaseClonerCallback(
         return;
     }
 
+    // Since the stopTimestamp is retrieved after we have done all the work of retrieving collection
+    // data, we handle retries within this class by retrying for
+    // 'initialSyncTransientErrorRetryPeriodSeconds' (default 24 hours).  This is the same retry
+    // strategy used when retrieving collection data, and avoids retrieving all the data and then
+    // throwing it away due to a transient network outage.
     status = _scheduleLastOplogEntryFetcher_inlock(
         [=](const StatusWith<mongo::Fetcher::QueryResponse>& status,
             mongo::Fetcher::NextAction*,
             mongo::BSONObjBuilder*) {
             _lastOplogEntryFetcherCallbackForStopTimestamp(status, onCompletionGuard);
-        });
+        },
+        kInitialSyncerHandlesRetries);
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1165,6 +1181,35 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForStopTimestamp(
         stdx::lock_guard<Latch> lock(_mutex);
         auto status = _checkForShutdownAndConvertStatus_inlock(
             result.getStatus(), "error fetching last oplog entry for stop timestamp");
+        if (_shouldRetryNetworkError(lock, status)) {
+            auto scheduleStatus = _exec->scheduleWork(
+                [this, onCompletionGuard](executor::TaskExecutor::CallbackArgs args) {
+                    // It is not valid to schedule the retry from within this callback,
+                    // hence we schedule a lambda to schedule the retry.
+                    stdx::lock_guard<Latch> lock(_mutex);
+                    // Since the stopTimestamp is retrieved after we have done all the work of
+                    // retrieving collection data, we handle retries within this class by retrying
+                    // for 'initialSyncTransientErrorRetryPeriodSeconds' (default 24 hours).  This
+                    // is the same retry strategy used when retrieving collection data, and avoids
+                    // retrieving all the data and then throwing it away due to a transient network
+                    // outage.
+                    auto status = _scheduleLastOplogEntryFetcher_inlock(
+                        [=](const StatusWith<mongo::Fetcher::QueryResponse>& status,
+                            mongo::Fetcher::NextAction*,
+                            mongo::BSONObjBuilder*) {
+                            _lastOplogEntryFetcherCallbackForStopTimestamp(status,
+                                                                           onCompletionGuard);
+                        },
+                        kInitialSyncerHandlesRetries);
+                    if (!status.isOK()) {
+                        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+                    }
+                });
+            if (scheduleStatus.isOK())
+                return;
+            // If scheduling failed, we're shutting down and cannot retry.
+            // So just continue with the original failed status.
+        }
         if (!status.isOK()) {
             onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
             return;
@@ -1529,7 +1574,8 @@ void InitialSyncer::_finishCallback(StatusWith<OpTimeAndWallTime> lastApplied) {
     }
 }
 
-Status InitialSyncer::_scheduleLastOplogEntryFetcher_inlock(Fetcher::CallbackFn callback) {
+Status InitialSyncer::_scheduleLastOplogEntryFetcher_inlock(
+    Fetcher::CallbackFn callback, LastOplogEntryFetcherRetryStrategy retryStrategy) {
     BSONObj query = BSON("find" << _opts.remoteOplogNS.coll() << "sort" << BSON("$natural" << -1)
                                 << "limit" << 1);
 
@@ -1542,8 +1588,11 @@ Status InitialSyncer::_scheduleLastOplogEntryFetcher_inlock(Fetcher::CallbackFn
         ReadPreferenceSetting::secondaryPreferredMetadata(),
         RemoteCommandRequest::kNoTimeout /* find network timeout */,
         RemoteCommandRequest::kNoTimeout /* getMore network timeout */,
-        RemoteCommandRetryScheduler::makeRetryPolicy<ErrorCategory::RetriableError>(
-            numInitialSyncOplogFindAttempts.load(), executor::RemoteCommandRequest::kNoTimeout));
+        (retryStrategy == kFetcherHandlesRetries)
+            ? RemoteCommandRetryScheduler::makeRetryPolicy<ErrorCategory::RetriableError>(
+                  numInitialSyncOplogFindAttempts.load(),
+                  executor::RemoteCommandRequest::kNoTimeout)
+            : RemoteCommandRetryScheduler::makeNoRetryPolicy());
     Status scheduleStatus = _lastOplogEntryFetcher->schedule();
     if (!scheduleStatus.isOK()) {
         _lastOplogEntryFetcher.reset();
author	Matthew Russotto <matthew.russotto@mongodb.com>	2020-01-09 22:33:39 +0000
committer	evergreen <evergreen@mongodb.com>	2020-01-09 22:33:39 +0000
commit	dd066d87217940cbca42815b119d29cd309f0fbe (patch)
tree	dda3fb437874ef36ab142b1813af1d53eb9c2ecc /src/mongo/db/repl/initial_syncer.cpp
parent	9d23cfc9f3e3c34c3fd949122d0308204ddc728c (diff)
download	mongo-dd066d87217940cbca42815b119d29cd309f0fbe.tar.gz