SERVER-46984 Stop async updates to the oplogTruncateAfterPoint during primary shutdown that can race with clearing the oplogTruncateAfterPoint for primary clean shutdown.

(cherry picked from commit 5f3e1db10472fcd57615424c10372444a2c8427f)
author: Dianna Hohensee <dianna.hohensee@mongodb.com> 2020-03-19 13:00:46 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-03-20 21:54:42 +0000
commit: 9309d0b1dfb78b700e765c91d0122c83a37edc41 (patch)
tree: bebc8008c19808c052adce0cba9944d6733a5b18 /src
parent: 02e12f2b4acfb8f1d401ed78a04fe4b6e23b9976 (diff)
download: mongo-9309d0b1dfb78b700e765c91d0122c83a37edc41.tar.gz
6 files changed, 22 insertions, 14 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index a2fdb9a1d71..5bce5d1b06d 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -227,10 +227,18 @@ public:
     virtual void shardingOnStepDownHook() = 0;
 
     /**
-     * Clears oplog visibility state. All of the oplog is safely visible because there are no oplog
-     * writes during stepdown.
+     * Stops asynchronous updates to and then clears the oplogTruncateAfterPoint.
+     *
+     * Safe to call when there are no oplog writes, and therefore no oplog holes that must be
+     * tracked by the oplogTruncateAfterPoint.
+     *
+     * Only primaries update the truncate point asynchronously; other replication states update the
+     * truncate point manually as necessary. This function should be called whenever replication
+     * leaves state PRIMARY: on stepdown, and on shutdown while in state PRIMARY. Otherwise, we
+     * might leave a stale oplogTruncateAfterPoint set and cause unnecessary oplog truncation
+     * during startup if the server gets restarted.
      */
-    virtual void clearOplogVisibilityStateForStepDown() = 0;
+    virtual void stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() = 0;
 
     /**
      * Notifies the bgsync and syncSourceFeedback threads to choose a new sync source.
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 514f95dc4fa..54024dde95c 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -408,15 +408,15 @@ void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx)
     // _taskExecutor pointer never changes.
     _taskExecutor->join();
 
-    // Clear the truncate point if we are still primary, so nothing gets truncated unnecessarily on
-    // startup. There are no oplog holes on clean primary shutdown. Stepdown is similarly safe and
-    // clears the truncate point. The other replication states do need truncation if the truncate
-    // point is set: e.g. interruption mid batch application can leave oplog holes.
+    // If we are still primary at shutdown, the oplog truncate after point must be cleared so that
+    // nothing gets truncated unnecessarily on startup. There are no oplog holes on clean primary
+    // shutdown. Stepdown is similarly safe from holes; it halts updates to and clears the truncate
+    // point. The other replication states do need truncation if the truncate point is set: e.g.
+    // interruption mid batch application can leave oplog holes.
     if (!storageGlobalParams.readOnly &&
         _replicationProcess->getConsistencyMarkers()
             ->isOplogTruncateAfterPointBeingUsedForPrimary()) {
-        _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx,
-                                                                                 Timestamp());
+        stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint();
     }
 }
 
@@ -768,7 +768,7 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
     }
 }
 
-void ReplicationCoordinatorExternalStateImpl::clearOplogVisibilityStateForStepDown() {
+void ReplicationCoordinatorExternalStateImpl::stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() {
     auto opCtx = cc().getOperationContext();
     // Temporarily turn off flow control ticketing. Getting a ticket can stall on a ticket being
     // available, which may have to wait for the ticket refresher to run, which in turn blocks on
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index 1d5d904c0f4..5a3b52229bf 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -96,7 +96,7 @@ public:
     virtual HostAndPort getClientHostAndPort(const OperationContext* opCtx);
     virtual void closeConnections();
     virtual void shardingOnStepDownHook();
-    virtual void clearOplogVisibilityStateForStepDown() override;
+    virtual void stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() override;
     virtual void signalApplierToChooseNewSyncSource();
     virtual void stopProducer();
     virtual void startProducerIfStopped();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index e2b0ee9ecfd..5d6305a2c79 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -230,7 +230,7 @@ void ReplicationCoordinatorExternalStateMock::closeConnections() {
 
 void ReplicationCoordinatorExternalStateMock::shardingOnStepDownHook() {}
 
-void ReplicationCoordinatorExternalStateMock::clearOplogVisibilityStateForStepDown() {}
+void ReplicationCoordinatorExternalStateMock::stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() {}
 
 void ReplicationCoordinatorExternalStateMock::signalApplierToChooseNewSyncSource() {}
 
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index be37389312a..1444eaeb1ef 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -85,7 +85,7 @@ public:
     virtual StatusWith<OpTimeAndWallTime> loadLastOpTimeAndWallTime(OperationContext* opCtx);
     virtual void closeConnections();
     virtual void shardingOnStepDownHook();
-    virtual void clearOplogVisibilityStateForStepDown() override;
+    virtual void stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() override;
     virtual void signalApplierToChooseNewSyncSource();
     virtual void stopProducer();
     virtual void startProducerIfStopped();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index bf516432716..023a20ad13d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -3735,7 +3735,7 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction(
             // This code must be safe to run on node rollback and node removal!
             _externalState->shardingOnStepDownHook();
             _externalState->stopNoopWriter();
-            _externalState->clearOplogVisibilityStateForStepDown();
+            _externalState->stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint();
             break;
         case kActionStartSingleNodeElection:
             // In protocol version 1, single node replset will run an election instead of
author	Dianna Hohensee <dianna.hohensee@mongodb.com>	2020-03-19 13:00:46 -0400
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-03-20 21:54:42 +0000
commit	9309d0b1dfb78b700e765c91d0122c83a37edc41 (patch)
tree	bebc8008c19808c052adce0cba9944d6733a5b18 /src
parent	02e12f2b4acfb8f1d401ed78a04fe4b6e23b9976 (diff)
download	mongo-9309d0b1dfb78b700e765c91d0122c83a37edc41.tar.gz