diff options
author | Dianna Hohensee <dianna.hohensee@mongodb.com> | 2020-03-19 13:00:46 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-03-20 21:54:42 +0000 |
commit | 9309d0b1dfb78b700e765c91d0122c83a37edc41 (patch) | |
tree | bebc8008c19808c052adce0cba9944d6733a5b18 /src | |
parent | 02e12f2b4acfb8f1d401ed78a04fe4b6e23b9976 (diff) | |
download | mongo-9309d0b1dfb78b700e765c91d0122c83a37edc41.tar.gz |
SERVER-46984 During primary shutdown, stop async updates to the oplogTruncateAfterPoint, which can race with clearing the oplogTruncateAfterPoint for a clean primary shutdown.
(cherry picked from commit 5f3e1db10472fcd57615424c10372444a2c8427f)
Diffstat (limited to 'src')
6 files changed, 22 insertions, 14 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h index a2fdb9a1d71..5bce5d1b06d 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state.h +++ b/src/mongo/db/repl/replication_coordinator_external_state.h @@ -227,10 +227,18 @@ public: virtual void shardingOnStepDownHook() = 0; /** - * Clears oplog visibility state. All of the oplog is safely visible because there are no oplog - * writes during stepdown. + * Stops asynchronous updates to and then clears the oplogTruncateAfterPoint. + * + * Safe to call when there are no oplog writes, and therefore no oplog holes that must be + * tracked by the oplogTruncateAfterPoint. + * + * Only primaries update the truncate point asynchronously; other replication states update the + * truncate point manually as necessary. This function should be called whenever replication + * leaves state PRIMARY: stepdown; and shutdown while in state PRIMARY. Otherwise, we might + * leave a stale oplogTruncateAfterPoint set and cause unnecessary oplog truncation during + * startup if the server gets restarted. */ - virtual void clearOplogVisibilityStateForStepDown() = 0; + virtual void stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() = 0; /** * Notifies the bgsync and syncSourceFeedback threads to choose a new sync source. diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 514f95dc4fa..54024dde95c 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -408,15 +408,15 @@ void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx) // _taskExecutor pointer never changes. _taskExecutor->join(); - // Clear the truncate point if we are still primary, so nothing gets truncated unnecessarily on - // startup. 
There are no oplog holes on clean primary shutdown. Stepdown is similarly safe and - // clears the truncate point. The other replication states do need truncation if the truncate - // point is set: e.g. interruption mid batch application can leave oplog holes. + // The oplog truncate after point must be cleared, if we are still primary for shutdown, so + // nothing gets truncated unnecessarily on startup. There are no oplog holes on clean primary + // shutdown. Stepdown is similarly safe from holes and halts updates to and clears the truncate + // point. The other replication states do need truncation if the truncate point is set: e.g. + // interruption mid batch application can leave oplog holes. if (!storageGlobalParams.readOnly && _replicationProcess->getConsistencyMarkers() ->isOplogTruncateAfterPointBeingUsedForPrimary()) { - _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, - Timestamp()); + stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint(); } } @@ -768,7 +768,7 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() { } } -void ReplicationCoordinatorExternalStateImpl::clearOplogVisibilityStateForStepDown() { +void ReplicationCoordinatorExternalStateImpl::stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() { auto opCtx = cc().getOperationContext(); // Temporarily turn off flow control ticketing. 
Getting a ticket can stall on a ticket being // available, which may have to wait for the ticket refresher to run, which in turn blocks on diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h index 1d5d904c0f4..5a3b52229bf 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h @@ -96,7 +96,7 @@ public: virtual HostAndPort getClientHostAndPort(const OperationContext* opCtx); virtual void closeConnections(); virtual void shardingOnStepDownHook(); - virtual void clearOplogVisibilityStateForStepDown() override; + virtual void stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() override; virtual void signalApplierToChooseNewSyncSource(); virtual void stopProducer(); virtual void startProducerIfStopped(); diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp index e2b0ee9ecfd..5d6305a2c79 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp @@ -230,7 +230,7 @@ void ReplicationCoordinatorExternalStateMock::closeConnections() { void ReplicationCoordinatorExternalStateMock::shardingOnStepDownHook() {} -void ReplicationCoordinatorExternalStateMock::clearOplogVisibilityStateForStepDown() {} +void ReplicationCoordinatorExternalStateMock::stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() {} void ReplicationCoordinatorExternalStateMock::signalApplierToChooseNewSyncSource() {} diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h index be37389312a..1444eaeb1ef 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h +++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h @@ -85,7 +85,7 @@ 
public: virtual StatusWith<OpTimeAndWallTime> loadLastOpTimeAndWallTime(OperationContext* opCtx); virtual void closeConnections(); virtual void shardingOnStepDownHook(); - virtual void clearOplogVisibilityStateForStepDown() override; + virtual void stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint() override; virtual void signalApplierToChooseNewSyncSource(); virtual void stopProducer(); virtual void startProducerIfStopped(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index bf516432716..023a20ad13d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -3735,7 +3735,7 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction( // This code must be safe to run on node rollback and node removal! _externalState->shardingOnStepDownHook(); _externalState->stopNoopWriter(); - _externalState->clearOplogVisibilityStateForStepDown(); + _externalState->stopAsyncUpdatesOfAndClearOplogTruncateAfterPoint(); break; case kActionStartSingleNodeElection: // In protocol version 1, single node replset will run an election instead of |