From 7f5abac0250d152fc9cfa44effe45a842778b450 Mon Sep 17 00:00:00 2001
From: Suganthi Mani
Date: Thu, 16 Jan 2020 21:57:40 +0000
Subject: SERVER-41333 Make it safer for initial sync and shutdown to write the
 minValid document at the lastApplied.

---
 src/mongo/db/db.cpp                                | 30 +++++++++---
 .../repl/replication_consistency_markers_impl.cpp  | 15 +++---
 src/mongo/db/repl/replication_coordinator.h        |  9 ++++
 .../repl/replication_coordinator_external_state.h  |  9 ++++
 ...replication_coordinator_external_state_impl.cpp | 53 ++++++++++++++--------
 .../replication_coordinator_external_state_impl.h  |  3 ++
 ...replication_coordinator_external_state_mock.cpp |  3 ++
 .../replication_coordinator_external_state_mock.h  |  1 +
 src/mongo/db/repl/replication_coordinator_impl.cpp |  4 ++
 src/mongo/db/repl/replication_coordinator_impl.h   |  2 +
 src/mongo/db/repl/replication_coordinator_mock.cpp |  4 ++
 src/mongo/db/repl/replication_coordinator_mock.h   |  2 +
 src/mongo/db/repl/replication_coordinator_noop.cpp |  2 +
 src/mongo/db/repl/replication_coordinator_noop.h   |  2 +
 src/mongo/dbtests/storage_timestamp_tests.cpp      |  2 +-
 .../embedded/replication_coordinator_embedded.cpp  |  2 +
 .../embedded/replication_coordinator_embedded.h    |  2 +
 17 files changed, 113 insertions(+), 32 deletions(-)

diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index 8b1c28c11b2..65fb74748a5 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -992,19 +992,37 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) {
         repl::ReplicationStateTransitionLockGuard rstl(
             opCtx, MODE_X, repl::ReplicationStateTransitionLockGuard::EnqueueOnly());
 
-        // Kill all operations. After this point, the opCtx will have been marked as killed and will
-        // not be usable other than to kill all transactions directly below.
+        // Kill all operations, and make all newly created opCtxs immediately interrupted.
+        // After this point, the opCtx will have been marked as killed and will not be usable
+        // other than to kill all transactions directly below.
         serviceContext->setKillAllOperations();
 
         // Destroy all stashed transaction resources, in order to release locks.
         killSessionsLocalShutdownAllTransactions(opCtx);
 
         rstl.waitForLockUntil(Date_t::max());
-    }
 
-    // Shuts down the thread pool and waits for index builds to finish.
-    // Depends on setKillAllOperations() above to interrupt the index build operations.
-    IndexBuildsCoordinator::get(serviceContext)->shutdown();
+        // Release the RSTL before waiting for the index build threads to join, as index builds
+        // reacquire the RSTL in an uninterruptible lock guard to finish their cleanup.
+        rstl.release();
+
+        // Shuts down the thread pool and waits for index builds to finish.
+        // Depends on setKillAllOperations() above to interrupt the index build operations.
+        IndexBuildsCoordinator::get(serviceContext)->shutdown();
+
+        // No new readers can come in after releasing the RSTL because, before releasing it, we
+        // made sure that all new operations will be immediately interrupted by setting
+        // ServiceContext::_globalKill to true. Reacquire the RSTL in mode X.
+        rstl.reacquire();
+
+        // We are expected to have no active readers while performing the
+        // markAsCleanShutdownIfPossible() step. We guarantee that there are no active readers at
+        // this point due to:
+        // 1) Acquiring the RSTL in mode X, as all readers (except single-phase hybrid index
+        //    builds on secondaries) are expected to hold the RSTL in mode IX.
+        // 2) Waiting for all index builds to finish.
+        repl::ReplicationCoordinator::get(serviceContext)->markAsCleanShutdownIfPossible(opCtx);
+    }
 
     ReplicaSetMonitor::shutdown();
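The db.cpp hunk above encodes a lock-ordering subtlety: index builds must be joined while the RSTL is released (they reacquire it during cleanup), yet the clean-shutdown write must happen with the RSTL reacquired in mode X. A self-contained sketch of that ordering follows; every type and function in it is a stub standing in for the MongoDB machinery named above (RstlGuard for ReplicationStateTransitionLockGuard, and so on), not the real API.

    #include <iostream>

    struct RstlGuard {  // stand-in for ReplicationStateTransitionLockGuard
        RstlGuard() { acquire(); }
        void acquire() { std::cout << "RSTL acquired in mode X\n"; }
        void release() { std::cout << "RSTL released\n"; }
        void reacquire() { acquire(); }
    };

    void killAllOperations() { std::cout << "all ops marked killed\n"; }
    void killAllTransactions() { std::cout << "stashed txn resources destroyed\n"; }
    void joinIndexBuilds() { std::cout << "index builds joined\n"; }
    void markCleanShutdown() { std::cout << "minValid written at lastApplied\n"; }

    int main() {
        RstlGuard rstl;         // enqueue + acquire the RSTL in mode X
        killAllOperations();    // existing and newly created opCtxs get interrupted
        killAllTransactions();  // release locks held by stashed transactions

        rstl.release();         // index builds reacquire the RSTL while cleaning up,
        joinIndexBuilds();      // so they can only be joined with it released

        rstl.reacquire();       // mode X again: no active readers are possible now
        markCleanShutdown();    // safe to do the timestamped minValid write
        return 0;
    }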
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.cpp b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
index d45aa9c492f..8d7046a89f2 100644
--- a/src/mongo/db/repl/replication_consistency_markers_impl.cpp
+++ b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
@@ -155,12 +155,15 @@ void ReplicationConsistencyMarkersImpl::clearInitialSyncFlag(OperationContext* o
                            << MinValidDocument::kMinValidTermFieldName << time.getTerm()
                            << MinValidDocument::kAppliedThroughFieldName << time));
 
-    // We clear the initial sync flag at the 'lastAppliedOpTime'. This is unnecessary, since there
-    // should not be any stable checkpoints being taken that this write could inadvertantly enter.
-    // This 'lastAppliedOpTime' will be the first stable timestamp candidate, so it will be in the
-    // first stable checkpoint taken after initial sync. This provides more clarity than providing
-    // no timestamp.
-    update.timestamp = time.getTimestamp();
+    // Since we haven't yet updated our initialDataTimestamp from
+    // Timestamp::kAllowUnstableCheckpointsSentinel to lastAppliedTimestamp, we are only allowed
+    // to take unstable checkpoints, and this 'lastAppliedTimestamp' will be the timestamp of the
+    // first stable checkpoint taken after initial sync. So there is no way this minValid update
+    // can be part of a stable checkpoint taken earlier than lastAppliedTimestamp, and it is safe
+    // to make it a non-timestamped write. Moreover, it has to be a non-timestamped write because
+    // we may have readers at lastAppliedTimestamp, and committing storage writes before or at
+    // such timestamps is illegal.
+    update.timestamp = Timestamp();
 
     _updateMinValidDocument(opCtx, update);
 
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index b46b7ed6804..b0433698b2e 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -133,6 +133,15 @@ public:
      */
     virtual void shutdown(OperationContext* opCtx) = 0;
 
+    /**
+     * Performs some bookkeeping to make sure that this is a clean shutdown (i.e., the dataset
+     * is consistent with the top of the oplog).
+     * This should be called after calling shutdown(), and the caller must make sure that there
+     * are no active readers while this method executes, as it performs timestamped storage
+     * writes at lastAppliedTimestamp.
+     */
+    virtual void markAsCleanShutdownIfPossible(OperationContext* opCtx) = 0;
+
     /**
      * Returns a reference to the parsed command line arguments that are related to replication.
      */
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index e16c35fb016..a2fdb9a1d71 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -101,6 +101,15 @@ public:
      */
    virtual void shutdown(OperationContext* opCtx) = 0;
 
+    /**
+     * Clears appliedThrough to indicate that the dataset is consistent with the top of the
+     * oplog on shutdown.
+     * This should be called after calling shutdown(), and must be called while holding the
+     * RSTL in mode X so that there are no active readers while this method executes, as it
+     * performs timestamped minValid writes at lastAppliedTimestamp.
+     */
+    virtual void clearAppliedThroughIfCleanShutdown(OperationContext* opCtx) = 0;
+
     /**
      * Returns task executor for scheduling tasks to be run asynchronously.
      */
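The replication_consistency_markers_impl.cpp change above rests on a storage-engine visibility rule: an untimestamped write is visible at every read timestamp and lands in every checkpoint, while committing a write at or before an active reader's read timestamp is illegal. A minimal model of that rule, with uint64_t standing in for Timestamp and both function names illustrative rather than any real WiredTiger or MongoDB API:

    #include <cstdint>

    constexpr uint64_t kUntimestamped = 0;

    // An untimestamped write behaves as if it happened "before all time":
    // it is visible at every read timestamp.
    bool visibleAt(uint64_t writeTs, uint64_t readTs) {
        return writeTs == kUntimestamped || writeTs <= readTs;
    }

    // Timestamped commits must land strictly after every active read timestamp;
    // untimestamped commits are exempt, which is why clearInitialSyncFlag() can
    // run even while readers sit at lastAppliedTimestamp.
    bool commitAllowed(uint64_t writeTs, uint64_t oldestActiveReadTs) {
        return writeTs == kUntimestamped || writeTs > oldestActiveReadTs;
    }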
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index ee09e53c44a..7e8fb6ba89a 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -340,6 +340,40 @@ void ReplicationCoordinatorExternalStateImpl::startThreads(const ReplSettings& s
     _startedThreads = true;
 }
 
+void ReplicationCoordinatorExternalStateImpl::clearAppliedThroughIfCleanShutdown(
+    OperationContext* opCtx) {
+    {
+        stdx::lock_guard lk(_threadMutex);
+        if (!_startedThreads) {
+            return;
+        }
+        invariant(_inShutdown);
+    }
+
+    // Ensure that all writes are visible before reading. If we failed mid-batch, it would be
+    // possible to read from a kLastApplied ReadSource where not all writes to the minValid
+    // document are visible, generating a writeConflict that would not resolve.
+    opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+
+    auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx);
+    if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() &&
+        loadLastOpTimeAndWallTimeResult.isOK() &&
+        loadLastOpTimeAndWallTimeResult.getValue().opTime ==
+            _replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx)) {
+        // Clear the appliedThrough marker to indicate we are consistent with the top of the
+        // oplog. We record this update at the 'lastAppliedOpTime'. If there are any outstanding
+        // checkpoints being taken, they should only reflect this write if they see all writes up
+        // to our 'lastAppliedOpTime'.
+        auto lastAppliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
+
+        invariant(opCtx->lockState()->isRSTLExclusive());
+        // Since we hold the RSTL in mode X, there can't be any active readers, so it's safe to
+        // write the minValid document to storage.
+        _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
+            opCtx, lastAppliedOpTime.getTimestamp());
+    }
+}
+
 void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx) {
     stdx::unique_lock lk(_threadMutex);
     _inShutdown = true;
@@ -370,25 +404,6 @@ void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx)
     // _taskExecutor outside of _threadMutex because once _startedThreads is set to true, the
     // _taskExecutor pointer never changes.
     _taskExecutor->join();
-
-    // Ensure that all writes are visible before reading. If we failed mid-batch, it would be
-    // possible to read from a kLastApplied ReadSource where not all writes to the minValid document
-    // are visible, generating a writeConflict that would not resolve.
-    opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
-
-    auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx);
-    if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() &&
-        loadLastOpTimeAndWallTimeResult.isOK() &&
-        loadLastOpTimeAndWallTimeResult.getValue().opTime ==
-            _replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx)) {
-        // Clear the appliedThrough marker to indicate we are consistent with the top of the
-        // oplog. We record this update at the 'lastAppliedOpTime'. If there are any outstanding
-        // checkpoints being taken, they should only reflect this write if they see all writes up
-        // to our 'lastAppliedOpTime'.
-        auto lastAppliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
-        _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
-            opCtx, lastAppliedOpTime.getTimestamp());
-    }
 }
 
 executor::TaskExecutor* ReplicationCoordinatorExternalStateImpl::getTaskExecutor() const {
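The predicate guarding clearAppliedThrough() above distills to two conditions: nothing is pending in the oplogTruncateAfterPoint (we are not mid-batch), and the top of the oplog matches appliedThrough (everything written has been applied). A simplified restatement, where std::optional stands in for the OpTime/Timestamp null-checks and the struct is illustrative rather than the real ReplicationConsistencyMarkers interface:

    #include <cstdint>
    #include <optional>

    struct ConsistencyMarkers {
        std::optional<uint64_t> oplogTruncateAfterPoint;  // set while mid-batch
        std::optional<uint64_t> appliedThrough;           // set while applying
    };

    // Shutdown is "clean" only when no oplog entries are pending truncation and
    // everything written to the oplog has also been applied.
    bool isCleanShutdown(const ConsistencyMarkers& m, uint64_t topOfOplog) {
        return !m.oplogTruncateAfterPoint && m.appliedThrough &&
            *m.appliedThrough == topOfOplog;
    }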
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index 4ebbeaa9eb6..b57dab30710 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -74,6 +74,9 @@ public:
     virtual bool isInitialSyncFlagSet(OperationContext* opCtx) override;
 
     virtual void shutdown(OperationContext* opCtx);
+
+    virtual void clearAppliedThroughIfCleanShutdown(OperationContext* opCtx);
+
     virtual executor::TaskExecutor* getTaskExecutor() const override;
     virtual ThreadPool* getDbWorkThreadPool() const override;
     virtual Status initializeReplSetStorage(OperationContext* opCtx, const BSONObj& config);
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index 053f4d460a8..4f9e195937a 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -82,6 +82,9 @@ Status ReplicationCoordinatorExternalStateMock::initializeReplSetStorage(Operati
 
 void ReplicationCoordinatorExternalStateMock::shutdown(OperationContext*) {}
 
+void ReplicationCoordinatorExternalStateMock::clearAppliedThroughIfCleanShutdown(
+    OperationContext*) {}
+
 executor::TaskExecutor* ReplicationCoordinatorExternalStateMock::getTaskExecutor() const {
     return nullptr;
 }
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index afcc16ce995..be37389312a 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -66,6 +66,7 @@ public:
     virtual bool isInitialSyncFlagSet(OperationContext* opCtx) override;
 
     virtual void shutdown(OperationContext* opCtx);
+    virtual void clearAppliedThroughIfCleanShutdown(OperationContext* opCtx);
     virtual executor::TaskExecutor* getTaskExecutor() const override;
     virtual ThreadPool* getDbWorkThreadPool() const override;
     virtual Status initializeReplSetStorage(OperationContext* opCtx, const BSONObj& config);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 9eb3a3848af..1d1fed3e8ff 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -899,6 +899,10 @@ void ReplicationCoordinatorImpl::shutdown(OperationContext* opCtx) {
     _replExecutor->join();
 }
 
+void ReplicationCoordinatorImpl::markAsCleanShutdownIfPossible(OperationContext* opCtx) {
+    _externalState->clearAppliedThroughIfCleanShutdown(opCtx);
+}
+
 const ReplSettings& ReplicationCoordinatorImpl::getSettings() const {
     return _settings;
 }
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 5fd6f46dd94..78825755349 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -110,6 +110,8 @@ public:
 
     virtual void shutdown(OperationContext* opCtx) override;
 
+    void markAsCleanShutdownIfPossible(OperationContext* opCtx) override;
+
     virtual const ReplSettings& getSettings() const override;
 
     virtual Mode getReplicationMode() const override;
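The coordinator-side plumbing above is a straight delegation: the public ReplicationCoordinator entry point forwards to the external-state object that owns the consistency markers. In outline, with both classes reduced to the two methods this patch touches (skeletal stand-ins, not the real declarations):

    class OperationContext;

    class ExternalState {
    public:
        virtual ~ExternalState() = default;
        virtual void clearAppliedThroughIfCleanShutdown(OperationContext* opCtx) = 0;
    };

    class Coordinator {
    public:
        explicit Coordinator(ExternalState* externalState) : _externalState(externalState) {}

        // Called from shutdownTask() while the RSTL is held in mode X.
        void markAsCleanShutdownIfPossible(OperationContext* opCtx) {
            _externalState->clearAppliedThroughIfCleanShutdown(opCtx);
        }

    private:
        ExternalState* _externalState;
    };

Keeping the storage logic behind the external-state interface is what lets the mock, no-op, and embedded coordinators below stub the method out.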
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index 835029ac9a9..47fdddc82a6 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -85,6 +85,10 @@ void ReplicationCoordinatorMock::shutdown(OperationContext*) {
     // TODO
 }
 
+void ReplicationCoordinatorMock::markAsCleanShutdownIfPossible(OperationContext*) {
+    // TODO
+}
+
 const ReplSettings& ReplicationCoordinatorMock::getSettings() const {
     return _settings;
 }
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index 2893920d1e5..9daca07d623 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -72,6 +72,8 @@ public:
 
     virtual void shutdown(OperationContext* opCtx);
 
+    void markAsCleanShutdownIfPossible(OperationContext* opCtx) override;
+
     virtual void appendDiagnosticBSON(BSONObjBuilder* bob) override {}
 
     virtual const ReplSettings& getSettings() const;
diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp
index 6f502574645..ffa6fb8b09b 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.cpp
+++ b/src/mongo/db/repl/replication_coordinator_noop.cpp
@@ -43,6 +43,8 @@ void ReplicationCoordinatorNoOp::enterTerminalShutdown() {}
 
 void ReplicationCoordinatorNoOp::shutdown(OperationContext* opCtx) {}
 
+void ReplicationCoordinatorNoOp::markAsCleanShutdownIfPossible(OperationContext* opCtx) {}
+
 ReplicationCoordinator::Mode ReplicationCoordinatorNoOp::getReplicationMode() const {
     return modeReplSet;
 }
diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h
index ea825567454..cbc755a8b76 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.h
+++ b/src/mongo/db/repl/replication_coordinator_noop.h
@@ -53,6 +53,8 @@ public:
 
     void shutdown(OperationContext* opCtx) final;
 
+    void markAsCleanShutdownIfPossible(OperationContext* opCtx) final;
+
     ServiceContext* getServiceContext() final {
         return _service;
     }
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index 05902a7756e..fcacb5041fc 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -1626,7 +1626,7 @@ public:
         expectedMinValidWithUnsetFlag.setAppliedThrough(repl::OpTime(presentTs, presentTerm));
 
         assertMinValidDocumentAtTimestamp(nss, nullTs, expectedMinValidWithUnsetFlag);
-        assertMinValidDocumentAtTimestamp(nss, pastTs, expectedMinValidWithSetFlag);
+        assertMinValidDocumentAtTimestamp(nss, pastTs, expectedMinValidWithUnsetFlag);
         assertMinValidDocumentAtTimestamp(nss, presentTs, expectedMinValidWithUnsetFlag);
         assertMinValidDocumentAtTimestamp(nss, futureTs, expectedMinValidWithUnsetFlag);
     }
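The one-line test change above follows directly from making the flag-clearing update untimestamped: a write with no timestamp is visible at pastTs just as it is at presentTs and futureTs, so every read timestamp now expects the unset flag. A toy check of that expectation, where readAtTimestamp is a stand-in for the test's assertMinValidDocumentAtTimestamp machinery and the timestamp values are arbitrary:

    #include <cassert>
    #include <cstdint>

    struct MinValidDoc {
        bool initialSyncFlagSet;
    };

    // With the update untimestamped, the cleared flag is returned at every read
    // timestamp, including ones earlier than when the update actually happened.
    MinValidDoc readAtTimestamp(uint64_t /*readTs*/) {
        return MinValidDoc{false};
    }

    int main() {
        const uint64_t pastTs = 1, presentTs = 2, futureTs = 3;
        for (uint64_t ts : {pastTs, presentTs, futureTs}) {
            assert(!readAtTimestamp(ts).initialSyncFlagSet);
        }
        return 0;
    }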
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index f8010210e61..9bbe51bdc4f 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -53,6 +53,8 @@ void ReplicationCoordinatorEmbedded::enterTerminalShutdown() {}
 
 void ReplicationCoordinatorEmbedded::shutdown(OperationContext* opCtx) {}
 
+void ReplicationCoordinatorEmbedded::markAsCleanShutdownIfPossible(OperationContext* opCtx) {}
+
 const ReplSettings& ReplicationCoordinatorEmbedded::getSettings() const {
     static ReplSettings _settings;
     return _settings;
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index eac2c5e97ae..7e346575312 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -51,6 +51,8 @@ public:
 
     void shutdown(OperationContext* opCtx) override;
 
+    void markAsCleanShutdownIfPossible(OperationContext* opCtx) override;
+
     // Returns the ServiceContext where this instance runs.
     ServiceContext* getServiceContext() override {
        return _service;
--
cgit v1.2.1