diff options
author | Louis Williams <louis.williams@mongodb.com> | 2020-05-12 13:39:31 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-12 18:03:48 +0000 |
commit | 25c694f365db0f07a445bd17b6cd5cbf32f5f2f9 (patch) | |
tree | c90451e347838f428b8cad851531b42c42cce6fa /src/mongo/db/repl | |
parent | e2602ad053b2120982fbcac8e33e1ad64e6ec30a (diff) | |
download | mongo-25c694f365db0f07a445bd17b6cd5cbf32f5f2f9.tar.gz |
SERVER-46721 Secondary readers should read at the no-overlap time instead of lastApplied
The no-overlap time, ReadSource::kNoOverlap, is the minimum of replication's lastApplied timestamp
and WiredTiger's all_durable time. This time is independent of replication state and ensures
queries do not see oplog holes after state transitions from secondary to primary.
Diffstat (limited to 'src/mongo/db/repl')
9 files changed, 22 insertions, 18 deletions
diff --git a/src/mongo/db/repl/oplog_batcher.cpp b/src/mongo/db/repl/oplog_batcher.cpp index 27653ab21dc..aba27772547 100644 --- a/src/mongo/db/repl/oplog_batcher.cpp +++ b/src/mongo/db/repl/oplog_batcher.cpp @@ -363,8 +363,8 @@ std::size_t getBatchLimitOplogBytes(OperationContext* opCtx, StorageInterface* s // We can't change the timestamp source within a write unit of work. invariant(!opCtx->lockState()->inAWriteUnitOfWork()); // We're only reading oplog metadata, so the timestamp is not important. If we read with the - // default (which is kLastApplied on secondaries), we may end up with a reader that is at - // kLastApplied. If we then roll back, then when we reconstruct prepared transactions during + // default (which is lastApplied on secondaries), we may end up with a reader that is at + // lastApplied. If we then roll back, then when we reconstruct prepared transactions during // rollback recovery we will be preparing transactions before the read timestamp, which triggers // an assertion in WiredTiger. ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp); diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h index e5ca3cfe12c..29cc6dd9c5b 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state.h +++ b/src/mongo/db/repl/replication_coordinator_external_state.h @@ -260,11 +260,11 @@ public: virtual void updateCommittedSnapshot(const OpTime& newCommitPoint) = 0; /** - * Updates the local snapshot to a consistent point for secondary reads. + * Updates the lastApplied snapshot to a consistent point for secondary reads. * - * It is illegal to call with a optime that does not name an existing snapshot. + * It is illegal to call with a non-existent optime. */ - virtual void updateLocalSnapshot(const OpTime& optime) = 0; + virtual void updateLastAppliedSnapshot(const OpTime& optime) = 0; /** * Returns whether or not the SnapshotThread is active. diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 9fa64da2a2e..ce80ab0acac 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -356,7 +356,7 @@ void ReplicationCoordinatorExternalStateImpl::clearAppliedThroughIfCleanShutdown } // Ensure that all writes are visible before reading. If we failed mid-batch, it would be - // possible to read from a kLastApplied ReadSource where not all writes to the minValid document + // possible to read from a kNoOverlap ReadSource where not all writes to the minValid document // are visible, generating a writeConflict that would not resolve. opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); @@ -972,10 +972,10 @@ void ReplicationCoordinatorExternalStateImpl::updateCommittedSnapshot( notifyOplogMetadataWaiters(newCommitPoint); } -void ReplicationCoordinatorExternalStateImpl::updateLocalSnapshot(const OpTime& optime) { +void ReplicationCoordinatorExternalStateImpl::updateLastAppliedSnapshot(const OpTime& optime) { auto manager = _service->getStorageEngine()->getSnapshotManager(); if (manager) { - manager->setLocalSnapshot(optime.getTimestamp()); + manager->setLastApplied(optime.getTimestamp()); } } diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h index d1d12e285b3..f38aee76a39 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h @@ -104,7 +104,7 @@ public: virtual bool tooStale(); void dropAllSnapshots() final; void updateCommittedSnapshot(const OpTime& newCommitPoint) final; - void updateLocalSnapshot(const OpTime& optime) final; + void updateLastAppliedSnapshot(const OpTime& optime) final; virtual bool snapshotsEnabled() const; virtual void notifyOplogMetadataWaiters(const OpTime& committedOpTime); boost::optional<OpTime> getEarliestDropPendingOpTime() const final; diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp index 84986f783c8..4c9ddbaf7f1 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp @@ -250,7 +250,7 @@ void ReplicationCoordinatorExternalStateMock::dropAllSnapshots() {} void ReplicationCoordinatorExternalStateMock::updateCommittedSnapshot( const OpTime& newCommitPoint) {} -void ReplicationCoordinatorExternalStateMock::updateLocalSnapshot(const OpTime& optime) {} +void ReplicationCoordinatorExternalStateMock::updateLastAppliedSnapshot(const OpTime& optime) {} bool ReplicationCoordinatorExternalStateMock::snapshotsEnabled() const { return _areSnapshotsEnabled; diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h index 0ef9ad2e893..fd867df8ac7 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h +++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h @@ -93,7 +93,7 @@ public: virtual bool tooStale(); virtual void dropAllSnapshots(); virtual void updateCommittedSnapshot(const OpTime& newCommitPoint); - virtual void updateLocalSnapshot(const OpTime& optime); + virtual void updateLastAppliedSnapshot(const OpTime& optime); virtual bool snapshotsEnabled() const; virtual void notifyOplogMetadataWaiters(const OpTime& committedOpTime); boost::optional<OpTime> getEarliestDropPendingOpTime() const final; diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index ebde2673416..bcae984ce91 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -1389,6 +1389,14 @@ void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndWallTime( // No need to wake up replication waiters because there should not be any replication waiters // waiting on our own lastApplied. + // Update the storage engine's lastApplied snapshot before updating the stable timestamp on the + // storage engine. New transactions reading from the lastApplied snapshot should start before + // the oldest timestamp is advanced to avoid races. Additionally, update this snapshot before + // signaling optime waiters. This avoids a race that would allow optime waiters to open + // transactions on stale lastApplied values because they do not hold or reacquire the + // replication coordinator mutex when signaled. + _externalState->updateLastAppliedSnapshot(opTime); + // Signal anyone waiting on optime changes. _opTimeWaiterList.setValueIf_inlock( [opTime](const OpTime& waitOpTime, const SharedWaiterHandle& waiter) { @@ -1396,11 +1404,6 @@ void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndWallTime( }, opTime); - // Update the local snapshot before updating the stable timestamp on the storage engine. New - // transactions reading from the local snapshot should start before the oldest timestamp is - // advanced to avoid races. - _externalState->updateLocalSnapshot(opTime); - // Notify the oplog waiters after updating the local snapshot. signalOplogWaiters(); diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp index 82c406dc480..fc680d1a67a 100644 --- a/src/mongo/db/repl/replication_recovery.cpp +++ b/src/mongo/db/repl/replication_recovery.cpp @@ -130,6 +130,7 @@ public: _oplogApplicationEndPoint(oplogApplicationEndPoint) {} void startup(OperationContext* opCtx) final { + opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); _client = std::make_unique<DBDirectClient>(opCtx); BSONObj predicate = _oplogApplicationEndPoint ? BSON("$gte" << _oplogApplicationStartPoint << "$lte" << *_oplogApplicationEndPoint) diff --git a/src/mongo/db/repl/transaction_oplog_application.cpp b/src/mongo/db/repl/transaction_oplog_application.cpp index b9f75c07d1a..8608926919c 100644 --- a/src/mongo/db/repl/transaction_oplog_application.cpp +++ b/src/mongo/db/repl/transaction_oplog_application.cpp @@ -512,8 +512,8 @@ void reconstructPreparedTransactions(OperationContext* opCtx, repl::OplogApplica } // Read the transactions table and the oplog collection without a timestamp. // The below DBDirectClient read uses AutoGetCollectionForRead which could implicitly change the - // read source to kLastApplied. So we need to explicitly set the read source to kNoTimestamp to - // force reads in this scope to be untimestamped. + // read source. So we need to explicitly set the read source to kNoTimestamp to force reads in + // this scope to be untimestamped. ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp); DBDirectClient client(opCtx); |