summary | refs | log | tree | commit | diff
path: root/src/mongo/db/repl
diff options
context:
space:
mode:
author: Louis Williams <louis.williams@mongodb.com> 2020-05-12 13:39:31 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-05-12 18:03:48 +0000
commit: 25c694f365db0f07a445bd17b6cd5cbf32f5f2f9 (patch)
tree: c90451e347838f428b8cad851531b42c42cce6fa /src/mongo/db/repl
parent: e2602ad053b2120982fbcac8e33e1ad64e6ec30a (diff)
download: mongo-25c694f365db0f07a445bd17b6cd5cbf32f5f2f9.tar.gz
SERVER-46721 Secondary readers should read at the no-overlap time instead of lastApplied
The no-overlap time, ReadSource::kNoOverlap, is the minimum of replication's lastApplied timestamp and WiredTiger's all_durable time. This time is independent of replication state and ensures queries do not see oplog holes after state transitions from secondary to primary.
Diffstat (limited to 'src/mongo/db/repl')
-rw-r--r-- src/mongo/db/repl/oplog_batcher.cpp | 4
-rw-r--r-- src/mongo/db/repl/replication_coordinator_external_state.h | 6
-rw-r--r-- src/mongo/db/repl/replication_coordinator_external_state_impl.cpp | 6
-rw-r--r-- src/mongo/db/repl/replication_coordinator_external_state_impl.h | 2
-rw-r--r-- src/mongo/db/repl/replication_coordinator_external_state_mock.cpp | 2
-rw-r--r-- src/mongo/db/repl/replication_coordinator_external_state_mock.h | 2
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.cpp | 13
-rw-r--r-- src/mongo/db/repl/replication_recovery.cpp | 1
-rw-r--r-- src/mongo/db/repl/transaction_oplog_application.cpp | 4
9 files changed, 22 insertions, 18 deletions
diff --git a/src/mongo/db/repl/oplog_batcher.cpp b/src/mongo/db/repl/oplog_batcher.cpp
index 27653ab21dc..aba27772547 100644
--- a/src/mongo/db/repl/oplog_batcher.cpp
+++ b/src/mongo/db/repl/oplog_batcher.cpp
@@ -363,8 +363,8 @@ std::size_t getBatchLimitOplogBytes(OperationContext* opCtx, StorageInterface* s
// We can't change the timestamp source within a write unit of work.
invariant(!opCtx->lockState()->inAWriteUnitOfWork());
// We're only reading oplog metadata, so the timestamp is not important. If we read with the
- // default (which is kLastApplied on secondaries), we may end up with a reader that is at
- // kLastApplied. If we then roll back, then when we reconstruct prepared transactions during
+ // default (which is lastApplied on secondaries), we may end up with a reader that is at
+ // lastApplied. If we then roll back, then when we reconstruct prepared transactions during
// rollback recovery we will be preparing transactions before the read timestamp, which triggers
// an assertion in WiredTiger.
ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index e5ca3cfe12c..29cc6dd9c5b 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -260,11 +260,11 @@ public:
virtual void updateCommittedSnapshot(const OpTime& newCommitPoint) = 0;
/**
- * Updates the local snapshot to a consistent point for secondary reads.
+ * Updates the lastApplied snapshot to a consistent point for secondary reads.
*
- * It is illegal to call with a optime that does not name an existing snapshot.
+ * It is illegal to call with a non-existent optime.
*/
- virtual void updateLocalSnapshot(const OpTime& optime) = 0;
+ virtual void updateLastAppliedSnapshot(const OpTime& optime) = 0;
/**
* Returns whether or not the SnapshotThread is active.
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 9fa64da2a2e..ce80ab0acac 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -356,7 +356,7 @@ void ReplicationCoordinatorExternalStateImpl::clearAppliedThroughIfCleanShutdown
}
// Ensure that all writes are visible before reading. If we failed mid-batch, it would be
- // possible to read from a kLastApplied ReadSource where not all writes to the minValid document
+ // possible to read from a kNoOverlap ReadSource where not all writes to the minValid document
// are visible, generating a writeConflict that would not resolve.
opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
@@ -972,10 +972,10 @@ void ReplicationCoordinatorExternalStateImpl::updateCommittedSnapshot(
notifyOplogMetadataWaiters(newCommitPoint);
}
-void ReplicationCoordinatorExternalStateImpl::updateLocalSnapshot(const OpTime& optime) {
+void ReplicationCoordinatorExternalStateImpl::updateLastAppliedSnapshot(const OpTime& optime) {
auto manager = _service->getStorageEngine()->getSnapshotManager();
if (manager) {
- manager->setLocalSnapshot(optime.getTimestamp());
+ manager->setLastApplied(optime.getTimestamp());
}
}
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index d1d12e285b3..f38aee76a39 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -104,7 +104,7 @@ public:
virtual bool tooStale();
void dropAllSnapshots() final;
void updateCommittedSnapshot(const OpTime& newCommitPoint) final;
- void updateLocalSnapshot(const OpTime& optime) final;
+ void updateLastAppliedSnapshot(const OpTime& optime) final;
virtual bool snapshotsEnabled() const;
virtual void notifyOplogMetadataWaiters(const OpTime& committedOpTime);
boost::optional<OpTime> getEarliestDropPendingOpTime() const final;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index 84986f783c8..4c9ddbaf7f1 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -250,7 +250,7 @@ void ReplicationCoordinatorExternalStateMock::dropAllSnapshots() {}
void ReplicationCoordinatorExternalStateMock::updateCommittedSnapshot(
const OpTime& newCommitPoint) {}
-void ReplicationCoordinatorExternalStateMock::updateLocalSnapshot(const OpTime& optime) {}
+void ReplicationCoordinatorExternalStateMock::updateLastAppliedSnapshot(const OpTime& optime) {}
bool ReplicationCoordinatorExternalStateMock::snapshotsEnabled() const {
return _areSnapshotsEnabled;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index 0ef9ad2e893..fd867df8ac7 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -93,7 +93,7 @@ public:
virtual bool tooStale();
virtual void dropAllSnapshots();
virtual void updateCommittedSnapshot(const OpTime& newCommitPoint);
- virtual void updateLocalSnapshot(const OpTime& optime);
+ virtual void updateLastAppliedSnapshot(const OpTime& optime);
virtual bool snapshotsEnabled() const;
virtual void notifyOplogMetadataWaiters(const OpTime& committedOpTime);
boost::optional<OpTime> getEarliestDropPendingOpTime() const final;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index ebde2673416..bcae984ce91 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1389,6 +1389,14 @@ void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndWallTime(
// No need to wake up replication waiters because there should not be any replication waiters
// waiting on our own lastApplied.
+ // Update the storage engine's lastApplied snapshot before updating the stable timestamp on the
+ // storage engine. New transactions reading from the lastApplied snapshot should start before
+ // the oldest timestamp is advanced to avoid races. Additionally, update this snapshot before
+ // signaling optime waiters. This avoids a race that would allow optime waiters to open
+ // transactions on stale lastApplied values because they do not hold or reacquire the
+ // replication coordinator mutex when signaled.
+ _externalState->updateLastAppliedSnapshot(opTime);
+
// Signal anyone waiting on optime changes.
_opTimeWaiterList.setValueIf_inlock(
[opTime](const OpTime& waitOpTime, const SharedWaiterHandle& waiter) {
@@ -1396,11 +1404,6 @@ void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndWallTime(
},
opTime);
- // Update the local snapshot before updating the stable timestamp on the storage engine. New
- // transactions reading from the local snapshot should start before the oldest timestamp is
- // advanced to avoid races.
- _externalState->updateLocalSnapshot(opTime);
-
// Notify the oplog waiters after updating the local snapshot.
signalOplogWaiters();
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 82c406dc480..fc680d1a67a 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -130,6 +130,7 @@ public:
_oplogApplicationEndPoint(oplogApplicationEndPoint) {}
void startup(OperationContext* opCtx) final {
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
_client = std::make_unique<DBDirectClient>(opCtx);
BSONObj predicate = _oplogApplicationEndPoint
? BSON("$gte" << _oplogApplicationStartPoint << "$lte" << *_oplogApplicationEndPoint)
diff --git a/src/mongo/db/repl/transaction_oplog_application.cpp b/src/mongo/db/repl/transaction_oplog_application.cpp
index b9f75c07d1a..8608926919c 100644
--- a/src/mongo/db/repl/transaction_oplog_application.cpp
+++ b/src/mongo/db/repl/transaction_oplog_application.cpp
@@ -512,8 +512,8 @@ void reconstructPreparedTransactions(OperationContext* opCtx, repl::OplogApplica
}
// Read the transactions table and the oplog collection without a timestamp.
// The below DBDirectClient read uses AutoGetCollectionForRead which could implicitly change the
- // read source to kLastApplied. So we need to explicitly set the read source to kNoTimestamp to
- // force reads in this scope to be untimestamped.
+ // read source. So we need to explicitly set the read source to kNoTimestamp to force reads in
+ // this scope to be untimestamped.
ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
DBDirectClient client(opCtx);