summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Williams <louis.williams@mongodb.com>2020-05-12 13:39:31 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-05-12 18:03:48 +0000
commit25c694f365db0f07a445bd17b6cd5cbf32f5f2f9 (patch)
treec90451e347838f428b8cad851531b42c42cce6fa
parente2602ad053b2120982fbcac8e33e1ad64e6ec30a (diff)
downloadmongo-25c694f365db0f07a445bd17b6cd5cbf32f5f2f9.tar.gz
SERVER-46721 Secondary readers should read at the no-overlap time instead of lastApplied
The no-overlap time, ReadSource::kNoOverlap, is the minimum of replication's lastApplied timestamp and WiredTiger's all_durable time. This time is independent of replication state and ensures queries do not see oplog holes after state transitions from secondary to primary.
-rw-r--r--src/mongo/db/SConscript1
-rw-r--r--src/mongo/db/catalog_raii_test.cpp4
-rw-r--r--src/mongo/db/commands/getmore_cmd.cpp3
-rw-r--r--src/mongo/db/db_raii.cpp213
-rw-r--r--src/mongo/db/db_raii.h13
-rw-r--r--src/mongo/db/db_raii_test.cpp114
-rw-r--r--src/mongo/db/index_builds_coordinator.cpp3
-rw-r--r--src/mongo/db/repl/oplog_batcher.cpp4
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state.h6
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_impl.cpp6
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_impl.h2
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_mock.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_mock.h2
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp13
-rw-r--r--src/mongo/db/repl/replication_recovery.cpp1
-rw-r--r--src/mongo/db/repl/transaction_oplog_application.cpp4
-rw-r--r--src/mongo/db/storage/SConscript17
-rw-r--r--src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp60
-rw-r--r--src/mongo/db/storage/recovery_unit.h24
-rw-r--r--src/mongo/db/storage/snapshot_helper.cpp151
-rw-r--r--src/mongo/db/storage/snapshot_helper.h43
-rw-r--r--src/mongo/db/storage/snapshot_manager.h8
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp6
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp61
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp83
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp33
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h22
-rw-r--r--src/mongo/dbtests/storage_timestamp_tests.cpp4
28 files changed, 604 insertions, 299 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 55595f5999b..9519d1207f1 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -602,6 +602,7 @@ env.Library(
LIBDEPS_PRIVATE=[
'$BUILD_DIR/mongo/idl/server_parameter',
'catalog/database_holder',
+ 'storage/snapshot_helper',
],
)
diff --git a/src/mongo/db/catalog_raii_test.cpp b/src/mongo/db/catalog_raii_test.cpp
index 50913896118..733ed190250 100644
--- a/src/mongo/db/catalog_raii_test.cpp
+++ b/src/mongo/db/catalog_raii_test.cpp
@@ -260,8 +260,8 @@ TEST_F(ReadSourceScopeTest, RestoreReadSource) {
ReadSourceScope scope(opCtx());
ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kUnset);
- opCtx()->recoveryUnit()->setTimestampReadSource(ReadSource::kLastApplied);
- ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kLastApplied);
+ opCtx()->recoveryUnit()->setTimestampReadSource(ReadSource::kNoOverlap);
+ ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kNoOverlap);
ASSERT_EQ(opCtx()->recoveryUnit()->getPointInTimeReadTimestamp(), boost::none);
}
ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kProvided);
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index f183225cd8c..97e6f6b79ea 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -148,6 +148,7 @@ void applyCursorReadConcern(OperationContext* opCtx, repl::ReadConcernArgs rcArg
switch (rcArgs.getMajorityReadMechanism()) {
case repl::ReadConcernArgs::MajorityReadMechanism::kMajoritySnapshot: {
// Make sure we read from the majority snapshot.
+ opCtx->recoveryUnit()->abandonSnapshot();
opCtx->recoveryUnit()->setTimestampReadSource(
RecoveryUnit::ReadSource::kMajorityCommitted);
uassertStatusOK(opCtx->recoveryUnit()->obtainMajorityCommittedSnapshot());
@@ -156,6 +157,7 @@ void applyCursorReadConcern(OperationContext* opCtx, repl::ReadConcernArgs rcArg
case repl::ReadConcernArgs::MajorityReadMechanism::kSpeculative: {
// Mark the operation as speculative and select the correct read source.
repl::SpeculativeMajorityReadInfo::get(opCtx).setIsSpeculativeRead();
+ opCtx->recoveryUnit()->abandonSnapshot();
opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
break;
}
@@ -167,6 +169,7 @@ void applyCursorReadConcern(OperationContext* opCtx, repl::ReadConcernArgs rcArg
!opCtx->inMultiDocumentTransaction()) {
auto atClusterTime = rcArgs.getArgsAtClusterTime();
invariant(atClusterTime && *atClusterTime != LogicalTime::kUninitialized);
+ opCtx->recoveryUnit()->abandonSnapshot();
opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
atClusterTime->asTimestamp());
}
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 34775dd7d01..53dda3ca490 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -39,6 +39,7 @@
#include "mongo/db/db_raii_gen.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/s/collection_sharding_state.h"
+#include "mongo/db/storage/snapshot_helper.h"
#include "mongo/logv2/log.h"
namespace mongo {
@@ -112,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
_autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline);
// If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do
- // not consider reading at last applied (e.g. FTDC needs that).
+ // not consider reading at the no-overlap point (e.g. FTDC needs that).
if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp)
return;
@@ -123,10 +124,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
// need to check for pending catalog changes.
while (auto coll = _autoColl->getCollection()) {
- auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
- auto minSnapshot = coll->getMinimumVisibleSnapshot();
- auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
-
// TODO(SERVER-47824): Also ban transaction snapshot reads on capped collections.
uassert(ErrorCodes::SnapshotUnavailable,
"Reading from capped collections with readConcern snapshot is not supported "
@@ -135,77 +132,85 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern ||
opCtx->inMultiDocumentTransaction());
- // If we are reading at a provided timestamp earlier than the latest catalog changes,
- // then we must return an error.
- if (readSource == RecoveryUnit::ReadSource::kProvided && minSnapshot &&
- (*mySnapshot < *minSnapshot)) {
- uasserted(ErrorCodes::SnapshotUnavailable,
- str::stream()
- << "Unable to read from a snapshot due to pending collection catalog "
- "changes; please retry the operation. Snapshot timestamp is "
- << mySnapshot->toString() << ". Collection minimum is "
- << minSnapshot->toString());
- }
-
// During batch application on secondaries, there is a potential to read inconsistent states
// that would normally be protected by the PBWM lock. In order to serve secondary reads
// during this period, we default to not acquiring the lock (by setting
// _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a
// consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
- // guarantee we read at a consistent state, so we must read at the last applied timestamp,
- // which is set after each complete batch.
+ // guarantee we read at a consistent state, so we must read at the no-overlap timestamp,
+ // which is a function of the lastApplied timestamp, which is set after each complete batch.
//
- // If an attempt to read at the last applied timestamp is unsuccessful because there are
- // pending catalog changes that occur after the last applied timestamp, we release our locks
+ // If an attempt to read at the no-overlap timestamp is unsuccessful because there are
+ // pending catalog changes that occur after the no-overlap timestamp, we release our locks
// and try again with the PBWM lock (by unsetting
// _shouldNotConflictWithSecondaryBatchApplicationBlock).
const NamespaceString nss = coll->ns();
+ auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
- bool readAtLastAppliedTimestamp =
- _shouldReadAtLastAppliedTimestamp(opCtx, nss, readConcernLevel);
-
- if (readAtLastAppliedTimestamp) {
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
- readSource = opCtx->recoveryUnit()->getTimestampReadSource();
+ // Once we have our locks, check whether or not we should override the ReadSource that was
+ // set before acquiring locks.
+ if (auto newReadSource = SnapshotHelper::getNewReadSource(opCtx, nss)) {
+ opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource);
+ readSource = *newReadSource;
}
- // This timestamp could be earlier than the timestamp seen when the transaction is opened
- // because it is set asynchonously. This is not problematic because holding the collection
- // lock guarantees no metadata changes will occur in that time.
- auto lastAppliedTimestamp = readAtLastAppliedTimestamp
- ? boost::optional<Timestamp>(replCoord->getMyLastAppliedOpTime().getTimestamp())
- : boost::none;
+ const auto readTimestamp = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
+ const auto afterClusterTime = repl::ReadConcernArgs::get(opCtx).getArgsAfterClusterTime();
+ if (readTimestamp && afterClusterTime) {
+ // Readers that use afterClusterTime have already waited at a higher level for the
+ // lastApplied time to advance to a specified optime, and they assume the read timestamp
+ // of the operation is at least that waited-for timestamp. For kNoOverlap, which is the
+ // minimum of lastApplied and all_durable, this invariant ensures that afterClusterTime
+ // reads do not choose a read timestamp older than the one requested.
+ invariant(*readTimestamp >= afterClusterTime->asTimestamp(),
+ str::stream() << "read timestamp " << readTimestamp->toString()
+ << " was less than afterClusterTime: "
+ << afterClusterTime->asTimestamp().toString());
+ }
- if (!_conflictingCatalogChanges(opCtx, minSnapshot, lastAppliedTimestamp)) {
+ auto minSnapshot = coll->getMinimumVisibleSnapshot();
+ if (!SnapshotHelper::collectionChangesConflictWithRead(minSnapshot, readTimestamp)) {
return;
}
- invariant(lastAppliedTimestamp ||
- // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
- // that are safe with respect to concurrent secondary batch application.
- readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
- readSource == RecoveryUnit::ReadSource::kNoOverlap);
+ // If we are reading at a provided timestamp earlier than the latest catalog changes,
+ // then we must return an error.
+ if (readSource == RecoveryUnit::ReadSource::kProvided) {
+ uasserted(ErrorCodes::SnapshotUnavailable,
+ str::stream()
+ << "Unable to read from a snapshot due to pending collection catalog "
+ "changes; please retry the operation. Snapshot timestamp is "
+ << readTimestamp->toString() << ". Collection minimum is "
+ << minSnapshot->toString());
+ }
+
+ invariant(
+ // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
+ // that are safe with respect to concurrent secondary batch application, and are
+ // eligible for retrying.
+ readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
+ readSource == RecoveryUnit::ReadSource::kNoOverlap);
invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern);
// Yield locks in order to do the blocking call below.
_autoColl = boost::none;
- // If there are pending catalog changes, we should conflict with any in-progress batches (by
- // taking the PBWM lock) and choose not to read from the last applied timestamp by unsetting
- // _shouldNotConflictWithSecondaryBatchApplicationBlock. Index builds on secondaries can
- // complete at timestamps later than the lastAppliedTimestamp during initial sync. After
- // initial sync finishes, if we waited instead of retrying, readers would block indefinitely
- // waiting for the lastAppliedTimestamp to move forward. Instead we force the reader take
- // the PBWM lock and retry.
- if (lastAppliedTimestamp) {
+ // If there are pending catalog changes when using a no-overlap read source, we choose to
+ // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
+ // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
+ // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
+ // timestamp) but should not be necessary for correctness. After initial sync finishes, if
+ // we waited instead of retrying, readers would block indefinitely waiting for the
+ noOverlap time to move forward. Instead we force the reader to take the PBWM lock and retry.
+ if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
+ invariant(readTimestamp);
LOGV2(20576,
- "tried reading at last-applied time: {lastAppliedTimestamp} on ns: {nss_ns}, but "
- "future catalog changes are pending at time {minSnapshot}. Trying again without "
- "reading at last-applied time.",
- "lastAppliedTimestamp"_attr = *lastAppliedTimestamp,
- "nss_ns"_attr = nss.ns(),
- "minSnapshot"_attr = *minSnapshot);
+ "Tried reading at no-overlap time, but future catalog changes are pending. "
+ "Trying again without reading at no-overlap time.",
+ "noOverlapTimestamp"_attr = *readTimestamp,
+ "collection"_attr = nss.ns(),
+ "collectionMinSnapshot"_attr = *minSnapshot);
// Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
// previous value. If the previous value is false (because there is another
// shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this
@@ -220,28 +225,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
ErrorCodes::SnapshotUnavailable,
str::stream() << "Unable to read from a snapshot due to pending collection catalog "
"changes; please retry the operation. Snapshot timestamp is "
- << (mySnapshot ? mySnapshot->toString() : "(none)")
- << ". Collection minimum is " << minSnapshot->toString(),
+ << readTimestamp->toString() << ". Collection minimum is "
+ << minSnapshot->toString(),
opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
- // Cannot change ReadSource while a RecoveryUnit is active, which may result from
- // calling getPointInTimeReadTimestamp().
- opCtx->recoveryUnit()->abandonSnapshot();
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
- }
-
- // If there are pending catalog changes when using a no-overlap read source, we choose to
- // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
- // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
- // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
- // timestamp) but should not be necessary for correctness.
- if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
- invariant(!lastAppliedTimestamp); // no-overlap read source selects its own timestamp.
- _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none;
- invariant(opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
-
- // Abandon our snapshot but don't change our read source, so that we can select a new
- // read timestamp on the next loop iteration.
+ // Abandon our snapshot. We may select a new read timestamp or ReadSource in the next
+ // loop iteration.
opCtx->recoveryUnit()->abandonSnapshot();
}
@@ -259,84 +248,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
}
}
-bool AutoGetCollectionForRead::_shouldReadAtLastAppliedTimestamp(
- OperationContext* opCtx,
- const NamespaceString& nss,
- repl::ReadConcernLevel readConcernLevel) const {
-
- // If this block is unset, then the operation did not opt-out of the PBWM lock, implying that it
- // cannot read at lastApplied. It's important to note that it is possible for this to be set,
- // but still be holding the PBWM lock, explained below.
- if (!_shouldNotConflictWithSecondaryBatchApplicationBlock) {
- return false;
- }
-
- // If we are already holding the PBWM lock, do not read at last-applied. This is because once an
- // operation reads without a timestamp (effectively seeing all writes), it is no longer safe to
- // start reading at a timestamp, as writes or catalog operations may appear to vanish.
- // This may occur when multiple collection locks are held concurrently, which is often the case
- // when DBDirectClient is used.
- if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
- LOGV2_DEBUG(20577, 1, "not reading at last-applied because the PBWM lock is held");
- return false;
- }
-
- // Majority and snapshot readConcern levels should not read from lastApplied; these read
- // concerns already have a designated timestamp to read from.
- if (readConcernLevel != repl::ReadConcernLevel::kLocalReadConcern &&
- readConcernLevel != repl::ReadConcernLevel::kAvailableReadConcern) {
- return false;
- }
-
- // If we are in a replication state (like secondary or primary catch-up) where we are not
- // accepting writes, we should read at lastApplied. If this node can accept writes, then no
- // conflicting replication batches are being applied and we can read from the default snapshot.
- if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
- return false;
- }
-
- // Non-replicated collections do not need to read at lastApplied, as those collections are not
- // written by the replication system. However, the oplog is special, as it *is* written by the
- // replication system.
- if (!nss.isReplicated() && !nss.isOplog()) {
- return false;
- }
-
- return true;
-}
-
-bool AutoGetCollectionForRead::_conflictingCatalogChanges(
- OperationContext* opCtx,
- boost::optional<Timestamp> minSnapshot,
- boost::optional<Timestamp> lastAppliedTimestamp) const {
- // This is the timestamp of the most recent catalog changes to this collection. If this is
- // greater than any point in time read timestamps, we should either wait or return an error.
- if (!minSnapshot) {
- return false;
- }
-
- // If we are reading from the lastAppliedTimestamp and it is up-to-date with any catalog
- // changes, we can return.
- if (lastAppliedTimestamp &&
- (lastAppliedTimestamp->isNull() || *lastAppliedTimestamp >= *minSnapshot)) {
- return false;
- }
-
- // This can be set when readConcern is "snapshot" or "majority".
- auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
-
- // If we do not have a point in time to conflict with minSnapshot, return.
- if (!mySnapshot && !lastAppliedTimestamp) {
- return false;
- }
-
- // Return if there are no conflicting catalog changes with mySnapshot.
- if (mySnapshot && *mySnapshot >= *minSnapshot) {
- return false;
- }
-
- return true;
-}
AutoGetCollectionForReadCommand::AutoGetCollectionForReadCommand(
OperationContext* opCtx,
diff --git a/src/mongo/db/db_raii.h b/src/mongo/db/db_raii.h
index edf32cd0010..51208643c6a 100644
--- a/src/mongo/db/db_raii.h
+++ b/src/mongo/db/db_raii.h
@@ -133,19 +133,6 @@ private:
// This field is optional, because the code to wait for majority committed snapshot needs to
// release locks in order to block waiting
boost::optional<AutoGetCollection> _autoColl;
-
- // Returns true if we should read at the last applied timestamp instead of at "no" timestamp
- // (i.e. reading with the "latest" snapshot reflecting all writes). Reading at the last applied
- // timestamp avoids reading in-flux data actively being written by the replication system.
- bool _shouldReadAtLastAppliedTimestamp(OperationContext* opCtx,
- const NamespaceString& nss,
- repl::ReadConcernLevel readConcernLevel) const;
-
- // Returns true if the minSnapshot causes conflicting catalog changes for either the provided
- // lastAppliedTimestamp or the point-in-time snapshot of the RecoveryUnit on 'opCtx'.
- bool _conflictingCatalogChanges(OperationContext* opCtx,
- boost::optional<Timestamp> minSnapshot,
- boost::optional<Timestamp> lastAppliedTimestamp) const;
};
/**
diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp
index 4b9a3bac3c4..5258a51a33b 100644
--- a/src/mongo/db/db_raii_test.cpp
+++ b/src/mongo/db/db_raii_test.cpp
@@ -37,6 +37,8 @@
#include "mongo/db/client.h"
#include "mongo/db/concurrency/lock_state.h"
#include "mongo/db/db_raii.h"
+#include "mongo/db/query/internal_plans.h"
+#include "mongo/db/storage/snapshot_manager.h"
#include "mongo/logv2/log.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/time_support.h"
@@ -199,8 +201,13 @@ TEST_F(DBRAIITestFixture,
ASSERT_OK(
storageInterface()->createCollection(client1.second.get(), nss, defaultCollectionOptions));
ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock
+ // class and does not update the correct state in the SnapshotManager.
repl::OpTime opTime(Timestamp(200, 1), 1);
- replCoord->setMyLastAppliedOpTimeAndWallTime({opTime, Date_t() + Seconds(1)});
+ auto snapshotManager =
+ client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
+ snapshotManager->setLastApplied(opTime.getTimestamp());
Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
@@ -220,18 +227,119 @@ TEST_F(DBRAIITestFixture,
// for the collection. If we now manually set our last applied time to something very early, we
// will be guaranteed to hit the logic that triggers when the minimum snapshot time is greater
// than the read-at time, since we default to reading at last-applied when in SECONDARY state.
+
+ // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock
+ // class and does not update the correct state in the SnapshotManager.
repl::OpTime opTime(Timestamp(2, 1), 1);
- replCoord->setMyLastAppliedOpTimeAndWallTime({opTime, Date_t() + Seconds(1)});
+ auto snapshotManager =
+ client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
+ snapshotManager->setLastApplied(opTime.getTimestamp());
+
Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
+
AutoGetCollectionForRead coll(client2.second.get(), NamespaceString("local.system.js"));
+ // Reading from an unreplicated collection does not change the ReadSource to kNoOverlap.
+ ASSERT_EQ(client2.second.get()->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kUnset);
- // The current code uasserts in this situation, so we confirm that happens here.
+ // Reading from a replicated collection will try to switch to kNoOverlap. Because we are
+ // already reading without a timestamp and we can't reacquire the PBWM lock to continue reading
+ // without a timestamp, we uassert in this situation.
ASSERT_THROWS_CODE(AutoGetCollectionForRead(client2.second.get(), nss),
DBException,
ErrorCodes::SnapshotUnavailable);
}
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
+ // This test simulates a situation where AutoGetCollectionForRead can't read at the no-overlap
+ // point (minimum of all_durable and lastApplied) because it is set to a point earlier than the
+ // catalog change. We expect to read without a timestamp and hold the PBWM lock.
+ auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
+ CollectionOptions defaultCollectionOptions;
+ ASSERT_OK(
+ storageInterface()->createCollection(client1.second.get(), nss, defaultCollectionOptions));
+ ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Note that when the collection was created, above, the system chooses a minimum snapshot time
+ // for the collection. If we now manually set our last applied time to something very early, we
+ // will be guaranteed to hit the logic that triggers when the minimum snapshot time is greater
+ // than the read-at time, since we default to reading at the no-overlap time when in SECONDARY state.
+
+ // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock
+ // class and does not update the correct state in the SnapshotManager.
+ repl::OpTime opTime(Timestamp(2, 1), 1);
+ auto snapshotManager =
+ client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
+ snapshotManager->setLastApplied(opTime.getTimestamp());
+ AutoGetCollectionForRead coll(client1.second.get(), nss);
+
+ // We can't read from kNoOverlap in this scenario because there is a catalog conflict. Resort
+ // to taking the PBWM lock and reading without a timestamp.
+ ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kUnset);
+ ASSERT_TRUE(client1.second.get()->lockState()->isLockHeldForMode(
+ resourceIdParallelBatchWriterMode, MODE_IS));
+}
+
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
+ // This test simulates a situation where AutoGetCollectionForRead reads at the no-overlap
+ // point (minimum of all_durable and lastApplied) even though lastApplied is not available.
+ auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
+ CollectionOptions defaultCollectionOptions;
+ ASSERT_OK(
+ storageInterface()->createCollection(client1.second.get(), nss, defaultCollectionOptions));
+ ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Note that when the collection was created, above, the system chooses a minimum snapshot time
+ // for the collection. Since last-applied isn't available, we default to all_durable, which is
+ // available, and is greater than the collection minimum snapshot.
+ auto snapshotManager =
+ client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
+ ASSERT_FALSE(snapshotManager->getLastApplied());
+ AutoGetCollectionForRead coll(client1.second.get(), nss);
+
+ // Even though lastApplied isn't available, the ReadSource is set to kNoOverlap, which reads
+ // at the all_durable time.
+ ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp());
+ ASSERT_FALSE(client1.second.get()->lockState()->isLockHeldForMode(
+ resourceIdParallelBatchWriterMode, MODE_IS));
+}
+
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) {
+ auto opCtx = client1.second.get();
+ ASSERT_OK(storageInterface()->createCollection(opCtx, nss, {}));
+ ASSERT_OK(
+ repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY));
+ AutoGetCollectionForRead autoColl(opCtx, nss);
+ auto exec = InternalPlanner::collectionScan(opCtx,
+ nss.ns(),
+ autoColl.getCollection(),
+ PlanExecutor::YIELD_MANUAL,
+ InternalPlanner::FORWARD);
+
+ // The collection scan should use the default ReadSource on a secondary.
+ ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap,
+ opCtx->recoveryUnit()->getTimestampReadSource());
+
+ // While yielding the collection scan, simulate stepping-up to a primary.
+ exec->saveState();
+ Locker::LockSnapshot lockSnapshot;
+ ASSERT_TRUE(opCtx->lockState()->saveLockStateAndUnlock(&lockSnapshot));
+ ASSERT_OK(
+ repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_PRIMARY));
+
+ // After restoring, the collection scan should now be reading with kNoOverlap, the default on
+ // secondaries.
+ opCtx->lockState()->restoreLockState(opCtx, lockSnapshot);
+ exec->restoreState();
+ ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap,
+ opCtx->recoveryUnit()->getTimestampReadSource());
+ BSONObj obj;
+ ASSERT_EQUALS(PlanExecutor::IS_EOF, exec->getNext(&obj, nullptr));
+}
} // namespace
} // namespace mongo
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index 839ca031062..16e2a483ca7 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -2037,6 +2037,9 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx,
void IndexBuildsCoordinator::_buildIndex(OperationContext* opCtx,
std::shared_ptr<ReplIndexBuildState> replState,
const IndexBuildOptions& indexBuildOptions) {
+ // Read without a timestamp. When we commit, we block writes which guarantees all writes are
+ // visible.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
_scanCollectionAndInsertKeysIntoSorter(opCtx, replState);
_insertKeysFromSideTablesWithoutBlockingWrites(opCtx, replState);
_signalPrimaryForCommitReadiness(opCtx, replState);
diff --git a/src/mongo/db/repl/oplog_batcher.cpp b/src/mongo/db/repl/oplog_batcher.cpp
index 27653ab21dc..aba27772547 100644
--- a/src/mongo/db/repl/oplog_batcher.cpp
+++ b/src/mongo/db/repl/oplog_batcher.cpp
@@ -363,8 +363,8 @@ std::size_t getBatchLimitOplogBytes(OperationContext* opCtx, StorageInterface* s
// We can't change the timestamp source within a write unit of work.
invariant(!opCtx->lockState()->inAWriteUnitOfWork());
// We're only reading oplog metadata, so the timestamp is not important. If we read with the
- // default (which is kLastApplied on secondaries), we may end up with a reader that is at
- // kLastApplied. If we then roll back, then when we reconstruct prepared transactions during
+ // default (which is lastApplied on secondaries), we may end up with a reader that is at
+ // lastApplied. If we then roll back, then when we reconstruct prepared transactions during
// rollback recovery we will be preparing transactions before the read timestamp, which triggers
// an assertion in WiredTiger.
ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index e5ca3cfe12c..29cc6dd9c5b 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -260,11 +260,11 @@ public:
virtual void updateCommittedSnapshot(const OpTime& newCommitPoint) = 0;
/**
- * Updates the local snapshot to a consistent point for secondary reads.
+ * Updates the lastApplied snapshot to a consistent point for secondary reads.
*
- * It is illegal to call with a optime that does not name an existing snapshot.
+ * It is illegal to call with a non-existent optime.
*/
- virtual void updateLocalSnapshot(const OpTime& optime) = 0;
+ virtual void updateLastAppliedSnapshot(const OpTime& optime) = 0;
/**
* Returns whether or not the SnapshotThread is active.
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 9fa64da2a2e..ce80ab0acac 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -356,7 +356,7 @@ void ReplicationCoordinatorExternalStateImpl::clearAppliedThroughIfCleanShutdown
}
// Ensure that all writes are visible before reading. If we failed mid-batch, it would be
- // possible to read from a kLastApplied ReadSource where not all writes to the minValid document
+ // possible to read from a kNoOverlap ReadSource where not all writes to the minValid document
// are visible, generating a writeConflict that would not resolve.
opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
@@ -972,10 +972,10 @@ void ReplicationCoordinatorExternalStateImpl::updateCommittedSnapshot(
notifyOplogMetadataWaiters(newCommitPoint);
}
-void ReplicationCoordinatorExternalStateImpl::updateLocalSnapshot(const OpTime& optime) {
+void ReplicationCoordinatorExternalStateImpl::updateLastAppliedSnapshot(const OpTime& optime) {
auto manager = _service->getStorageEngine()->getSnapshotManager();
if (manager) {
- manager->setLocalSnapshot(optime.getTimestamp());
+ manager->setLastApplied(optime.getTimestamp());
}
}
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index d1d12e285b3..f38aee76a39 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -104,7 +104,7 @@ public:
virtual bool tooStale();
void dropAllSnapshots() final;
void updateCommittedSnapshot(const OpTime& newCommitPoint) final;
- void updateLocalSnapshot(const OpTime& optime) final;
+ void updateLastAppliedSnapshot(const OpTime& optime) final;
virtual bool snapshotsEnabled() const;
virtual void notifyOplogMetadataWaiters(const OpTime& committedOpTime);
boost::optional<OpTime> getEarliestDropPendingOpTime() const final;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index 84986f783c8..4c9ddbaf7f1 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -250,7 +250,7 @@ void ReplicationCoordinatorExternalStateMock::dropAllSnapshots() {}
void ReplicationCoordinatorExternalStateMock::updateCommittedSnapshot(
const OpTime& newCommitPoint) {}
-void ReplicationCoordinatorExternalStateMock::updateLocalSnapshot(const OpTime& optime) {}
+void ReplicationCoordinatorExternalStateMock::updateLastAppliedSnapshot(const OpTime& optime) {}
bool ReplicationCoordinatorExternalStateMock::snapshotsEnabled() const {
return _areSnapshotsEnabled;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index 0ef9ad2e893..fd867df8ac7 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -93,7 +93,7 @@ public:
virtual bool tooStale();
virtual void dropAllSnapshots();
virtual void updateCommittedSnapshot(const OpTime& newCommitPoint);
- virtual void updateLocalSnapshot(const OpTime& optime);
+ virtual void updateLastAppliedSnapshot(const OpTime& optime);
virtual bool snapshotsEnabled() const;
virtual void notifyOplogMetadataWaiters(const OpTime& committedOpTime);
boost::optional<OpTime> getEarliestDropPendingOpTime() const final;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index ebde2673416..bcae984ce91 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1389,6 +1389,14 @@ void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndWallTime(
// No need to wake up replication waiters because there should not be any replication waiters
// waiting on our own lastApplied.
+ // Update the storage engine's lastApplied snapshot before updating the stable timestamp on the
+ // storage engine. New transactions reading from the lastApplied snapshot should start before
+ // the oldest timestamp is advanced to avoid races. Additionally, update this snapshot before
+ // signaling optime waiters. This avoids a race that would allow optime waiters to open
+ // transactions on stale lastApplied values because they do not hold or reacquire the
+ // replication coordinator mutex when signaled.
+ _externalState->updateLastAppliedSnapshot(opTime);
+
// Signal anyone waiting on optime changes.
_opTimeWaiterList.setValueIf_inlock(
[opTime](const OpTime& waitOpTime, const SharedWaiterHandle& waiter) {
@@ -1396,11 +1404,6 @@ void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndWallTime(
},
opTime);
- // Update the local snapshot before updating the stable timestamp on the storage engine. New
- // transactions reading from the local snapshot should start before the oldest timestamp is
- // advanced to avoid races.
- _externalState->updateLocalSnapshot(opTime);
-
// Notify the oplog waiters after updating the local snapshot.
signalOplogWaiters();
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 82c406dc480..fc680d1a67a 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -130,6 +130,7 @@ public:
_oplogApplicationEndPoint(oplogApplicationEndPoint) {}
void startup(OperationContext* opCtx) final {
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
_client = std::make_unique<DBDirectClient>(opCtx);
BSONObj predicate = _oplogApplicationEndPoint
? BSON("$gte" << _oplogApplicationStartPoint << "$lte" << *_oplogApplicationEndPoint)
diff --git a/src/mongo/db/repl/transaction_oplog_application.cpp b/src/mongo/db/repl/transaction_oplog_application.cpp
index b9f75c07d1a..8608926919c 100644
--- a/src/mongo/db/repl/transaction_oplog_application.cpp
+++ b/src/mongo/db/repl/transaction_oplog_application.cpp
@@ -512,8 +512,8 @@ void reconstructPreparedTransactions(OperationContext* opCtx, repl::OplogApplica
}
// Read the transactions table and the oplog collection without a timestamp.
// The below DBDirectClient read uses AutoGetCollectionForRead which could implicitly change the
- // read source to kLastApplied. So we need to explicitly set the read source to kNoTimestamp to
- // force reads in this scope to be untimestamped.
+ // read source. So we need to explicitly set the read source to kNoTimestamp to force reads in
+ // this scope to be untimestamped.
ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
DBDirectClient client(opCtx);
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index ccd5d048e1d..e32b8ba329a 100644
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -30,6 +30,23 @@ env.Library(
)
env.Library(
+ target='snapshot_helper',
+ source=[
+ 'snapshot_helper.cpp',
+ ],
+ LIBDEPS=[
+ '$BUILD_DIR/mongo/base',
+ '$BUILD_DIR/mongo/db/namespace_string',
+ ],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/concurrency/lock_manager',
+ '$BUILD_DIR/mongo/db/repl/read_concern_args',
+ '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
+ 'recovery_unit_base',
+ ],
+ )
+
+env.Library(
target='duplicate_key_error_info',
source=[
'duplicate_key_error_info.cpp',
diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
index aeef71561ae..baecec3c8af 100644
--- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
+++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
@@ -150,9 +150,9 @@ public:
return itCountOn(op);
}
- int itCountLocal() {
+ int itCountLastApplied() {
auto op = makeOperation();
- op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
+ op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
return itCountOn(op);
}
@@ -176,14 +176,14 @@ public:
return std::string(record->data.data());
}
- boost::optional<Record> readRecordLocal(RecordId id) {
+ boost::optional<Record> readRecordLastApplied(RecordId id) {
auto op = makeOperation();
- op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
+ op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
return readRecordOn(op, id);
}
- std::string readStringLocal(RecordId id) {
- auto record = readRecordLocal(id);
+ std::string readStringLastApplied(RecordId id) {
+ auto record = readRecordLastApplied(id);
ASSERT(record);
return std::string(record->data.data());
}
@@ -360,7 +360,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDelete) {
ASSERT(!readRecordCommitted(id));
}
-TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) {
+TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) {
if (!snapshotManager)
return; // This test is only for engines that DO support SnapshotManagers.
@@ -369,7 +369,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) {
auto id = insertRecordAndCommit();
auto afterInsert = fetchAndIncrementTimestamp();
- // Not reading on the last local timestamp returns the most recent data.
+ // Not reading on the last applied timestamp returns the most recent data.
auto op = makeOperation();
auto ru = op->recoveryUnit();
ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
@@ -379,18 +379,18 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) {
deleteRecordAndCommit(id);
auto afterDelete = fetchAndIncrementTimestamp();
- // Reading at the local snapshot timestamps returns data in order.
- snapshotManager->setLocalSnapshot(beforeInsert);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ // Reading at the last applied snapshot timestamps returns data in order.
+ snapshotManager->setLastApplied(beforeInsert);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
- snapshotManager->setLocalSnapshot(afterInsert);
- ASSERT_EQ(itCountLocal(), 1);
- ASSERT(readRecordLocal(id));
+ snapshotManager->setLastApplied(afterInsert);
+ ASSERT_EQ(itCountLastApplied(), 1);
+ ASSERT(readRecordLastApplied(id));
- snapshotManager->setLocalSnapshot(afterDelete);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ snapshotManager->setLastApplied(afterDelete);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
}
TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) {
@@ -416,20 +416,20 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) {
deleteRecordAndCommit(id);
auto afterDelete = fetchAndIncrementTimestamp();
- snapshotManager->setLocalSnapshot(beforeInsert);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ snapshotManager->setLastApplied(beforeInsert);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
- snapshotManager->setLocalSnapshot(afterInsert);
- ASSERT_EQ(itCountLocal(), 1);
- ASSERT_EQ(readStringLocal(id), "Aardvark");
+ snapshotManager->setLastApplied(afterInsert);
+ ASSERT_EQ(itCountLastApplied(), 1);
+ ASSERT_EQ(readStringLastApplied(id), "Aardvark");
- snapshotManager->setLocalSnapshot(afterUpdate);
- ASSERT_EQ(itCountLocal(), 1);
- ASSERT_EQ(readStringLocal(id), "Blue spotted stingray");
+ snapshotManager->setLastApplied(afterUpdate);
+ ASSERT_EQ(itCountLastApplied(), 1);
+ ASSERT_EQ(readStringLastApplied(id), "Blue spotted stingray");
- snapshotManager->setLocalSnapshot(afterDelete);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ snapshotManager->setLastApplied(afterDelete);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
}
} // namespace mongo
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index f8c70309ae2..eeddfafd624 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -253,7 +253,6 @@ public:
* - when using ReadSource::kNoOverlap, the timestamp chosen by the storage engine.
* - when using ReadSource::kAllDurableSnapshot, the timestamp chosen using the storage
* engine's all_durable timestamp.
- * - when using ReadSource::kLastApplied, the timestamp chosen using the storage engine's last
* applied timestamp. Can return boost::none if no timestamp has been established.
* - when using ReadSource::kMajorityCommitted, the majority committed timestamp chosen by the
* storage engine after a transaction has been opened or after a call to
@@ -399,11 +398,6 @@ public:
*/
kNoOverlap,
/**
- * Read from the last applied timestamp. New transactions start at the most up-to-date
- * timestamp.
- */
- kLastApplied,
- /**
* Read from the all_durable timestamp. New transactions will always read from the same
* timestamp and never advance.
*/
@@ -414,6 +408,24 @@ public:
kProvided
};
+ static std::string toString(ReadSource rs) {
+ switch (rs) {
+ case ReadSource::kUnset:
+ return "kUnset";
+ case ReadSource::kNoTimestamp:
+ return "kNoTimestamp";
+ case ReadSource::kMajorityCommitted:
+ return "kMajorityCommitted";
+ case ReadSource::kNoOverlap:
+ return "kNoOverlap";
+ case ReadSource::kAllDurableSnapshot:
+ return "kAllDurableSnapshot";
+ case ReadSource::kProvided:
+ return "kProvided";
+ }
+ MONGO_UNREACHABLE;
+ }
+
/**
* Sets which timestamp to use for read transactions. If 'provided' is supplied, only kProvided
* is an acceptable input.
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
new file mode 100644
index 00000000000..777868f7830
--- /dev/null
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -0,0 +1,151 @@
+/**
+ * Copyright (C) 2020-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kStorage
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/storage/snapshot_helper.h"
+
+#include "mongo/db/repl/read_concern_args.h"
+#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/logv2/log.h"
+
+namespace mongo {
+namespace SnapshotHelper {
+bool canSwitchReadSource(OperationContext* opCtx) {
+
+ // Most readConcerns have behavior controlled at higher levels. Local and available are the only
+ // ReadConcerns that should consider changing, since they read without a timestamp by default.
+ const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel();
+ if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern ||
+ readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) {
+ return true;
+ }
+
+ return false;
+}
+
+bool shouldReadAtNoOverlap(OperationContext* opCtx,
+ const NamespaceString& nss,
+ std::string* reason) {
+
+ // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot read
+ // at no-overlap. It's important to note that it is possible for this to be false, but still be
+ // holding the PBWM lock, explained below.
+ if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) {
+ *reason = "conflicts with batch application";
+ return false;
+ }
+
+ // If we are already holding the PBWM lock, do not read at no-overlap. Snapshots acquired by an
+ // operation after a yield/restore must see all writes in the pre-yield snapshot. Once a
+ // snapshot is reading without a timestamp, we choose to continue acquiring snapshots without a
+ // timestamp. This is done in lieu of determining a timestamp far enough in the future that's
+ // guaranteed to observe all previous writes. This may occur when multiple collection locks are
+ // held concurrently, which is often the case when DBDirectClient is used.
+ if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
+ *reason = "PBWM lock is held";
+ LOGV2_DEBUG(20577, 1, "not reading at no-overlap because the PBWM lock is held");
+ return false;
+ }
+
+ // If we are in a replication state (like secondary or primary catch-up) where we are not
+ // accepting writes, we should read at no-overlap. If this node can accept writes, then no
+ // conflicting replication batches are being applied and we can read from the default snapshot.
+ if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
+ *reason = "primary";
+ return false;
+ }
+
+ // Non-replicated collections do not need to read at no-overlap, as those collections are not
+ // written by the replication system. However, the oplog is special, as it *is* written by the
+ // replication system.
+ if (!nss.isReplicated() && !nss.isOplog()) {
+ *reason = "unreplicated collection";
+ return false;
+ }
+
+ return true;
+}
+boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
+ const NamespaceString& nss) {
+ const bool canSwitch = canSwitchReadSource(opCtx);
+ if (!canSwitch) {
+ return boost::none;
+ }
+
+ const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
+ std::string reason;
+ const bool readAtNoOverlap = shouldReadAtNoOverlap(opCtx, nss, &reason);
+ if (existing == RecoveryUnit::ReadSource::kUnset) {
+ // Shifting from reading without a timestamp to reading with a timestamp can be dangerous
+ // because writes will appear to vanish. This case is intended for new reads on secondaries
+ // and query yield recovery after state transitions from primary to secondary.
+ if (readAtNoOverlap) {
+ LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kNoOverlap", logAttrs(nss));
+ return RecoveryUnit::ReadSource::kNoOverlap;
+ }
+ } else if (existing == RecoveryUnit::ReadSource::kNoOverlap) {
+ // For some reason, we can no longer read at kNoOverlap.
+ // An operation that yields a timestamped snapshot must restore a snapshot with at least as
+ // large of a timestamp, or with proper consideration of rollback scenarios, no timestamp.
+ // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to
+ // reading without one. More writes will become visible.
+ if (!readAtNoOverlap) {
+ LOGV2_DEBUG(
+ 4452902, 2, "Changing ReadSource to kUnset", logAttrs(nss), "reason"_attr = reason);
+            // This shift to kUnset assumes that callers will not make future attempts to manipulate
+            // their ReadSources after performing reads at an un-timestamped snapshot. The only
+            // exception is callers of this function that may need to change from kUnset to
+ // kNoOverlap in the event of a catalog conflict or query yield.
+ return RecoveryUnit::ReadSource::kUnset;
+ }
+ }
+ return boost::none;
+}
+
+bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
+ boost::optional<Timestamp> readTimestamp) {
+ // This is the timestamp of the most recent catalog changes to this collection. If this is
+ // greater than any point in time read timestamps, we should either wait or return an error.
+ if (!collectionMin) {
+ return false;
+ }
+
+ // If we do not have a point in time to conflict with collectionMin, return.
+ if (!readTimestamp || readTimestamp->isNull()) {
+ return false;
+ }
+
+ // Return if there are no conflicting catalog changes with the readTimestamp.
+ return *collectionMin > readTimestamp;
+}
+} // namespace SnapshotHelper
+} // namespace mongo \ No newline at end of file
diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h
new file mode 100644
index 00000000000..fa8fdd85f24
--- /dev/null
+++ b/src/mongo/db/storage/snapshot_helper.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (C) 2020-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/operation_context.h"
+
+namespace mongo {
+namespace SnapshotHelper {
+// Returns a ReadSource if we should change our current ReadSource. Returns boost::none otherwise.
+boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
+ const NamespaceString& nss);
+
+bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
+ boost::optional<Timestamp> readTimestamp);
+} // namespace SnapshotHelper
+} // namespace mongo
diff --git a/src/mongo/db/storage/snapshot_manager.h b/src/mongo/db/storage/snapshot_manager.h
index d41df1c5013..7839ef179ba 100644
--- a/src/mongo/db/storage/snapshot_manager.h
+++ b/src/mongo/db/storage/snapshot_manager.h
@@ -59,14 +59,14 @@ public:
virtual void setCommittedSnapshot(const Timestamp& timestamp) = 0;
/**
- * Sets the snapshot for the last stable timestamp for reading on secondaries.
+ * Sets the lastApplied timestamp.
*/
- virtual void setLocalSnapshot(const Timestamp& timestamp) = 0;
+ virtual void setLastApplied(const Timestamp& timestamp) = 0;
/**
- * Returns the local snapshot timestamp.
+ * Returns the lastApplied timestamp.
*/
- virtual boost::optional<Timestamp> getLocalSnapshot() = 0;
+ virtual boost::optional<Timestamp> getLastApplied() = 0;
/**
* Drops all snapshots and clears the "committed" snapshot.
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 661b0e009c1..352699a9de8 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -668,6 +668,12 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
setInitialDataTimestamp(_recoveryTimestamp);
setOldestTimestamp(_recoveryTimestamp, false);
setStableTimestamp(_recoveryTimestamp, false);
+
+ _sessionCache->snapshotManager().setLastApplied(_recoveryTimestamp);
+ {
+ stdx::lock_guard<Latch> lk(_highestDurableTimestampMutex);
+ _highestSeenDurableTimestamp = _recoveryTimestamp.asULL();
+ }
}
}
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 8cc45e0201e..f9d9751d464 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -453,7 +453,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
// The following ReadSources can only establish a read timestamp when a transaction is
// opened.
case ReadSource::kNoOverlap:
- case ReadSource::kLastApplied:
case ReadSource::kAllDurableSnapshot:
break;
}
@@ -462,14 +461,14 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
getSession();
switch (_timestampReadSource) {
- case ReadSource::kLastApplied:
- // The lastApplied timestamp is not always available, so it is not possible to invariant
- // that it exists as other ReadSources do.
+ case ReadSource::kNoOverlap:
+ // The lastApplied and allDurable timestamps are not always available if the system has
+ // not accepted writes, so it is not possible to invariant that it exists as other
+ // ReadSources do.
if (!_readAtTimestamp.isNull()) {
return _readAtTimestamp;
}
return boost::none;
- case ReadSource::kNoOverlap:
case ReadSource::kAllDurableSnapshot:
invariant(!_readAtTimestamp.isNull());
return _readAtTimestamp;
@@ -515,17 +514,6 @@ void WiredTigerRecoveryUnit::_txnOpen() {
session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
break;
}
- case ReadSource::kLastApplied: {
- if (_sessionCache->snapshotManager().getLocalSnapshot()) {
- _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot(
- session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
- } else {
- WiredTigerBeginTxnBlock(
- session, _prepareConflictBehavior, _roundUpPreparedTimestamps)
- .done();
- }
- break;
- }
case ReadSource::kNoOverlap: {
_readAtTimestamp = _beginTransactionAtNoOverlapTimestamp(session);
break;
@@ -555,8 +543,9 @@ void WiredTigerRecoveryUnit::_txnOpen() {
LOGV2_DEBUG(22414,
3,
- "WT begin_transaction for snapshot id {getSnapshotId_toNumber}",
- "getSnapshotId_toNumber"_attr = getSnapshotId().toNumber());
+ "WT begin_transaction",
+ "snapshotId"_attr = getSnapshotId().toNumber(),
+ "readSource"_attr = toString(_timestampReadSource));
}
Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESSION* session) {
@@ -578,8 +567,8 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESS
Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) {
- auto lastApplied = _sessionCache->snapshotManager().getLocalSnapshot();
- Timestamp allDurable = Timestamp(_oplogManager->fetchAllDurableValue(session->connection));
+ auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
+ Timestamp allDurable = Timestamp(_sessionCache->getKVEngine()->getAllDurableTimestamp());
// When using timestamps for reads and writes, it's important that readers and writers don't
// overlap with the timestamps they use. In other words, at any point in the system there should
@@ -607,17 +596,34 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSI
// should read afterward.
Timestamp readTimestamp = (lastApplied) ? std::min(*lastApplied, allDurable) : allDurable;
+ if (readTimestamp.isNull()) {
+ // When there is not an all_durable or lastApplied timestamp available, read without a
+ // timestamp. Do not round up the read timestamp to the oldest timestamp.
+
+ // There is a race that allows new transactions to start between the time we check for a
+ // read timestamp and start our transaction, which can temporarily violate the contract of
+ // kNoOverlap. That is, writes will be visible that occur after the all_durable time. This
+ // is only possible for readers that start immediately after an initial sync that did not
+ // replicate any oplog entries. Future transactions will start reading at a timestamp once
+ // timestamped writes have been made.
+ WiredTigerBeginTxnBlock txnOpen(
+ session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
+ LOGV2_DEBUG(4452900, 1, "no read timestamp available for kNoOverlap");
+ txnOpen.done();
+ return readTimestamp;
+ }
+
WiredTigerBeginTxnBlock txnOpen(session,
_prepareConflictBehavior,
_roundUpPreparedTimestamps,
RoundUpReadTimestamp::kRound);
auto status = txnOpen.setReadSnapshot(readTimestamp);
fassert(51066, status);
+ txnOpen.done();
- // We might have rounded to oldest between calling getAllDurable and setReadSnapshot. We need
- // to get the actual read timestamp we used.
+ // We might have rounded to oldest between calling getAllDurable and setReadSnapshot. We
+ // need to get the actual read timestamp we used.
readTimestamp = _getTransactionReadTimestamp(session);
- txnOpen.done();
return readTimestamp;
}
@@ -769,15 +775,14 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource,
boost::optional<Timestamp> provided) {
LOGV2_DEBUG(22416,
3,
- "setting timestamp read source: {static_cast_int_readSource}, provided timestamp: "
- "{provided_provided_none}",
- "static_cast_int_readSource"_attr = static_cast<int>(readSource),
- "provided_provided_none"_attr = ((provided) ? provided->toString() : "none"));
+ "setting timestamp read source",
+ "readSource"_attr = toString(readSource),
+ "provided"_attr = ((provided) ? provided->toString() : "none"));
invariant(!_isActive() || _timestampReadSource == readSource,
str::stream() << "Current state: " << toString(_getState())
<< ". Invalid internal state while setting timestamp read source: "
- << static_cast<int>(readSource) << ", provided timestamp: "
+ << toString(readSource) << ", provided timestamp: "
<< (provided ? provided->toString() : "none"));
invariant(!provided == (readSource != ReadSource::kProvided));
invariant(!(provided && provided->isNull()));
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
index 314a524063e..1f440d319b7 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
@@ -163,11 +163,14 @@ public:
clientAndCtx2 = makeClientAndOpCtx(harnessHelper.get(), "reader");
ru1 = checked_cast<WiredTigerRecoveryUnit*>(clientAndCtx1.second->recoveryUnit());
ru2 = checked_cast<WiredTigerRecoveryUnit*>(clientAndCtx2.second->recoveryUnit());
+ snapshotManager = dynamic_cast<WiredTigerSnapshotManager*>(
+ harnessHelper->getEngine()->getSnapshotManager());
}
std::unique_ptr<WiredTigerRecoveryUnitHarnessHelper> harnessHelper;
ClientAndCtx clientAndCtx1, clientAndCtx2;
WiredTigerRecoveryUnit *ru1, *ru2;
+ WiredTigerSnapshotManager* snapshotManager;
private:
const char* wt_uri = "table:prepare_transaction";
@@ -180,6 +183,86 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, SetReadSource) {
ASSERT_EQ(Timestamp(1, 1), ru1->getPointInTimeReadTimestamp());
}
+TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) {
+ OperationContext* opCtx1 = clientAndCtx1.second.get();
+ std::unique_ptr<RecordStore> rs(harnessHelper->createRecordStore(opCtx1, "a.b"));
+
+ const std::string str = str::stream() << "test";
+ const Timestamp ts1{1, 1};
+ const Timestamp ts2{1, 2};
+ const Timestamp ts3{1, 2};
+
+ RecordId rid1;
+ {
+ WriteUnitOfWork wuow(opCtx1);
+ StatusWith<RecordId> res = rs->insertRecord(opCtx1, str.c_str(), str.size() + 1, ts1);
+ ASSERT_OK(res);
+ wuow.commit();
+ rid1 = res.getValue();
+ snapshotManager->setLastApplied(ts1);
+ }
+
+ // Read without a timestamp. The write should be visible.
+ ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset);
+ RecordData unused;
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+
+ // Read with kNoOverlap. The write should be visible.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+
+ RecordId rid2, rid3;
+ {
+ // Start, but do not commit a transaction with opCtx2. This sets a timestamp at ts2, which
+ // creates a hole. kNoOverlap, which is a function of all_durable, will only be able to read
+ // at the time immediately before.
+ OperationContext* opCtx2 = clientAndCtx2.second.get();
+ WriteUnitOfWork wuow(opCtx2);
+ StatusWith<RecordId> res =
+ rs->insertRecord(opCtx2, str.c_str(), str.size() + 1, Timestamp());
+ ASSERT_OK(opCtx2->recoveryUnit()->setTimestamp(ts2));
+ ASSERT_OK(res);
+
+ // While holding open a transaction with opCtx2, perform an insert at ts3 with opCtx1. This
+ // creates a "hole".
+ {
+ WriteUnitOfWork wuow(opCtx1);
+ StatusWith<RecordId> res = rs->insertRecord(opCtx1, str.c_str(), str.size() + 1, ts3);
+ ASSERT_OK(res);
+ wuow.commit();
+ rid3 = res.getValue();
+ snapshotManager->setLastApplied(ts3);
+ }
+
+ // Read without a timestamp, and we should see the first and third records.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+ ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused));
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused));
+
+ // Now read at kNoOverlap. Since the transaction at ts2 has not committed, all_durable is
+ // held back to ts1. LastApplied has advanced to ts3, but because kNoOverlap is the minimum,
+ // we should only see one record.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+ ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused));
+ ASSERT_FALSE(rs->findRecord(opCtx1, rid3, &unused));
+
+ wuow.commit();
+ rid2 = res.getValue();
+ }
+
+ // Now that the hole has been closed, kNoOverlap should see all 3 records.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid2, &unused));
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused));
+}
+
TEST_F(WiredTigerRecoveryUnitTestFixture, CreateAndCheckForCachePressure) {
int time = 1;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp
index 74988508a4c..efc2d8adfe5 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp
@@ -48,17 +48,17 @@ void WiredTigerSnapshotManager::setCommittedSnapshot(const Timestamp& timestamp)
_committedSnapshot = timestamp;
}
-void WiredTigerSnapshotManager::setLocalSnapshot(const Timestamp& timestamp) {
- stdx::lock_guard<Latch> lock(_localSnapshotMutex);
+void WiredTigerSnapshotManager::setLastApplied(const Timestamp& timestamp) {
+ stdx::lock_guard<Latch> lock(_lastAppliedMutex);
if (timestamp.isNull())
- _localSnapshot = boost::none;
+ _lastApplied = boost::none;
else
- _localSnapshot = timestamp;
+ _lastApplied = timestamp;
}
-boost::optional<Timestamp> WiredTigerSnapshotManager::getLocalSnapshot() {
- stdx::lock_guard<Latch> lock(_localSnapshotMutex);
- return _localSnapshot;
+boost::optional<Timestamp> WiredTigerSnapshotManager::getLastApplied() {
+ stdx::lock_guard<Latch> lock(_lastAppliedMutex);
+ return _lastApplied;
}
void WiredTigerSnapshotManager::dropAllSnapshots() {
@@ -93,23 +93,4 @@ Timestamp WiredTigerSnapshotManager::beginTransactionOnCommittedSnapshot(
return *_committedSnapshot;
}
-Timestamp WiredTigerSnapshotManager::beginTransactionOnLocalSnapshot(
- WT_SESSION* session,
- PrepareConflictBehavior prepareConflictBehavior,
- RoundUpPreparedTimestamps roundUpPreparedTimestamps) const {
- WiredTigerBeginTxnBlock txnOpen(session, prepareConflictBehavior, roundUpPreparedTimestamps);
-
- stdx::lock_guard<Latch> lock(_localSnapshotMutex);
- invariant(_localSnapshot);
- LOGV2_DEBUG(22427,
- 3,
- "begin_transaction on local snapshot {localSnapshot_get}",
- "localSnapshot_get"_attr = _localSnapshot.get().toString());
- auto status = txnOpen.setReadSnapshot(_localSnapshot.get());
- fassert(50775, status);
-
- txnOpen.done();
- return *_localSnapshot;
-}
-
} // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h
index 1726a7d4c2b..b285c694e70 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h
@@ -51,8 +51,8 @@ public:
WiredTigerSnapshotManager() = default;
void setCommittedSnapshot(const Timestamp& timestamp) final;
- void setLocalSnapshot(const Timestamp& timestamp) final;
- boost::optional<Timestamp> getLocalSnapshot() final;
+ void setLastApplied(const Timestamp& timestamp) final;
+ boost::optional<Timestamp> getLastApplied() final;
void dropAllSnapshots() final;
//
@@ -70,16 +70,6 @@ public:
RoundUpPreparedTimestamps roundUpPreparedTimestamps) const;
/**
- * Starts a transaction on the last stable local timestamp, set by setLocalSnapshot.
- *
- * Throws if no local snapshot has been set.
- */
- Timestamp beginTransactionOnLocalSnapshot(
- WT_SESSION* session,
- PrepareConflictBehavior prepareConflictBehavior,
- RoundUpPreparedTimestamps roundUpPreparedTimestamps) const;
-
- /**
* Returns lowest SnapshotName that could possibly be used by a future call to
* beginTransactionOnCommittedSnapshot, or boost::none if there is currently no committed
* snapshot.
@@ -95,9 +85,9 @@ private:
MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_committedSnapshotMutex");
boost::optional<Timestamp> _committedSnapshot;
- // Snapshot to use for reads at a local stable timestamp.
- mutable Mutex _localSnapshotMutex = // Guards _localSnapshot.
- MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_localSnapshotMutex");
- boost::optional<Timestamp> _localSnapshot;
+ // Timestamp to use for reads at the lastApplied timestamp.
+ mutable Mutex _lastAppliedMutex = // Guards _lastApplied.
+ MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_lastAppliedMutex");
+ boost::optional<Timestamp> _lastApplied;
};
} // namespace mongo
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index d148f48d416..fca1968b3e4 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -1525,7 +1525,7 @@ public:
// This test does not run a real ReplicationCoordinator, so must advance the snapshot
// manager manually.
auto storageEngine = cc().getServiceContext()->getStorageEngine();
- storageEngine->getSnapshotManager()->setLocalSnapshot(presentTs);
+ storageEngine->getSnapshotManager()->setLastApplied(presentTs);
const auto beforeTxnTime = _clock->reserveTicks(1);
auto beforeTxnTs = beforeTxnTime.asTimestamp();
@@ -3106,7 +3106,7 @@ public:
// This test does not run a real ReplicationCoordinator, so must advance the snapshot
// manager manually.
auto storageEngine = cc().getServiceContext()->getStorageEngine();
- storageEngine->getSnapshotManager()->setLocalSnapshot(presentTs);
+ storageEngine->getSnapshotManager()->setLastApplied(presentTs);
const auto beforeTxnTime = _clock->reserveTicks(1);
beforeTxnTs = beforeTxnTime.asTimestamp();
commitEntryTs = beforeTxnTime.addTicks(1).asTimestamp();