summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Williams <louis.williams@mongodb.com>2020-09-10 12:30:21 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-09-10 18:15:18 +0000
commit11c68393df88a6f1ea4855e6ac15e54ca9f9d976 (patch)
tree893c5bb98404d3dabfa35f017e466345b0a44d80
parentce0487190cf69d200028e7f9f4bcd3cefeea805d (diff)
downloadmongo-11c68393df88a6f1ea4855e6ac15e54ca9f9d976.tar.gz
SERVER-48452 Internal readers should default to reading without a timestamp
Removes ReadSource::kUnset in favor of kNoTimestamp as the default Makes the following behavioral changes to AutoGetCollectionForRead: * Removes special early-return handling for kNoTimestamp * Only user or DBDirectClient operations are eligible to read at kLastApplied. * Operations only read at kLastApplied when in the SECONDARY state, nothing else. This means most internal operations that use DBDirectClient do not need to use a ReadSourceScope to ensure they read at kNoTimestamp.
-rw-r--r--src/mongo/db/catalog_raii.h2
-rw-r--r--src/mongo/db/catalog_raii_test.cpp6
-rw-r--r--src/mongo/db/db_raii.cpp35
-rw-r--r--src/mongo/db/db_raii_test.cpp82
-rw-r--r--src/mongo/db/free_mon/free_mon_storage.cpp4
-rw-r--r--src/mongo/db/ftdc/collector.cpp5
-rw-r--r--src/mongo/db/index_build_entry_helpers.cpp3
-rw-r--r--src/mongo/db/index_builds_coordinator.cpp11
-rw-r--r--src/mongo/db/namespace_string.cpp12
-rw-r--r--src/mongo/db/namespace_string.h5
-rw-r--r--src/mongo/db/pipeline/document_source_writer.h2
-rw-r--r--src/mongo/db/repl/bgsync.cpp10
-rw-r--r--src/mongo/db/repl/collection_bulk_loader_impl.cpp2
-rw-r--r--src/mongo/db/repl/oplog_applier_impl.cpp5
-rw-r--r--src/mongo/db/repl/oplog_batcher.cpp16
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_impl.cpp3
-rw-r--r--src/mongo/db/repl/replication_recovery.cpp4
-rw-r--r--src/mongo/db/repl/transaction_oplog_application.cpp14
-rw-r--r--src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp4
-rw-r--r--src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp1
-rw-r--r--src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h2
-rw-r--r--src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp4
-rw-r--r--src/mongo/db/storage/recovery_unit.h10
-rw-r--r--src/mongo/db/storage/snapshot_helper.cpp80
-rw-r--r--src/mongo/db/storage/snapshot_helper.h4
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp4
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h2
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp5
-rw-r--r--src/mongo/db/transaction_participant.cpp5
-rw-r--r--src/mongo/dbtests/querytests.cpp2
-rw-r--r--src/mongo/dbtests/storage_timestamp_tests.cpp12
31 files changed, 247 insertions, 109 deletions
diff --git a/src/mongo/db/catalog_raii.h b/src/mongo/db/catalog_raii.h
index 47444538dd5..367b87e933b 100644
--- a/src/mongo/db/catalog_raii.h
+++ b/src/mongo/db/catalog_raii.h
@@ -291,7 +291,7 @@ private:
class ReadSourceScope {
public:
ReadSourceScope(OperationContext* opCtx,
- RecoveryUnit::ReadSource readSource = RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource readSource,
boost::optional<Timestamp> provided = boost::none);
~ReadSourceScope();
diff --git a/src/mongo/db/catalog_raii_test.cpp b/src/mongo/db/catalog_raii_test.cpp
index cc222301ca0..e767d1f30ca 100644
--- a/src/mongo/db/catalog_raii_test.cpp
+++ b/src/mongo/db/catalog_raii_test.cpp
@@ -230,7 +230,7 @@ public:
}
private:
- ReadSource _source = ReadSource::kUnset;
+ ReadSource _source = ReadSource::kNoTimestamp;
boost::optional<Timestamp> _timestamp;
};
@@ -257,8 +257,8 @@ TEST_F(ReadSourceScopeTest, RestoreReadSource) {
ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kProvided);
ASSERT_EQ(opCtx()->recoveryUnit()->getPointInTimeReadTimestamp(), Timestamp(1, 2));
{
- ReadSourceScope scope(opCtx());
- ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kUnset);
+ ReadSourceScope scope(opCtx(), ReadSource::kNoTimestamp);
+ ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kNoTimestamp);
opCtx()->recoveryUnit()->setTimestampReadSource(ReadSource::kNoOverlap);
ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kNoOverlap);
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index a8329f4641d..22a9181f157 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -90,6 +90,10 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
const NamespaceStringOrUUID& nsOrUUID,
AutoGetCollectionViewMode viewMode,
Date_t deadline) {
+ // The caller was expecting to conflict with batch application before entering this function.
+ // i.e. the caller does not currently have a ShouldNotConflict... block in scope.
+ bool callerWasConflicting = opCtx->lockState()->shouldConflictWithSecondaryBatchApplication();
+
// Don't take the ParallelBatchWriterMode lock when the server parameter is set and our
// storage engine supports snapshot reads.
if (gAllowSecondaryReadsDuringBatchApplication.load() &&
@@ -100,11 +104,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
const auto collectionLockMode = getLockModeForQuery(opCtx, nsOrUUID.nss());
_autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline);
- // If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do
- // not consider changing our ReadSource (e.g. FTDC needs that).
- if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp)
- return;
-
repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel();
@@ -154,6 +153,32 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
<< afterClusterTime->asTimestamp().toString());
}
+ // This assertion protects operations from reading inconsistent data on secondaries when
+ // using the default ReadSource of kNoTimestamp.
+
+ // Reading at lastApplied on secondaries is the safest behavior and is enabled for all user
+ // and DBDirectClient reads using 'local' and 'available' readConcerns. If an internal
+ // operation wishes to read without a timestamp during a batch, a ShouldNotConflict can
+ // suppress this fatal assertion with the following considerations:
+ // * The operation is not reading replicated data in a replication state where batch
+ // application is active OR
+ // * Reading inconsistent, out-of-order data is either inconsequential or required by
+ // the operation.
+
+ // If the caller entered this function expecting to conflict with batch application
+ // (i.e. no ShouldNotConflict block in scope), but they are reading without a timestamp and
+ // not holding the PBWM lock, then there is a possibility that this reader may
+ // unintentionally see inconsistent data during a batch. Certain namespaces are applied
+ // serially in oplog application, and therefore can be safely read without taking the PBWM
+ // lock or reading at a timestamp.
+ if (readSource == RecoveryUnit::ReadSource::kNoTimestamp && callerWasConflicting &&
+ !nss.mustBeAppliedInOwnOplogBatch() &&
+ SnapshotHelper::shouldReadAtLastApplied(opCtx, nss)) {
+ LOGV2_FATAL(4728700,
+ "Reading from replicated collection without read timestamp or PBWM lock",
+ "collection"_attr = nss);
+ }
+
auto minSnapshot = coll->getMinimumVisibleSnapshot();
if (!SnapshotHelper::collectionChangesConflictWithRead(minSnapshot, readTimestamp)) {
return;
diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp
index b101ce91961..eba322c5581 100644
--- a/src/mongo/db/db_raii_test.cpp
+++ b/src/mongo/db/db_raii_test.cpp
@@ -42,6 +42,7 @@
#include "mongo/db/query/internal_plans.h"
#include "mongo/db/storage/snapshot_manager.h"
#include "mongo/logv2/log.h"
+#include "mongo/unittest/death_test.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/time_support.h"
@@ -219,6 +220,8 @@ TEST_F(DBRAIITestFixture,
Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ client2.first->setInDirectClient(true);
AutoGetCollectionForRead coll(client2.second.get(), nss);
}
@@ -239,6 +242,8 @@ TEST_F(DBRAIITestFixture,
Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ client2.first->setInDirectClient(true);
AutoGetCollectionForRead coll(client2.second.get(), nss);
}
@@ -266,10 +271,12 @@ TEST_F(DBRAIITestFixture,
Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ client2.first->setInDirectClient(true);
AutoGetCollectionForRead coll(client2.second.get(), NamespaceString("local.system.js"));
// Reading from an unreplicated collection does not change the ReadSource to kLastApplied.
ASSERT_EQ(client2.second.get()->recoveryUnit()->getTimestampReadSource(),
- RecoveryUnit::ReadSource::kUnset);
+ RecoveryUnit::ReadSource::kNoTimestamp);
// Reading from a replicated collection will try to switch to kLastApplied. Because we are
// already reading without a timestamp and we can't reacquire the PBWM lock to continue reading
@@ -300,12 +307,15 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
auto snapshotManager =
client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
snapshotManager->setLastApplied(opTime.getTimestamp());
+
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ client1.first->setInDirectClient(true);
AutoGetCollectionForRead coll(client1.second.get(), nss);
// We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort
// to taking the PBWM lock and reading without a timestamp.
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
- RecoveryUnit::ReadSource::kUnset);
+ RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_TRUE(client1.second.get()->lockState()->isLockHeldForMode(
resourceIdParallelBatchWriterMode, MODE_IS));
}
@@ -325,6 +335,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
auto snapshotManager =
client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
ASSERT_FALSE(snapshotManager->getLastApplied());
+
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ client1.first->setInDirectClient(true);
AutoGetCollectionForRead coll(client1.second.get(), nss);
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
@@ -334,6 +347,33 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
resourceIdParallelBatchWriterMode, MODE_IS));
}
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadOplogOnSecondary) {
+ // This test simulates a situation where AutoGetCollectionForRead reads at lastApplied on a
+ // secondary.
+ auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
+ ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Ensure the default ReadSource is used.
+ ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kNoTimestamp);
+
+ // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock
+ // class and does not update the correct state in the SnapshotManager.
+ repl::OpTime opTime(Timestamp(2, 1), 1);
+ auto snapshotManager =
+ client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
+ snapshotManager->setLastApplied(opTime.getTimestamp());
+
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ client1.first->setInDirectClient(true);
+ AutoGetCollectionForRead coll(client1.second.get(), NamespaceString::kRsOplogNamespace);
+
+ ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kLastApplied);
+ ASSERT_FALSE(client1.second.get()->lockState()->isLockHeldForMode(
+ resourceIdParallelBatchWriterMode, MODE_IS));
+}
+
TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) {
auto opCtx = client1.second.get();
@@ -342,11 +382,15 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) {
CollectionOptions options;
options.capped = true;
ASSERT_OK(storageInterface()->createCollection(opCtx, nss, options));
+
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ opCtx->getClient()->setInDirectClient(true);
AutoGetCollectionForRead autoColl(opCtx, nss);
auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection());
// The collection scan should use the default ReadSource on a primary.
- ASSERT_EQ(RecoveryUnit::ReadSource::kUnset, opCtx->recoveryUnit()->getTimestampReadSource());
+ ASSERT_EQ(RecoveryUnit::ReadSource::kNoTimestamp,
+ opCtx->recoveryUnit()->getTimestampReadSource());
// When the tailable query recovers from its yield, it should discover that the node is
// secondary and change its read source.
@@ -373,6 +417,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp)
ASSERT_OK(storageInterface()->createCollection(opCtx, nss, options));
ASSERT_OK(
repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Simulate using a DBDirectClient to test this behavior for user reads.
+ opCtx->getClient()->setInDirectClient(true);
AutoGetCollectionForRead autoColl(opCtx, nss);
auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection());
@@ -390,9 +437,36 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp)
// After restoring, the collection scan should now be reading with kUnset, the default on
// primaries.
- ASSERT_EQ(RecoveryUnit::ReadSource::kUnset, opCtx->recoveryUnit()->getTimestampReadSource());
+ ASSERT_EQ(RecoveryUnit::ReadSource::kNoTimestamp,
+ opCtx->recoveryUnit()->getTimestampReadSource());
ASSERT_EQUALS(PlanExecutor::IS_EOF, exec->getNext(&unused, nullptr));
}
+DEATH_TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUnsafe, "Fatal assertion") {
+ auto opCtx = client1.second.get();
+ ASSERT_OK(storageInterface()->createCollection(opCtx, nss, {}));
+
+ ASSERT_OK(
+ repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Non-user read on a replicated collection should fail because we are reading on a secondary
+ // without a timestamp.
+ AutoGetCollectionForRead autoColl(opCtx, nss);
+}
+
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadSafe) {
+ auto opCtx = client1.second.get();
+ ASSERT_OK(storageInterface()->createCollection(opCtx, nss, {}));
+
+ ASSERT_OK(
+ repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY));
+
+ // Non-user read on a replicated collection should not fail because of the ShouldNotConflict
+ // block.
+ ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(opCtx->lockState());
+
+ AutoGetCollectionForRead autoColl(opCtx, nss);
+}
+
} // namespace
} // namespace mongo
diff --git a/src/mongo/db/free_mon/free_mon_storage.cpp b/src/mongo/db/free_mon/free_mon_storage.cpp
index 7c25c6a671c..89be39295e1 100644
--- a/src/mongo/db/free_mon/free_mon_storage.cpp
+++ b/src/mongo/db/free_mon/free_mon_storage.cpp
@@ -57,6 +57,10 @@ boost::optional<FreeMonStorageState> FreeMonStorage::read(OperationContext* opCt
auto storageInterface = repl::StorageInterface::get(opCtx);
+ // Ensure we read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
+
AutoGetCollectionForRead autoRead(opCtx, NamespaceString::kServerConfigurationNamespace);
auto swObj = storageInterface->findById(
diff --git a/src/mongo/db/ftdc/collector.cpp b/src/mongo/db/ftdc/collector.cpp
index 37dd68b136e..11ba9d4d3a4 100644
--- a/src/mongo/db/ftdc/collector.cpp
+++ b/src/mongo/db/ftdc/collector.cpp
@@ -70,8 +70,9 @@ std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(Client* client) {
ShouldNotConflictWithSecondaryBatchApplicationBlock shouldNotConflictBlock(opCtx->lockState());
opCtx->lockState()->skipAcquireTicket();
- // Explicitly start future read transactions without a timestamp.
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ // Ensure future transactions read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
for (auto& collector : _collectors) {
BSONObjBuilder subObjBuilder(builder.subobjStart(collector->name()));
diff --git a/src/mongo/db/index_build_entry_helpers.cpp b/src/mongo/db/index_build_entry_helpers.cpp
index da3f43b29e2..fc689873f6e 100644
--- a/src/mongo/db/index_build_entry_helpers.cpp
+++ b/src/mongo/db/index_build_entry_helpers.cpp
@@ -254,7 +254,8 @@ Status removeIndexBuildEntry(OperationContext* opCtx, UUID indexBuildUUID) {
StatusWith<IndexBuildEntry> getIndexBuildEntry(OperationContext* opCtx, UUID indexBuildUUID) {
// Read the most up to date data.
- ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
AutoGetCollectionForRead autoCollection(opCtx, NamespaceString::kIndexBuildEntryNamespace);
const Collection* collection = autoCollection.getCollection();
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index c8caafc318f..d27dd0848db 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -2553,7 +2553,8 @@ void IndexBuildsCoordinator::_buildIndex(OperationContext* opCtx,
// Read without a timestamp. When we commit, we block writes which guarantees all writes are
// visible.
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
// The collection scan might read with a kMajorityCommitted read source, but will restore
// kNoTimestamp afterwards.
_scanCollectionAndInsertSortedKeysIntoIndex(opCtx, replState);
@@ -2655,7 +2656,7 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesWithoutBlockingWrites(
uassertStatusOK(_indexBuildsManager.drainBackgroundWrites(
opCtx,
replState->buildUUID,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kYield));
}
@@ -2681,7 +2682,7 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites(
uassertStatusOK(_indexBuildsManager.drainBackgroundWrites(
opCtx,
replState->buildUUID,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));
}
@@ -2769,7 +2770,7 @@ IndexBuildsCoordinator::CommitResult IndexBuildsCoordinator::_insertKeysFromSide
uassertStatusOK(_indexBuildsManager.drainBackgroundWrites(
opCtx,
replState->buildUUID,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));
try {
@@ -2916,7 +2917,7 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::_runIndexReb
uassertStatusOK(_indexBuildsManager.drainBackgroundWrites(
opCtx,
replState->buildUUID,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));
uassertStatusOK(
diff --git a/src/mongo/db/namespace_string.cpp b/src/mongo/db/namespace_string.cpp
index 9471aca909c..bee7df5ca40 100644
--- a/src/mongo/db/namespace_string.cpp
+++ b/src/mongo/db/namespace_string.cpp
@@ -144,6 +144,18 @@ bool NamespaceString::isLegalClientSystemNS() const {
return false;
}
+/**
+ * Oplog entries on 'system.views' should also be processed one at a time. View catalog immediately
+ * reflects changes for each oplog entry so we can see inconsistent view catalog if multiple oplog
+ * entries on 'system.views' are being applied out of the original order.
+ *
+ * Process updates to 'admin.system.version' individually as well so the secondary's FCV when
+ * processing each operation matches the primary's when committing that operation.
+ */
+bool NamespaceString::mustBeAppliedInOwnOplogBatch() const {
+ return isSystemDotViews() || isServerConfigurationCollection() || isPrivilegeCollection();
+}
+
NamespaceString NamespaceString::makeListCollectionsNSS(StringData dbName) {
NamespaceString nss(dbName, listCollectionsCursorCol);
dassert(nss.isValid());
diff --git a/src/mongo/db/namespace_string.h b/src/mongo/db/namespace_string.h
index a43406f8bd4..e5de9877c84 100644
--- a/src/mongo/db/namespace_string.h
+++ b/src/mongo/db/namespace_string.h
@@ -338,6 +338,11 @@ public:
bool isDropPendingNamespace() const;
/**
+ * Returns true if operations on this namespace must be applied in their own oplog batch.
+ */
+ bool mustBeAppliedInOwnOplogBatch() const;
+
+ /**
* Returns the drop-pending namespace name for this namespace, provided the given optime.
*
* Example:
diff --git a/src/mongo/db/pipeline/document_source_writer.h b/src/mongo/db/pipeline/document_source_writer.h
index 9c175890ecf..b91c49a90db 100644
--- a/src/mongo/db/pipeline/document_source_writer.h
+++ b/src/mongo/db/pipeline/document_source_writer.h
@@ -65,7 +65,7 @@ public:
}
repl::ReadConcernArgs::get(_opCtx) = repl::ReadConcernArgs();
- _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::kUnset);
+ _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
}
~DocumentSourceWriteBlock() {
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 77daa595256..a9c8b47e61a 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -700,8 +700,9 @@ void BackgroundSync::_runRollback(OperationContext* opCtx,
ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(opCtx->lockState());
- // Explicitly start future read transactions without a timestamp.
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ // Ensure future transactions read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
// Rollback is a synchronous operation that uses the task executor and may not be
// executed inside the fetcher callback.
@@ -878,8 +879,9 @@ void BackgroundSync::start(OperationContext* opCtx) {
OpTime lastAppliedOpTime;
ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(opCtx->lockState());
- // Explicitly start future read transactions without a timestamp.
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ // Ensure future transactions read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
do {
lastAppliedOpTime = _readLastAppliedOpTime(opCtx);
diff --git a/src/mongo/db/repl/collection_bulk_loader_impl.cpp b/src/mongo/db/repl/collection_bulk_loader_impl.cpp
index eab00297cdd..23fce736413 100644
--- a/src/mongo/db/repl/collection_bulk_loader_impl.cpp
+++ b/src/mongo/db/repl/collection_bulk_loader_impl.cpp
@@ -278,7 +278,7 @@ Status CollectionBulkLoaderImpl::commit() {
status = _idIndexBlock->drainBackgroundWrites(
_opCtx.get(),
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
_nss.isSystemDotViews() ? IndexBuildInterceptor::DrainYieldPolicy::kNoYield
: IndexBuildInterceptor::DrainYieldPolicy::kYield);
if (!status.isOK()) {
diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp
index 24ff5ad96d6..f769fd14c6d 100644
--- a/src/mongo/db/repl/oplog_applier_impl.cpp
+++ b/src/mongo/db/repl/oplog_applier_impl.cpp
@@ -779,8 +779,9 @@ Status OplogApplierImpl::applyOplogBatchPerWorker(OperationContext* opCtx,
// destroyed by unstash in its destructor. Thus we set the flag explicitly.
opCtx->lockState()->setShouldConflictWithSecondaryBatchApplication(false);
- // Explicitly start future read transactions without a timestamp.
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ // Ensure future transactions read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
// When querying indexes, we return the record matching the key if it exists, or an adjacent
// document. This means that it is possible for us to hit a prepare conflict if we query for an
diff --git a/src/mongo/db/repl/oplog_batcher.cpp b/src/mongo/db/repl/oplog_batcher.cpp
index 99f7077519d..efd257d26d8 100644
--- a/src/mongo/db/repl/oplog_batcher.cpp
+++ b/src/mongo/db/repl/oplog_batcher.cpp
@@ -121,13 +121,6 @@ bool isUnpreparedCommit(const OplogEntry& entry) {
* the final oplog entry in the transaction is processed individually, since the operations are not
* actually run until the commit operation is reached.
*
- * Oplog entries on 'system.views' should also be processed one at a time. View catalog immediately
- * reflects changes for each oplog entry so we can see inconsistent view catalog if multiple oplog
- * entries on 'system.views' are being applied out of the original order.
- *
- * Process updates to 'admin.system.version' individually as well so the secondary's FCV when
- * processing each operation matches the primary's when committing that operation.
- *
* The ends of large transactions (> 16MB) should also be processed immediately on its own in order
* to avoid scenarios where parts of the transaction is batched with other operations not in the
* transaction.
@@ -143,8 +136,7 @@ bool OplogBatcher::mustProcessIndividually(const OplogEntry& entry) {
}
const auto nss = entry.getNss();
- return nss.isSystemDotViews() || nss.isServerConfigurationCollection() ||
- nss.isPrivilegeCollection();
+ return nss.mustBeAppliedInOwnOplogBatch();
}
std::size_t OplogBatcher::getOpCount(const OplogEntry& entry) {
@@ -355,12 +347,6 @@ std::size_t getBatchLimitOplogEntries() {
std::size_t getBatchLimitOplogBytes(OperationContext* opCtx, StorageInterface* storageInterface) {
// We can't change the timestamp source within a write unit of work.
invariant(!opCtx->lockState()->inAWriteUnitOfWork());
- // We're only reading oplog metadata, so the timestamp is not important. If we read with the
- // default (which is lastApplied on secondaries), we may end up with a reader that is at
- // lastApplied. If we then roll back, then when we reconstruct prepared transactions during
- // rollback recovery we will be preparing transactions before the read timestamp, which triggers
- // an assertion in WiredTiger.
- ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
auto oplogMaxSizeResult = storageInterface->getOplogMaxSize(opCtx);
auto oplogMaxSize = fassert(40301, oplogMaxSizeResult);
return std::min(oplogMaxSize / 10, std::size_t(replBatchLimitBytes.load()));
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 3cfe7be562b..3f03ceb9d28 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -357,7 +357,8 @@ void ReplicationCoordinatorExternalStateImpl::clearAppliedThroughIfCleanShutdown
// Ensure that all writes are visible before reading. If we failed mid-batch, it would be
// possible to read from a kNoOverlap ReadSource where not all writes to the minValid document
// are visible, generating a writeConflict that would not resolve.
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx);
if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() &&
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index bba59beb626..c0c242421f9 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -131,7 +131,9 @@ public:
_oplogApplicationEndPoint(oplogApplicationEndPoint) {}
void startup(OperationContext* opCtx) final {
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ invariant(opCtx->recoveryUnit()->getTimestampReadSource() ==
+ RecoveryUnit::ReadSource::kNoTimestamp);
+
_client = std::make_unique<DBDirectClient>(opCtx);
BSONObj predicate = _oplogApplicationEndPoint
? BSON("$gte" << _oplogApplicationStartPoint << "$lte" << *_oplogApplicationEndPoint)
diff --git a/src/mongo/db/repl/transaction_oplog_application.cpp b/src/mongo/db/repl/transaction_oplog_application.cpp
index 0c7a1f0727b..67fb840de64 100644
--- a/src/mongo/db/repl/transaction_oplog_application.cpp
+++ b/src/mongo/db/repl/transaction_oplog_application.cpp
@@ -262,8 +262,9 @@ std::pair<std::vector<OplogEntry>, bool> _readTransactionOperationsFromOplogChai
const std::vector<OplogEntry*>& cachedOps,
const bool checkForCommands) noexcept {
bool isTransactionWithCommand = false;
- // Traverse the oplog chain with its own snapshot and read timestamp.
- ReadSourceScope readSourceScope(opCtx);
+ // Ensure future transactions read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
std::vector<OplogEntry> ops;
@@ -538,11 +539,10 @@ void reconstructPreparedTransactions(OperationContext* opCtx, repl::OplogApplica
LOGV2(21848, "Hit skipReconstructPreparedTransactions failpoint");
return;
}
- // Read the transactions table and the oplog collection without a timestamp.
- // The below DBDirectClient read uses AutoGetCollectionForRead which could implicitly change the
- // read source. So we need to explicitly set the read source to kNoTimestamp to force reads in
- // this scope to be untimestamped.
- ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
+
+ // Ensure future transactions read without a timestamp.
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp ==
+ opCtx->recoveryUnit()->getTimestampReadSource());
DBDirectClient client(opCtx);
const auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace,
diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp
index fcf49f74442..e249daed751 100644
--- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp
+++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp
@@ -359,7 +359,7 @@ TEST_F(EphemeralForTestKVEngineTest, ReadOlderSnapshotsSimple) {
ASSERT(!rs->findRecord(&opCtx, loc2, &rd));
opCtx.recoveryUnit()->abandonSnapshot();
- opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT(rs->findRecord(&opCtx, loc1, &rd));
ASSERT(rs->findRecord(&opCtx, loc2, &rd));
}
@@ -452,7 +452,7 @@ TEST_F(EphemeralForTestKVEngineTest, SetReadTimestampBehindOldestTimestamp) {
ASSERT_THROWS_CODE(rs->findRecord(&opCtx, loc2, &rd), DBException, ErrorCodes::SnapshotTooOld);
opCtx.recoveryUnit()->abandonSnapshot();
- opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT(rs->findRecord(&opCtx, loc1, &rd));
ASSERT(rs->findRecord(&opCtx, loc2, &rd));
}
diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp
index 5b2e77e6292..44d73995482 100644
--- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp
+++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp
@@ -119,7 +119,6 @@ bool RecoveryUnit::forkIfNeeded() {
boost::optional<Timestamp> readFrom = boost::none;
switch (_timestampReadSource) {
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp:
case ReadSource::kMajorityCommitted:
case ReadSource::kNoOverlap:
diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h
index 0e0afbb1a13..c31d0d54d86 100644
--- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h
+++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h
@@ -131,7 +131,7 @@ private:
Timestamp _commitTimestamp = Timestamp::min();
// Specifies which external source to use when setting read timestamps on transactions.
- ReadSource _timestampReadSource = ReadSource::kUnset;
+ ReadSource _timestampReadSource = ReadSource::kNoTimestamp;
boost::optional<Timestamp> _readAtTimestamp = boost::none;
};
diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
index 57bf3bf714d..1e928738d57 100644
--- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
+++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
@@ -372,7 +372,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) {
// Not reading on the last applied timestamp returns the most recent data.
auto op = makeOperation();
auto ru = op->recoveryUnit();
- ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_EQ(itCountOn(op), 1);
ASSERT(readRecordOn(op, id));
@@ -408,7 +408,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) {
// Not reading on the last local timestamp returns the most recent data.
auto op = makeOperation();
auto ru = op->recoveryUnit();
- ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_EQ(itCountOn(op), 1);
auto record = readRecordOn(op, id);
ASSERT_EQ(std::string(record->data.data()), "Blue spotted stingray");
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index 5c8be96b528..2057f8854b3 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -392,11 +392,7 @@ public:
*/
enum ReadSource {
/**
- * Do not read from a timestamp. This is the default.
- */
- kUnset,
- /**
- * Read without a timestamp explicitly.
+ * Read without a timestamp. This is the default.
*/
kNoTimestamp,
/**
@@ -424,8 +420,6 @@ public:
static std::string toString(ReadSource rs) {
switch (rs) {
- case ReadSource::kUnset:
- return "kUnset";
case ReadSource::kNoTimestamp:
return "kNoTimestamp";
case ReadSource::kMajorityCommitted:
@@ -455,7 +449,7 @@ public:
boost::optional<Timestamp> provided = boost::none) {}
virtual ReadSource getTimestampReadSource() const {
- return ReadSource::kUnset;
+ return ReadSource::kNoTimestamp;
};
/**
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
index 5acbcd3a513..84af208d391 100644
--- a/src/mongo/db/storage/snapshot_helper.cpp
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -38,29 +38,37 @@
#include "mongo/logv2/log.h"
namespace mongo {
-namespace SnapshotHelper {
-bool canSwitchReadSource(OperationContext* opCtx) {
-
- // Most readConcerns have behavior controlled at higher levels. Local and available are the only
- // ReadConcerns that should consider changing, since they read without a timestamp by default.
+namespace {
+bool canReadAtLastApplied(OperationContext* opCtx) {
+ // Local and available are the only ReadConcern levels that allow their ReadSource to be
+ // overridden to read at lastApplied. They read without a timestamp by default, but this check
+ // allows user secondary reads from conflicting with oplog batch application by reading at a
+ // consistent point in time.
+ // Internal operations use DBDirectClient as a loopback to perform local operations, and they
+ // expect the same level of consistency guarantees as any user operation. For that reason,
+ // DBDirectClient should be able to change the owning operation's ReadSource in order to serve
+ // consistent data.
const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel();
- if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern ||
- readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) {
+ if ((opCtx->getClient()->isFromUserConnection() || opCtx->getClient()->isInDirectClient()) &&
+ (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern ||
+ readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern)) {
return true;
}
-
return false;
}
+} // namespace
+namespace SnapshotHelper {
bool shouldReadAtLastApplied(OperationContext* opCtx,
const NamespaceString& nss,
std::string* reason) {
-
// If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change
// its ReadSource. It's important to note that it is possible for this to be false, but still be
// holding the PBWM lock, explained below.
if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) {
- *reason = "conflicts with batch application";
+ if (reason) {
+ *reason = "conflicts with batch application";
+ }
return false;
}
@@ -71,16 +79,32 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
// guaranteed to observe all previous writes. This may occur when multiple collection locks are
// held concurrently, which is often the case when DBDirectClient is used.
if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
- *reason = "PBWM lock is held";
+ if (reason) {
+ *reason = "PBWM lock is held";
+ }
LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held");
return false;
}
- // If we are in a replication state (like secondary or primary catch-up) where we are not
- // accepting writes, we should read at lastApplied. If this node can accept writes, then no
- // conflicting replication batches are being applied and we can read from the default snapshot.
+ // If this node can accept writes (i.e. primary), then no conflicting replication batches are
+ // being applied and we can read from the default snapshot. If we are in a replication state
+ // (like secondary or primary catch-up) where we are not accepting writes, we should read at
+ // lastApplied.
if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
- *reason = "primary";
+ if (reason) {
+ *reason = "primary";
+ }
+ return false;
+ }
+
+ // If we are not secondary, then we should not attempt to read at lastApplied because it may not
+ // be available or valid. Any operations reading outside of the primary or secondary states must
+ // be internal. We give these operations the benefit of the doubt rather than attempting to read
+ // at a lastApplied timestamp that is not valid.
+ if (!repl::ReplicationCoordinator::get(opCtx)->isInPrimaryOrSecondaryState(opCtx)) {
+ if (reason) {
+ *reason = "not primary or secondary";
+ }
return false;
}
@@ -88,7 +112,9 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
// written by the replication system. However, the oplog is special, as it *is* written by the
// replication system.
if (!nss.isReplicated() && !nss.isOplog()) {
- *reason = "unreplicated collection";
+ if (reason) {
+ *reason = "unreplicated collection";
+ }
return false;
}
@@ -96,15 +122,14 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
}
boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
const NamespaceString& nss) {
- const bool canSwitch = canSwitchReadSource(opCtx);
- if (!canSwitch) {
+ if (!canReadAtLastApplied(opCtx)) {
return boost::none;
}
const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
std::string reason;
const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason);
- if (existing == RecoveryUnit::ReadSource::kUnset) {
+ if (existing == RecoveryUnit::ReadSource::kNoTimestamp) {
// Shifting from reading without a timestamp to reading with a timestamp can be dangerous
// because writes will appear to vanish. This case is intended for new reads on secondaries
// and query yield recovery after state transitions from primary to secondary.
@@ -122,13 +147,16 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC
// Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to
// reading without one. More writes will become visible.
if (!readAtLastApplied) {
- LOGV2_DEBUG(
- 4452902, 2, "Changing ReadSource to kUnset", logAttrs(nss), "reason"_attr = reason);
- // This shift to kUnset assumes that callers will not make future attempts to manipulate
- // their ReadSources after performing reads at an un-timetamped snapshot. The only
- // exception is callers of this function that may need to change from kUnset to
- // kLastApplied in the event of a catalog conflict or query yield.
- return RecoveryUnit::ReadSource::kUnset;
+ LOGV2_DEBUG(4452902,
+ 2,
+ "Changing ReadSource to kNoTimestamp",
+ logAttrs(nss),
+ "reason"_attr = reason);
+ // This shift to kNoTimestamp assumes that callers will not make future attempts to
+ // manipulate their ReadSources after performing reads at an un-timetamped snapshot. The
+ // only exception is callers of this function that may need to change from kNoTimestamp
+ // to kLastApplied in the event of a catalog conflict or query yield.
+ return RecoveryUnit::ReadSource::kNoTimestamp;
}
}
return boost::none;
diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h
index fa8fdd85f24..c24dfd16d8c 100644
--- a/src/mongo/db/storage/snapshot_helper.h
+++ b/src/mongo/db/storage/snapshot_helper.h
@@ -37,6 +37,10 @@ namespace SnapshotHelper {
boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
const NamespaceString& nss);
+bool shouldReadAtLastApplied(OperationContext* opCtx,
+ const NamespaceString& nss,
+ std::string* reason = nullptr);
+
bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
boost::optional<Timestamp> readTimestamp);
} // namespace SnapshotHelper
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 1167fd673f3..b3cc4c6dde7 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -445,7 +445,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
// transaction to establish a read timestamp, but only for ReadSources that are expected to have
// read timestamps.
switch (_timestampReadSource) {
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp:
return boost::none;
case ReadSource::kMajorityCommitted:
@@ -484,7 +483,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
return _readAtTimestamp;
// The follow ReadSources returned values in the first switch block.
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp:
case ReadSource::kMajorityCommitted:
case ReadSource::kProvided:
@@ -507,7 +505,6 @@ void WiredTigerRecoveryUnit::_txnOpen() {
WT_SESSION* session = _session->getSession();
switch (_timestampReadSource) {
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp: {
if (_isOplogReader) {
_oplogVisibleTs = static_cast<std::int64_t>(_oplogManager->getOplogReadTimestamp());
@@ -827,7 +824,6 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource,
"setting timestamp read source",
"readSource"_attr = toString(readSource),
"provided"_attr = ((provided) ? provided->toString() : "none"));
-
invariant(!_isActive() || _timestampReadSource == readSource,
str::stream() << "Current state: " << toString(_getState())
<< ". Invalid internal state while setting timestamp read source: "
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 312a46f5c09..0d557fc6329 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -250,7 +250,7 @@ private:
bool _isTimestamped = false;
// Specifies which external source to use when setting read timestamps on transactions.
- ReadSource _timestampReadSource = ReadSource::kUnset;
+ ReadSource _timestampReadSource = ReadSource::kNoTimestamp;
// Commits are assumed ordered. Unordered commits are assumed to always need to reserve a
// new optime, and thus always call oplogDiskLocRegister() on the record store.
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
index b50d4b79889..740672e7a2c 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
@@ -203,7 +203,8 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) {
}
// Read without a timestamp. The write should be visible.
- ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset);
+ ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kNoTimestamp);
RecordData unused;
ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
@@ -237,7 +238,7 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) {
// Read without a timestamp, and we should see the first and third records.
opCtx1->recoveryUnit()->abandonSnapshot();
- opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused));
ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused));
diff --git a/src/mongo/db/transaction_participant.cpp b/src/mongo/db/transaction_participant.cpp
index 742bfd087b4..de5874ae3a6 100644
--- a/src/mongo/db/transaction_participant.cpp
+++ b/src/mongo/db/transaction_participant.cpp
@@ -124,8 +124,9 @@ struct ActiveTransactionHistory {
ActiveTransactionHistory fetchActiveTransactionHistory(OperationContext* opCtx,
const LogicalSessionId& lsid) {
- // Restore the current timestamp read source after fetching transaction history.
- ReadSourceScope readSourceScope(opCtx);
+ // Restore the current timestamp read source after fetching transaction history using
+ // DBDirectClient, which may change our ReadSource.
+ ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp);
ActiveTransactionHistory result;
diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp
index 84d533e2069..022dfb970ed 100644
--- a/src/mongo/dbtests/querytests.cpp
+++ b/src/mongo/dbtests/querytests.cpp
@@ -117,7 +117,7 @@ protected:
uassertStatusOK(indexer.insertAllDocumentsInCollection(&_opCtx, _collection));
uassertStatusOK(
indexer.drainBackgroundWrites(&_opCtx,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));
uassertStatusOK(indexer.checkConstraints(&_opCtx));
{
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index 750c8ac447d..d270e0467d7 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -103,7 +103,7 @@ public:
OneOffRead(OperationContext* opCtx, const Timestamp& ts) : _opCtx(opCtx) {
_opCtx->recoveryUnit()->abandonSnapshot();
if (ts.isNull()) {
- _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
} else {
_opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided, ts);
}
@@ -111,7 +111,7 @@ public:
~OneOffRead() {
_opCtx->recoveryUnit()->abandonSnapshot();
- _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
}
private:
@@ -234,7 +234,7 @@ public:
*/
void reset(NamespaceString nss) const {
::mongo::writeConflictRetry(_opCtx, "deleteAll", nss.ns(), [&] {
- _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
AutoGetCollection collRaii(_opCtx, nss, LockMode::MODE_X);
if (collRaii) {
@@ -2057,7 +2057,7 @@ public:
firstInsert.asTimestamp());
ASSERT_OK(indexer.drainBackgroundWrites(_opCtx,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));
auto indexCatalog = autoColl.getCollection()->getIndexCatalog();
@@ -2100,7 +2100,7 @@ public:
setReplCoordAppliedOpTime(repl::OpTime(afterSecondInsert.asTimestamp(), presentTerm));
ASSERT_OK(indexer.drainBackgroundWrites(_opCtx,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));
{
@@ -2843,7 +2843,7 @@ public:
ASSERT_FALSE(buildingIndex->indexBuildInterceptor()->areAllWritesApplied(_opCtx));
ASSERT_OK(indexer.drainBackgroundWrites(_opCtx,
- RecoveryUnit::ReadSource::kUnset,
+ RecoveryUnit::ReadSource::kNoTimestamp,
IndexBuildInterceptor::DrainYieldPolicy::kNoYield));