diff options
author | Louis Williams <louis.williams@mongodb.com> | 2020-09-15 12:43:48 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-09-15 17:10:52 +0000 |
commit | a68bbe9d37f0c576295016919c9e0595c4ec1427 (patch) | |
tree | 2b0fac4bc73bb554ae145cd07409c931c7c0dbc6 /src/mongo/db/storage | |
parent | 2b904bb7d987b1538dce3d1800d97ae73cd791b4 (diff) | |
download | mongo-a68bbe9d37f0c576295016919c9e0595c4ec1427.tar.gz |
SERVER-48452 Internal readers should default to reading without a timestamp
Removes ReadSource::kUnset in favor of kNoTimestamp as the default
Makes the following behavioral changes to AutoGetCollectionForRead:
* Removes special early-return handling for kNoTimestamp
* Only user or DBDirectClient operations are eligible to read at
kLastApplied.
* Operations only read at kLastApplied when in the SECONDARY state, nothing
else. This means most internal operations that use DBDirectClient do not need
to use a ReadSourceScope to ensure they read at kNoTimestamp.
(cherry picked from commit 11c68393df88a6f1ea4855e6ac15e54ca9f9d976)
Diffstat (limited to 'src/mongo/db/storage')
7 files changed, 62 insertions, 41 deletions
diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp index baecec3c8af..26ade4d6cfe 100644 --- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp +++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp @@ -372,7 +372,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) { // Not reading on the last applied timestamp returns the most recent data. auto op = makeOperation(); auto ru = op->recoveryUnit(); - ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_EQ(itCountOn(op), 1); ASSERT(readRecordOn(op, id)); @@ -408,7 +408,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) { // Not reading on the last local timestamp returns the most recent data. auto op = makeOperation(); auto ru = op->recoveryUnit(); - ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_EQ(itCountOn(op), 1); auto record = readRecordOn(op, id); ASSERT_EQ(std::string(record->data.data()), "Blue spotted stingray"); diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index e98452c20f2..96d5ec25f66 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -382,11 +382,7 @@ public: */ enum ReadSource { /** - * Do not read from a timestamp. This is the default. - */ - kUnset, - /** - * Read without a timestamp explicitly. + * Read without a timestamp. This is the default. */ kNoTimestamp, /** @@ -414,8 +410,6 @@ public: static std::string toString(ReadSource rs) { switch (rs) { - case ReadSource::kUnset: - return "kUnset"; case ReadSource::kNoTimestamp: return "kNoTimestamp"; case ReadSource::kMajorityCommitted: @@ -445,7 +439,7 @@ public: boost::optional<Timestamp> provided = boost::none) {} virtual ReadSource getTimestampReadSource() const { - return ReadSource::kUnset; + return ReadSource::kNoTimestamp; }; /** diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp index 57a1c3b99e6..913248d7ad7 100644 --- a/src/mongo/db/storage/snapshot_helper.cpp +++ b/src/mongo/db/storage/snapshot_helper.cpp @@ -38,29 +38,37 @@ #include "mongo/logv2/log.h" namespace mongo { -namespace SnapshotHelper { -bool canSwitchReadSource(OperationContext* opCtx) { - - // Most readConcerns have behavior controlled at higher levels. Local and available are the only - // ReadConcerns that should consider changing, since they read without a timestamp by default. +namespace { +bool canReadAtLastApplied(OperationContext* opCtx) { + // Local and available are the only ReadConcern levels that allow their ReadSource to be + // overridden to read at lastApplied. They read without a timestamp by default, but this check + // allows user secondary reads from conflicting with oplog batch application by reading at a + // consistent point in time. + // Internal operations use DBDirectClient as a loopback to perform local operations, and they + // expect the same level of consistency guarantees as any user operation. For that reason, + // DBDirectClient should be able to change the owning operation's ReadSource in order to serve + // consistent data. const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel(); - if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern || - readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) { + if ((opCtx->getClient()->isFromUserConnection() || opCtx->getClient()->isInDirectClient()) && + (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern || + readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern)) { return true; } - return false; } +} // namespace +namespace SnapshotHelper { bool shouldReadAtLastApplied(OperationContext* opCtx, const NamespaceString& nss, std::string* reason) { - // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change // its ReadSource. It's important to note that it is possible for this to be false, but still be // holding the PBWM lock, explained below. if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) { - *reason = "conflicts with batch application"; + if (reason) { + *reason = "conflicts with batch application"; + } return false; } @@ -71,16 +79,32 @@ bool shouldReadAtLastApplied(OperationContext* opCtx, // guaranteed to observe all previous writes. This may occur when multiple collection locks are // held concurrently, which is often the case when DBDirectClient is used. if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) { - *reason = "PBWM lock is held"; + if (reason) { + *reason = "PBWM lock is held"; + } LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held"); return false; } - // If we are in a replication state (like secondary or primary catch-up) where we are not - // accepting writes, we should read at lastApplied. If this node can accept writes, then no - // conflicting replication batches are being applied and we can read from the default snapshot. + // If this node can accept writes (i.e. primary), then no conflicting replication batches are + // being applied and we can read from the default snapshot. If we are in a replication state + // (like secondary or primary catch-up) where we are not accepting writes, we should read at + // lastApplied. if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) { - *reason = "primary"; + if (reason) { + *reason = "primary"; + } + return false; + } + + // If we are not secondary, then we should not attempt to read at lastApplied because it may not + // be available or valid. Any operations reading outside of the primary or secondary states must + // be internal. We give these operations the benefit of the doubt rather than attempting to read + // at a lastApplied timestamp that is not valid. + if (!repl::ReplicationCoordinator::get(opCtx)->isInPrimaryOrSecondaryState(opCtx)) { + if (reason) { + *reason = "not primary or secondary"; + } return false; } @@ -88,7 +112,9 @@ bool shouldReadAtLastApplied(OperationContext* opCtx, // written by the replication system. However, the oplog is special, as it *is* written by the // replication system. if (!nss.isReplicated() && !nss.isOplog()) { - *reason = "unreplicated collection"; + if (reason) { + *reason = "unreplicated collection"; + } return false; } @@ -96,15 +122,14 @@ bool shouldReadAtLastApplied(OperationContext* opCtx, } boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx, const NamespaceString& nss) { - const bool canSwitch = canSwitchReadSource(opCtx); - if (!canSwitch) { + if (!canReadAtLastApplied(opCtx)) { return boost::none; } const auto existing = opCtx->recoveryUnit()->getTimestampReadSource(); std::string reason; const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason); - if (existing == RecoveryUnit::ReadSource::kUnset) { + if (existing == RecoveryUnit::ReadSource::kNoTimestamp) { // Shifting from reading without a timestamp to reading with a timestamp can be dangerous // because writes will appear to vanish. This case is intended for new reads on secondaries // and query yield recovery after state transitions from primary to secondary. @@ -124,13 +149,14 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC if (!readAtLastApplied) { LOGV2_DEBUG(4452902, 2, - "Changing ReadSource to kUnset", + "Changing ReadSource to kNoTimestamp", "collection"_attr = nss, "reason"_attr = reason); - // their ReadSources after performing reads at an un-timetamped snapshot. The only - // exception is callers of this function that may need to change from kUnset to - // kLastApplied in the event of a catalog conflict or query yield. - return RecoveryUnit::ReadSource::kUnset; + // This shift to kNoTimestamp assumes that callers will not make future attempts to + // manipulate their ReadSources after performing reads at an un-timetamped snapshot. The + // only exception is callers of this function that may need to change from kNoTimestamp + // to kLastApplied in the event of a catalog conflict or query yield. + return RecoveryUnit::ReadSource::kNoTimestamp; } } return boost::none; diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h index fa8fdd85f24..c24dfd16d8c 100644 --- a/src/mongo/db/storage/snapshot_helper.h +++ b/src/mongo/db/storage/snapshot_helper.h @@ -37,6 +37,10 @@ namespace SnapshotHelper { boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx, const NamespaceString& nss); +bool shouldReadAtLastApplied(OperationContext* opCtx, + const NamespaceString& nss, + std::string* reason = nullptr); + bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin, boost::optional<Timestamp> readTimestamp); } // namespace SnapshotHelper diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp index 7ddc76c2cf9..02d2986b736 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp @@ -449,7 +449,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() // transaction to establish a read timestamp, but only for ReadSources that are expected to have // read timestamps. switch (_timestampReadSource) { - case ReadSource::kUnset: case ReadSource::kNoTimestamp: return boost::none; case ReadSource::kMajorityCommitted: @@ -488,7 +487,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() return _readAtTimestamp; // The follow ReadSources returned values in the first switch block. - case ReadSource::kUnset: case ReadSource::kNoTimestamp: case ReadSource::kMajorityCommitted: case ReadSource::kProvided: @@ -511,7 +509,6 @@ void WiredTigerRecoveryUnit::_txnOpen() { WT_SESSION* session = _session->getSession(); switch (_timestampReadSource) { - case ReadSource::kUnset: case ReadSource::kNoTimestamp: { if (_isOplogReader) { _oplogVisibleTs = static_cast<std::int64_t>(_oplogManager->getOplogReadTimestamp()); @@ -828,7 +825,6 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource, "setting timestamp read source", "readSource"_attr = toString(readSource), "provided"_attr = ((provided) ? provided->toString() : "none")); - invariant(!_isActive() || _timestampReadSource == readSource, str::stream() << "Current state: " << toString(_getState()) << ". Invalid internal state while setting timestamp read source: " diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h index c61d80205ac..b3961eacee0 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h @@ -244,7 +244,7 @@ private: bool _isTimestamped = false; // Specifies which external source to use when setting read timestamps on transactions. - ReadSource _timestampReadSource = ReadSource::kUnset; + ReadSource _timestampReadSource = ReadSource::kNoTimestamp; // Commits are assumed ordered. Unordered commits are assumed to always need to reserve a // new optime, and thus always call oplogDiskLocRegister() on the record store. diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp index 1f440d319b7..7171f75fbeb 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp @@ -203,7 +203,8 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) { } // Read without a timestamp. The write should be visible. - ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset); + ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), + RecoveryUnit::ReadSource::kNoTimestamp); RecordData unused; ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); @@ -237,7 +238,7 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) { // Read without a timestamp, and we should see the first and third records. opCtx1->recoveryUnit()->abandonSnapshot(); - opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused)); ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused)); |