diff options
author | Louis Williams <louis.williams@mongodb.com> | 2020-06-05 11:41:22 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-06-05 19:16:38 +0000 |
commit | 4e3f3b96826772baa777b7563b38574c1e11c2e4 (patch) | |
tree | 35b660384f5b6bcc918f66c7cfca9e1b43302e3d | |
parent | c1bc1b6d6b7b7d216b8243a609f1c7231045e5be (diff) | |
download | mongo-4e3f3b96826772baa777b7563b38574c1e11c2e4.tar.gz |
SERVER-48475 Reimplement lastApplied for secondary reads
This partially reverts work to use the kNoOverlap ReadSource on
secondaries since the all_durable calculation is unnecessary and
expensive.
(cherry picked from commit ff92d4435fa75c2b947c49cffaf48805b320a5ae)
-rw-r--r-- | src/mongo/db/db_raii.cpp | 41 | ||||
-rw-r--r-- | src/mongo/db/db_raii_test.cpp | 34 | ||||
-rw-r--r-- | src/mongo/db/storage/recovery_unit.h | 6 | ||||
-rw-r--r-- | src/mongo/db/storage/snapshot_helper.cpp | 42 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp | 38 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h | 6 |
6 files changed, 106 insertions, 61 deletions
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp index 6064e3b9b6c..a1b684d94c0 100644 --- a/src/mongo/db/db_raii.cpp +++ b/src/mongo/db/db_raii.cpp @@ -113,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, _autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline); // If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do - // not consider reading at the no-overlap point (e.g. FTDC needs that). + // not consider changing our ReadSource (e.g. FTDC needs that). if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp) return; @@ -129,12 +129,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, // during this period, we default to not acquiring the lock (by setting // _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a // consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to - // guarantee we read at a consistent state, so we must read at the no-overlap timestamp, - // which is a function of the lastApplied timestamp, which is set after each complete batch. + // guarantee we read at a consistent state, so we must read at the lastApplied timestamp, + // which is set after each complete batch. // - // If an attempt to read at the no-overlap timestamp is unsuccessful because there are - // pending catalog changes that occur after the no-overlap timestamp, we release our locks - // and try again with the PBWM lock (by unsetting + // If an attempt to read at the lastApplied timestamp is unsuccessful because there are + // pending catalog changes that occur after that timestamp, we release our locks and try + // again with the PBWM lock (by unsetting // _shouldNotConflictWithSecondaryBatchApplicationBlock). 
const NamespaceString nss = coll->ns(); @@ -165,29 +165,32 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, } invariant( - // The kMajorityCommitted and kNoOverlap read sources already read from timestamps + // The kMajorityCommitted and kLastApplied read sources already read from timestamps // that are safe with respect to concurrent secondary batch application, and are // eligible for retrying. readSource == RecoveryUnit::ReadSource::kMajorityCommitted || - readSource == RecoveryUnit::ReadSource::kNoOverlap); + readSource == RecoveryUnit::ReadSource::kNoOverlap || + readSource == RecoveryUnit::ReadSource::kLastApplied); invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern); // Yield locks in order to do the blocking call below. _autoColl = boost::none; - // If there are pending catalog changes when using a no-overlap read source, we choose to - // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly - // spinning in this loop trying to get a new read timestamp ahead of the minimum visible - // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read - // timestamp) but should not be necessary for correctness. After initial sync finishes, if - // we waited instead of retrying, readers would block indefinitely waiting for the - // noOverlap time to move forward. Instead we force the reader take the PBWM lock and retry. - if (readSource == RecoveryUnit::ReadSource::kNoOverlap) { + // If there are pending catalog changes when using a no-overlap or lastApplied read source, + // we choose to take the PBWM lock to conflict with any in-progress batches. This prevents + // us from idly spinning in this loop trying to get a new read timestamp ahead of the + // minimum visible snapshot. This helps us guarantee liveness (i.e. we can eventually get a + // suitable read timestamp) but should not be necessary for correctness. 
After initial sync + // finishes, if we waited instead of retrying, readers would block indefinitely waiting for + // their read timestamp to move forward. Instead we force the reader to take the PBWM lock + // and retry. + if (readSource == RecoveryUnit::ReadSource::kLastApplied || + readSource == RecoveryUnit::ReadSource::kNoOverlap) { invariant(readTimestamp); LOGV2(20576, - "Tried reading at no-overlap time, but future catalog changes are pending. " - "Trying again without reading at no-overlap time.", - "noOverlapTimestamp"_attr = *readTimestamp, + "Tried reading at a timestamp, but future catalog changes are pending. " + "Trying again without reading at a timestamp", + "readTimestamp"_attr = *readTimestamp, "collection"_attr = nss.ns(), "collectionMinSnapshot"_attr = *minSnapshot); // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp index 460d4acec10..77874604c86 100644 --- a/src/mongo/db/db_raii_test.cpp +++ b/src/mongo/db/db_raii_test.cpp @@ -267,11 +267,11 @@ TEST_F(DBRAIITestFixture, ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX)); AutoGetCollectionForRead coll(client2.second.get(), NamespaceString("local.system.js")); - // Reading from an unreplicated collection does not change the ReadSource to kNoOverlap. + // Reading from an unreplicated collection does not change the ReadSource to kLastApplied. ASSERT_EQ(client2.second.get()->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset); - // Reading from a replicated collection will try to switch to kNoOverlap. Because we are + // Reading from a replicated collection will try to switch to kLastApplied. Because we are // already reading without a timestamp and we can't reacquire the PBWM lock to continue reading // without a timestamp, we uassert in this situation.
ASSERT_THROWS_CODE(AutoGetCollectionForRead(client2.second.get(), nss), @@ -280,9 +280,9 @@ TEST_F(DBRAIITestFixture, } TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) { - // This test simulates a situation where AutoGetCollectionForRead cant read at the no-overlap - // point (minimum of all_durable and lastApplied) because it is set to a point earlier than the - // catalog change. We expect to read without a timestamp and hold the PBWM lock. + // This test simulates a situation where AutoGetCollectionForRead can't read at lastApplied + // because it is set to a point earlier than the catalog change. We expect to read without a + // timestamp and hold the PBWM lock. auto replCoord = repl::ReplicationCoordinator::get(client1.second.get()); CollectionOptions defaultCollectionOptions; ASSERT_OK( @@ -302,7 +302,7 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) { snapshotManager->setLastApplied(opTime.getTimestamp()); AutoGetCollectionForRead coll(client1.second.get(), nss); - // We can't read from kNoOverlap in this scenario because there is a catalog conflict. Resort + // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort // to taking the PBWM lock and reading without a timestamp. ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset); @@ -311,8 +311,8 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) { } TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) { - // This test simulates a situation where AutoGetCollectionForRead reads at the no-overlap - // point (minimum of all_durable and lastApplied) even though lastApplied is not available. + // This test simulates a situation where AutoGetCollectionForRead reads without a timestamp + // because lastApplied is not available.
auto replCoord = repl::ReplicationCoordinator::get(client1.second.get()); CollectionOptions defaultCollectionOptions; ASSERT_OK( @@ -320,23 +320,21 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) { ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY)); // Note that when the collection was created, above, the system chooses a minimum snapshot time - // for the collection. Since last-applied isn't available, we default to all_durable, which is - // available, and is greater than the collection minimum snapshot. + // for the collection. Since last-applied isn't available, we default to read without a + // timestamp. auto snapshotManager = client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager(); ASSERT_FALSE(snapshotManager->getLastApplied()); AutoGetCollectionForRead coll(client1.second.get(), nss); - // Even though lastApplied isn't available, the ReadSource is set to kNoOverlap, which reads - // at the all_durable time. 
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), - RecoveryUnit::ReadSource::kNoOverlap); - ASSERT_TRUE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp()); + RecoveryUnit::ReadSource::kLastApplied); + ASSERT_FALSE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp()); ASSERT_FALSE(client1.second.get()->lockState()->isLockHeldForMode( resourceIdParallelBatchWriterMode, MODE_IS)); } -TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) { +TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) { auto opCtx = client1.second.get(); // Use a tailable query on a capped collection because we can anticipate it automatically @@ -358,9 +356,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) { auto state = exec->getNext(&unused, nullptr); ASSERT_EQ(state, PlanExecutor::ExecState::IS_EOF); - // After restoring, the collection scan should now be reading with kNoOverlap, the default on + // After restoring, the collection scan should now be reading with kLastApplied, the default on // secondaries. - ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap, + ASSERT_EQ(RecoveryUnit::ReadSource::kLastApplied, opCtx->recoveryUnit()->getTimestampReadSource()); ASSERT_EQUALS(PlanExecutor::IS_EOF, exec->getNext(&unused, nullptr)); } @@ -379,7 +377,7 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp) auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection()); // The collection scan should use the default ReadSource on a secondary. 
- ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap, + ASSERT_EQ(RecoveryUnit::ReadSource::kLastApplied, opCtx->recoveryUnit()->getTimestampReadSource()); // When the tailable query recovers from its yield, it should discover that the node is primary diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index a2767c661c7..0fd44b18b42 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -398,6 +398,10 @@ public: */ kNoOverlap, /** + * Read from the lastApplied timestamp. + */ + kLastApplied, + /** * Read from the all_durable timestamp. New transactions will always read from the same * timestamp and never advance. */ @@ -418,6 +422,8 @@ public: return "kMajorityCommitted"; case ReadSource::kNoOverlap: return "kNoOverlap"; + case ReadSource::kLastApplied: + return "kLastApplied"; case ReadSource::kAllDurableSnapshot: return "kAllDurableSnapshot"; case ReadSource::kProvided: diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp index 51d7f0327c1..57a1c3b99e6 100644 --- a/src/mongo/db/storage/snapshot_helper.cpp +++ b/src/mongo/db/storage/snapshot_helper.cpp @@ -52,19 +52,19 @@ bool canSwitchReadSource(OperationContext* opCtx) { return false; } -bool shouldReadAtNoOverlap(OperationContext* opCtx, - const NamespaceString& nss, - std::string* reason) { +bool shouldReadAtLastApplied(OperationContext* opCtx, + const NamespaceString& nss, + std::string* reason) { - // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot read - // at no-overlap. It's important to note that it is possible for this to be false, but still be + // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change + // its ReadSource. It's important to note that it is possible for this to be false, but still be // holding the PBWM lock, explained below. 
if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) { *reason = "conflicts with batch application"; return false; } - // If we are already holding the PBWM lock, do not read at no-overlap. Snapshots acquired by an + // If we are already holding the PBWM lock, do not change ReadSource. Snapshots acquired by an // operation after a yield/restore must see all writes in the pre-yield snapshot. Once a // snapshot is reading without a timestamp, we choose to continue acquiring snapshots without a // timestamp. This is done in lieu of determining a timestamp far enough in the future that's @@ -72,19 +72,19 @@ bool shouldReadAtNoOverlap(OperationContext* opCtx, // held concurrently, which is often the case when DBDirectClient is used. if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) { *reason = "PBWM lock is held"; - LOGV2_DEBUG(20577, 1, "not reading at no-overlap because the PBWM lock is held"); + LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held"); return false; } // If we are in a replication state (like secondary or primary catch-up) where we are not - // accepting writes, we should read at no-overlap. If this node can accept writes, then no + // accepting writes, we should read at lastApplied. If this node can accept writes, then no // conflicting replication batches are being applied and we can read from the default snapshot. if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) { *reason = "primary"; return false; } - // Non-replicated collections do not need to read at no-overlap, as those collections are not + // Non-replicated collections do not need to read at lastApplied, as those collections are not // written by the replication system. However, the oplog is special, as it *is* written by the // replication system. 
if (!nss.isReplicated() && !nss.isOplog()) { @@ -103,39 +103,33 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC const auto existing = opCtx->recoveryUnit()->getTimestampReadSource(); std::string reason; - const bool readAtNoOverlap = shouldReadAtNoOverlap(opCtx, nss, &reason); + const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason); if (existing == RecoveryUnit::ReadSource::kUnset) { // Shifting from reading without a timestamp to reading with a timestamp can be dangerous // because writes will appear to vanish. This case is intended for new reads on secondaries // and query yield recovery after state transitions from primary to secondary. // If a query recovers from a yield and the node is no longer primary, it must start reading - // at the no-overlap point because reading without a timestamp is not safe. The no-overlap - // point (the minimum of WT all_durable and lastApplied) is safe to use because step-down - // closes all oplog holes, which hold back all_durable. The all_durable timestamp will then - // be at least equal to the timestamp of the last write before the state transition, meaning - // a reader performing an untimestamped read before the state transition will see all of the - // same writes after the state transition. - if (readAtNoOverlap) { - LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kNoOverlap", "collection"_attr = nss); - return RecoveryUnit::ReadSource::kNoOverlap; + // at the lastApplied point because reading without a timestamp is not safe. + if (readAtLastApplied) { + LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kLastApplied", "collection"_attr = nss); + return RecoveryUnit::ReadSource::kLastApplied; } - } else if (existing == RecoveryUnit::ReadSource::kNoOverlap) { - // For some reason, we can no longer read at kNoOverlap. + } else if (existing == RecoveryUnit::ReadSource::kLastApplied) { + // For some reason, we can no longer read at lastApplied. 
// An operation that yields a timestamped snapshot must restore a snapshot with at least as // large of a timestamp, or with proper consideration of rollback scenarios, no timestamp. // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to // reading without one. More writes will become visible. - if (!readAtNoOverlap) { + if (!readAtLastApplied) { LOGV2_DEBUG(4452902, 2, "Changing ReadSource to kUnset", "collection"_attr = nss, "reason"_attr = reason); - // This shift to kUnset assumes that callers will not make future attempts to manipulate // their ReadSources after performing reads at an un-timetamped snapshot. The only // exception is callers of this function that may need to change from kUnset to - // kNoOverlap in the event of a catalog conflict or query yield. + // kLastApplied in the event of a catalog conflict or query yield. return RecoveryUnit::ReadSource::kUnset; } } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp index 1577c27111b..a54fd288176 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp @@ -454,6 +454,7 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() // The following ReadSources can only establish a read timestamp when a transaction is // opened. 
case ReadSource::kNoOverlap: + case ReadSource::kLastApplied: case ReadSource::kAllDurableSnapshot: break; } @@ -462,6 +463,7 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() getSession(); switch (_timestampReadSource) { + case ReadSource::kLastApplied: case ReadSource::kNoOverlap: // The lastApplied and allDurable timestamps are not always available if the system has // not accepted writes, so it is not possible to invariant that it exists as other @@ -515,6 +517,10 @@ void WiredTigerRecoveryUnit::_txnOpen() { session, _prepareConflictBehavior, _roundUpPreparedTimestamps); break; } + case ReadSource::kLastApplied: { + _readAtTimestamp = _beginTransactionAtLastAppliedTimestamp(session); + break; + } case ReadSource::kNoOverlap: { _readAtTimestamp = _beginTransactionAtNoOverlapTimestamp(session); break; @@ -566,6 +572,38 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESS return readTimestamp; } +Timestamp WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SESSION* session) { + auto lastApplied = _sessionCache->snapshotManager().getLastApplied(); + if (!lastApplied) { + // When there is not a lastApplied timestamp available, read without a timestamp. Do not + // round up the read timestamp to the oldest timestamp. + + // There is a race that allows new transactions to start between the time we check for a + // read timestamp and start our transaction, which can temporarily violate the contract of + // kLastApplied. That is, writes will be visible that occur after the lastApplied time. This + // is only possible for readers that start immediately after an initial sync that did not + // replicate any oplog entries. Future transactions will start reading at a timestamp once + // timestamped writes have been made. 
+ WiredTigerBeginTxnBlock txnOpen( + session, _prepareConflictBehavior, _roundUpPreparedTimestamps); + LOGV2_DEBUG(4847500, 2, "no read timestamp available for kLastApplied"); + txnOpen.done(); + return Timestamp(); + } + + WiredTigerBeginTxnBlock txnOpen(session, + _prepareConflictBehavior, + _roundUpPreparedTimestamps, + RoundUpReadTimestamp::kRound); + auto status = txnOpen.setReadSnapshot(*lastApplied); + fassert(4847501, status); + txnOpen.done(); + + // We might have rounded to oldest between calling getLastApplied and setReadSnapshot. We + // need to get the actual read timestamp we used. + return _getTransactionReadTimestamp(session); +} + Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) { auto lastApplied = _sessionCache->snapshotManager().getLastApplied(); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h index 19c67fcfe62..c61d80205ac 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h @@ -228,6 +228,12 @@ private: Timestamp _beginTransactionAtNoOverlapTimestamp(WT_SESSION* session); /** + * Starts a transaction at the lastApplied timestamp. Returns the timestamp at which the + * transaction was started. + */ + Timestamp _beginTransactionAtLastAppliedTimestamp(WT_SESSION* session); + + /** * Returns the timestamp at which the current transaction is reading. */ Timestamp _getTransactionReadTimestamp(WT_SESSION* session); |