1 files changed, 62 insertions, 151 deletions
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 34775dd7d01..53dda3ca490 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -39,6 +39,7 @@
 #include "mongo/db/db_raii_gen.h"
 #include "mongo/db/repl/replication_coordinator.h"
 #include "mongo/db/s/collection_sharding_state.h"
+#include "mongo/db/storage/snapshot_helper.h"
 #include "mongo/logv2/log.h"
 
 namespace mongo {
@@ -112,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
     _autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline);
 
     // If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do
-    // not consider reading at last applied (e.g. FTDC needs that).
+    // not consider reading at the no-overlap point (e.g. FTDC needs that).
     if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp)
         return;
 
@@ -123,10 +124,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
     // need to check for pending catalog changes.
     while (auto coll = _autoColl->getCollection()) {
 
-        auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
-        auto minSnapshot = coll->getMinimumVisibleSnapshot();
-        auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
-
         // TODO(SERVER-47824): Also ban transaction snapshot reads on capped collections.
         uassert(ErrorCodes::SnapshotUnavailable,
                 "Reading from capped collections with readConcern snapshot is not supported "
@@ -135,77 +132,85 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
                     readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern ||
                     opCtx->inMultiDocumentTransaction());
 
-        // If we are reading at a provided timestamp earlier than the latest catalog changes,
-        // then we must return an error.
-        if (readSource == RecoveryUnit::ReadSource::kProvided && minSnapshot &&
-            (*mySnapshot < *minSnapshot)) {
-            uasserted(ErrorCodes::SnapshotUnavailable,
-                      str::stream()
-                          << "Unable to read from a snapshot due to pending collection catalog "
-                             "changes; please retry the operation. Snapshot timestamp is "
-                          << mySnapshot->toString() << ". Collection minimum is "
-                          << minSnapshot->toString());
-        }
-
         // During batch application on secondaries, there is a potential to read inconsistent states
         // that would normally be protected by the PBWM lock. In order to serve secondary reads
         // during this period, we default to not acquiring the lock (by setting
         // _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a
         // consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
-        // guarantee we read at a consistent state, so we must read at the last applied timestamp,
-        // which is set after each complete batch.
+        // guarantee we read at a consistent state, so we must read at the no-overlap timestamp,
+        // which is a function of the lastApplied timestamp, which is set after each complete batch.
         //
-        // If an attempt to read at the last applied timestamp is unsuccessful because there are
-        // pending catalog changes that occur after the last applied timestamp, we release our locks
+        // If an attempt to read at the no-overlap timestamp is unsuccessful because there are
+        // pending catalog changes that occur after the no-overlap timestamp, we release our locks
         // and try again with the PBWM lock (by unsetting
         // _shouldNotConflictWithSecondaryBatchApplicationBlock).
 
         const NamespaceString nss = coll->ns();
+        auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
 
-        bool readAtLastAppliedTimestamp =
-            _shouldReadAtLastAppliedTimestamp(opCtx, nss, readConcernLevel);
-
-        if (readAtLastAppliedTimestamp) {
-            opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
-            readSource = opCtx->recoveryUnit()->getTimestampReadSource();
+        // Once we have our locks, check whether or not we should override the ReadSource that was
+        // set before acquiring locks.
+        if (auto newReadSource = SnapshotHelper::getNewReadSource(opCtx, nss)) {
+            opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource);
+            readSource = *newReadSource;
         }
 
-        // This timestamp could be earlier than the timestamp seen when the transaction is opened
-        // because it is set asynchonously. This is not problematic because holding the collection
-        // lock guarantees no metadata changes will occur in that time.
-        auto lastAppliedTimestamp = readAtLastAppliedTimestamp
-            ? boost::optional<Timestamp>(replCoord->getMyLastAppliedOpTime().getTimestamp())
-            : boost::none;
+        const auto readTimestamp = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
+        const auto afterClusterTime = repl::ReadConcernArgs::get(opCtx).getArgsAfterClusterTime();
+        if (readTimestamp && afterClusterTime) {
+            // Readers that use afterClusterTime have already waited at a higher level for the
+            // lastApplied time to advance to a specified optime, and they assume the read timestamp
+            // of the operation is at least that waited-for timestamp. For kNoOverlap, which is the
+            // minimum of lastApplied and all_durable, this invariant ensures that afterClusterTime
+            // reads do not choose a read timestamp older than the one requested.
+            invariant(*readTimestamp >= afterClusterTime->asTimestamp(),
+                      str::stream() << "read timestamp " << readTimestamp->toString()
+                                    << " was less than afterClusterTime: "
+                                    << afterClusterTime->asTimestamp().toString());
+        }
 
-        if (!_conflictingCatalogChanges(opCtx, minSnapshot, lastAppliedTimestamp)) {
+        auto minSnapshot = coll->getMinimumVisibleSnapshot();
+        if (!SnapshotHelper::collectionChangesConflictWithRead(minSnapshot, readTimestamp)) {
             return;
         }
 
-        invariant(lastAppliedTimestamp ||
-                  // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
-                  // that are safe with respect to concurrent secondary batch application.
-                  readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
-                  readSource == RecoveryUnit::ReadSource::kNoOverlap);
+        // If we are reading at a provided timestamp earlier than the latest catalog changes,
+        // then we must return an error.
+        if (readSource == RecoveryUnit::ReadSource::kProvided) {
+            uasserted(ErrorCodes::SnapshotUnavailable,
+                      str::stream()
+                          << "Unable to read from a snapshot due to pending collection catalog "
+                             "changes; please retry the operation. Snapshot timestamp is "
+                          << readTimestamp->toString() << ". Collection minimum is "
+                          << minSnapshot->toString());
+        }
+
+        invariant(
+            // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
+            // that are safe with respect to concurrent secondary batch application, and are
+            // eligible for retrying.
+            readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
+            readSource == RecoveryUnit::ReadSource::kNoOverlap);
         invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern);
 
         // Yield locks in order to do the blocking call below.
         _autoColl = boost::none;
 
-        // If there are pending catalog changes, we should conflict with any in-progress batches (by
-        // taking the PBWM lock) and choose not to read from the last applied timestamp by unsetting
-        // _shouldNotConflictWithSecondaryBatchApplicationBlock. Index builds on secondaries can
-        // complete at timestamps later than the lastAppliedTimestamp during initial sync. After
-        // initial sync finishes, if we waited instead of retrying, readers would block indefinitely
-        // waiting for the lastAppliedTimestamp to move forward. Instead we force the reader take
-        // the PBWM lock and retry.
-        if (lastAppliedTimestamp) {
+        // If there are pending catalog changes when using a no-overlap read source, we choose to
+        // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
+        // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
+        // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
+        // timestamp) but should not be necessary for correctness. After initial sync finishes, if
+        // we waited instead of retrying, readers would block indefinitely waiting for the
+        // noOverlap time to move forward. Instead we make the reader take the PBWM lock and retry.
+        if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
+            invariant(readTimestamp);
             LOGV2(20576,
-                  "tried reading at last-applied time: {lastAppliedTimestamp} on ns: {nss_ns}, but "
-                  "future catalog changes are pending at time {minSnapshot}. Trying again without "
-                  "reading at last-applied time.",
-                  "lastAppliedTimestamp"_attr = *lastAppliedTimestamp,
-                  "nss_ns"_attr = nss.ns(),
-                  "minSnapshot"_attr = *minSnapshot);
+                  "Tried reading at no-overlap time, but future catalog changes are pending. "
+                  "Trying again without reading at no-overlap time.",
+                  "noOverlapTimestamp"_attr = *readTimestamp,
+                  "collection"_attr = nss.ns(),
+                  "collectionMinSnapshot"_attr = *minSnapshot);
             // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
             // previous value. If the previous value is false (because there is another
             // shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this
@@ -220,28 +225,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
                 ErrorCodes::SnapshotUnavailable,
                 str::stream() << "Unable to read from a snapshot due to pending collection catalog "
                                  "changes; please retry the operation. Snapshot timestamp is "
-                              << (mySnapshot ? mySnapshot->toString() : "(none)")
-                              << ". Collection minimum is " << minSnapshot->toString(),
+                              << readTimestamp->toString() << ". Collection minimum is "
+                              << minSnapshot->toString(),
                 opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
 
-            // Cannot change ReadSource while a RecoveryUnit is active, which may result from
-            // calling getPointInTimeReadTimestamp().
-            opCtx->recoveryUnit()->abandonSnapshot();
-            opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
-        }
-
-        // If there are pending catalog changes when using a no-overlap read source, we choose to
-        // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
-        // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
-        // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
-        // timestamp) but should not be necessary for correctness.
-        if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
-            invariant(!lastAppliedTimestamp);  // no-overlap read source selects its own timestamp.
-            _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none;
-            invariant(opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
-
-            // Abandon our snapshot but don't change our read source, so that we can select a new
-            // read timestamp on the next loop iteration.
+            // Abandon our snapshot. We may select a new read timestamp or ReadSource in the next
+            // loop iteration.
             opCtx->recoveryUnit()->abandonSnapshot();
         }
 
@@ -259,84 +248,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
     }
 }
 
-bool AutoGetCollectionForRead::_shouldReadAtLastAppliedTimestamp(
-    OperationContext* opCtx,
-    const NamespaceString& nss,
-    repl::ReadConcernLevel readConcernLevel) const {
-
-    // If this block is unset, then the operation did not opt-out of the PBWM lock, implying that it
-    // cannot read at lastApplied. It's important to note that it is possible for this to be set,
-    // but still be holding the PBWM lock, explained below.
-    if (!_shouldNotConflictWithSecondaryBatchApplicationBlock) {
-        return false;
-    }
-
-    // If we are already holding the PBWM lock, do not read at last-applied. This is because once an
-    // operation reads without a timestamp (effectively seeing all writes), it is no longer safe to
-    // start reading at a timestamp, as writes or catalog operations may appear to vanish.
-    // This may occur when multiple collection locks are held concurrently, which is often the case
-    // when DBDirectClient is used.
-    if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
-        LOGV2_DEBUG(20577, 1, "not reading at last-applied because the PBWM lock is held");
-        return false;
-    }
-
-    // Majority and snapshot readConcern levels should not read from lastApplied; these read
-    // concerns already have a designated timestamp to read from.
-    if (readConcernLevel != repl::ReadConcernLevel::kLocalReadConcern &&
-        readConcernLevel != repl::ReadConcernLevel::kAvailableReadConcern) {
-        return false;
-    }
-
-    // If we are in a replication state (like secondary or primary catch-up) where we are not
-    // accepting writes, we should read at lastApplied. If this node can accept writes, then no
-    // conflicting replication batches are being applied and we can read from the default snapshot.
-    if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
-        return false;
-    }
-
-    // Non-replicated collections do not need to read at lastApplied, as those collections are not
-    // written by the replication system.  However, the oplog is special, as it *is* written by the
-    // replication system.
-    if (!nss.isReplicated() && !nss.isOplog()) {
-        return false;
-    }
-
-    return true;
-}
-
-bool AutoGetCollectionForRead::_conflictingCatalogChanges(
-    OperationContext* opCtx,
-    boost::optional<Timestamp> minSnapshot,
-    boost::optional<Timestamp> lastAppliedTimestamp) const {
-    // This is the timestamp of the most recent catalog changes to this collection. If this is
-    // greater than any point in time read timestamps, we should either wait or return an error.
-    if (!minSnapshot) {
-        return false;
-    }
-
-    // If we are reading from the lastAppliedTimestamp and it is up-to-date with any catalog
-    // changes, we can return.
-    if (lastAppliedTimestamp &&
-        (lastAppliedTimestamp->isNull() || *lastAppliedTimestamp >= *minSnapshot)) {
-        return false;
-    }
-
-    // This can be set when readConcern is "snapshot" or "majority".
-    auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
-
-    // If we do not have a point in time to conflict with minSnapshot, return.
-    if (!mySnapshot && !lastAppliedTimestamp) {
-        return false;
-    }
-
-    // Return if there are no conflicting catalog changes with mySnapshot.
-    if (mySnapshot && *mySnapshot >= *minSnapshot) {
-        return false;
-    }
-
-    return true;
-}
 
 AutoGetCollectionForReadCommand::AutoGetCollectionForReadCommand(
     OperationContext* opCtx,