summary | refs | log | tree | commit | diff
path: root/src/mongo/db/db_raii.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/db_raii.cpp')
-rw-r--r-- src/mongo/db/db_raii.cpp 213
1 files changed, 62 insertions, 151 deletions
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 34775dd7d01..53dda3ca490 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -39,6 +39,7 @@
#include "mongo/db/db_raii_gen.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/s/collection_sharding_state.h"
+#include "mongo/db/storage/snapshot_helper.h"
#include "mongo/logv2/log.h"
namespace mongo {
@@ -112,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
_autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline);
// If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do
- // not consider reading at last applied (e.g. FTDC needs that).
+ // not consider reading at the no-overlap point (e.g. FTDC needs that).
if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp)
return;
@@ -123,10 +124,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
// need to check for pending catalog changes.
while (auto coll = _autoColl->getCollection()) {
- auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
- auto minSnapshot = coll->getMinimumVisibleSnapshot();
- auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
-
// TODO(SERVER-47824): Also ban transaction snapshot reads on capped collections.
uassert(ErrorCodes::SnapshotUnavailable,
"Reading from capped collections with readConcern snapshot is not supported "
@@ -135,77 +132,85 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern ||
opCtx->inMultiDocumentTransaction());
- // If we are reading at a provided timestamp earlier than the latest catalog changes,
- // then we must return an error.
- if (readSource == RecoveryUnit::ReadSource::kProvided && minSnapshot &&
- (*mySnapshot < *minSnapshot)) {
- uasserted(ErrorCodes::SnapshotUnavailable,
- str::stream()
- << "Unable to read from a snapshot due to pending collection catalog "
- "changes; please retry the operation. Snapshot timestamp is "
- << mySnapshot->toString() << ". Collection minimum is "
- << minSnapshot->toString());
- }
-
// During batch application on secondaries, there is a potential to read inconsistent states
// that would normally be protected by the PBWM lock. In order to serve secondary reads
// during this period, we default to not acquiring the lock (by setting
// _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a
// consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
- // guarantee we read at a consistent state, so we must read at the last applied timestamp,
- // which is set after each complete batch.
+ // guarantee we read at a consistent state, so we must read at the no-overlap timestamp,
+ // which is a function of the lastApplied timestamp, which is set after each complete batch.
//
- // If an attempt to read at the last applied timestamp is unsuccessful because there are
- // pending catalog changes that occur after the last applied timestamp, we release our locks
+ // If an attempt to read at the no-overlap timestamp is unsuccessful because there are
+ // pending catalog changes that occur after the no-overlap timestamp, we release our locks
// and try again with the PBWM lock (by unsetting
// _shouldNotConflictWithSecondaryBatchApplicationBlock).
const NamespaceString nss = coll->ns();
+ auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
- bool readAtLastAppliedTimestamp =
- _shouldReadAtLastAppliedTimestamp(opCtx, nss, readConcernLevel);
-
- if (readAtLastAppliedTimestamp) {
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
- readSource = opCtx->recoveryUnit()->getTimestampReadSource();
+ // Once we have our locks, check whether or not we should override the ReadSource that was
+ // set before acquiring locks.
+ if (auto newReadSource = SnapshotHelper::getNewReadSource(opCtx, nss)) {
+ opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource);
+ readSource = *newReadSource;
}
- // This timestamp could be earlier than the timestamp seen when the transaction is opened
- // because it is set asynchonously. This is not problematic because holding the collection
- // lock guarantees no metadata changes will occur in that time.
- auto lastAppliedTimestamp = readAtLastAppliedTimestamp
- ? boost::optional<Timestamp>(replCoord->getMyLastAppliedOpTime().getTimestamp())
- : boost::none;
+ const auto readTimestamp = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
+ const auto afterClusterTime = repl::ReadConcernArgs::get(opCtx).getArgsAfterClusterTime();
+ if (readTimestamp && afterClusterTime) {
+ // Readers that use afterClusterTime have already waited at a higher level for the
+ // lastApplied time to advance to a specified optime, and they assume the read timestamp
+ // of the operation is at least that waited-for timestamp. For kNoOverlap, which is the
+ // minimum of lastApplied and all_durable, this invariant ensures that afterClusterTime
+ // reads do not choose a read timestamp older than the one requested.
+ invariant(*readTimestamp >= afterClusterTime->asTimestamp(),
+ str::stream() << "read timestamp " << readTimestamp->toString()
+ << "was less than afterClusterTime: "
+ << afterClusterTime->asTimestamp().toString());
+ }
- if (!_conflictingCatalogChanges(opCtx, minSnapshot, lastAppliedTimestamp)) {
+ auto minSnapshot = coll->getMinimumVisibleSnapshot();
+ if (!SnapshotHelper::collectionChangesConflictWithRead(minSnapshot, readTimestamp)) {
return;
}
- invariant(lastAppliedTimestamp ||
- // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
- // that are safe with respect to concurrent secondary batch application.
- readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
- readSource == RecoveryUnit::ReadSource::kNoOverlap);
+ // If we are reading at a provided timestamp earlier than the latest catalog changes,
+ // then we must return an error.
+ if (readSource == RecoveryUnit::ReadSource::kProvided) {
+ uasserted(ErrorCodes::SnapshotUnavailable,
+ str::stream()
+ << "Unable to read from a snapshot due to pending collection catalog "
+ "changes; please retry the operation. Snapshot timestamp is "
+ << readTimestamp->toString() << ". Collection minimum is "
+ << minSnapshot->toString());
+ }
+
+ invariant(
+ // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
+ // that are safe with respect to concurrent secondary batch application, and are
+ // eligible for retrying.
+ readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
+ readSource == RecoveryUnit::ReadSource::kNoOverlap);
invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern);
// Yield locks in order to do the blocking call below.
_autoColl = boost::none;
- // If there are pending catalog changes, we should conflict with any in-progress batches (by
- // taking the PBWM lock) and choose not to read from the last applied timestamp by unsetting
- // _shouldNotConflictWithSecondaryBatchApplicationBlock. Index builds on secondaries can
- // complete at timestamps later than the lastAppliedTimestamp during initial sync. After
- // initial sync finishes, if we waited instead of retrying, readers would block indefinitely
- // waiting for the lastAppliedTimestamp to move forward. Instead we force the reader take
- // the PBWM lock and retry.
- if (lastAppliedTimestamp) {
+ // If there are pending catalog changes when using a no-overlap read source, we choose to
+ // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
+ // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
+ // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
+ // timestamp) but should not be necessary for correctness. After initial sync finishes, if
+ // we waited instead of retrying, readers would block indefinitely waiting for the noOverlap
+ // time to move forward. Instead we force the reader to take the PBWM lock and retry.
+ if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
+ invariant(readTimestamp);
LOGV2(20576,
- "tried reading at last-applied time: {lastAppliedTimestamp} on ns: {nss_ns}, but "
- "future catalog changes are pending at time {minSnapshot}. Trying again without "
- "reading at last-applied time.",
- "lastAppliedTimestamp"_attr = *lastAppliedTimestamp,
- "nss_ns"_attr = nss.ns(),
- "minSnapshot"_attr = *minSnapshot);
+ "Tried reading at no-overlap time, but future catalog changes are pending. "
+ "Trying again without reading at no-overlap time.",
+ "noOverlapTimestamp"_attr = *readTimestamp,
+ "collection"_attr = nss.ns(),
+ "collectionMinSnapshot"_attr = *minSnapshot);
// Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
// previous value. If the previous value is false (because there is another
// shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this
@@ -220,28 +225,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
ErrorCodes::SnapshotUnavailable,
str::stream() << "Unable to read from a snapshot due to pending collection catalog "
"changes; please retry the operation. Snapshot timestamp is "
- << (mySnapshot ? mySnapshot->toString() : "(none)")
- << ". Collection minimum is " << minSnapshot->toString(),
+ << readTimestamp->toString() << ". Collection minimum is "
+ << minSnapshot->toString(),
opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
- // Cannot change ReadSource while a RecoveryUnit is active, which may result from
- // calling getPointInTimeReadTimestamp().
- opCtx->recoveryUnit()->abandonSnapshot();
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
- }
-
- // If there are pending catalog changes when using a no-overlap read source, we choose to
- // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
- // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
- // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
- // timestamp) but should not be necessary for correctness.
- if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
- invariant(!lastAppliedTimestamp); // no-overlap read source selects its own timestamp.
- _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none;
- invariant(opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
-
- // Abandon our snapshot but don't change our read source, so that we can select a new
- // read timestamp on the next loop iteration.
+ // Abandon our snapshot. We may select a new read timestamp or ReadSource in the next
+ // loop iteration.
opCtx->recoveryUnit()->abandonSnapshot();
}
@@ -259,84 +248,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
}
}
-bool AutoGetCollectionForRead::_shouldReadAtLastAppliedTimestamp(
- OperationContext* opCtx,
- const NamespaceString& nss,
- repl::ReadConcernLevel readConcernLevel) const {
-
- // If this block is unset, then the operation did not opt-out of the PBWM lock, implying that it
- // cannot read at lastApplied. It's important to note that it is possible for this to be set,
- // but still be holding the PBWM lock, explained below.
- if (!_shouldNotConflictWithSecondaryBatchApplicationBlock) {
- return false;
- }
-
- // If we are already holding the PBWM lock, do not read at last-applied. This is because once an
- // operation reads without a timestamp (effectively seeing all writes), it is no longer safe to
- // start reading at a timestamp, as writes or catalog operations may appear to vanish.
- // This may occur when multiple collection locks are held concurrently, which is often the case
- // when DBDirectClient is used.
- if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
- LOGV2_DEBUG(20577, 1, "not reading at last-applied because the PBWM lock is held");
- return false;
- }
-
- // Majority and snapshot readConcern levels should not read from lastApplied; these read
- // concerns already have a designated timestamp to read from.
- if (readConcernLevel != repl::ReadConcernLevel::kLocalReadConcern &&
- readConcernLevel != repl::ReadConcernLevel::kAvailableReadConcern) {
- return false;
- }
-
- // If we are in a replication state (like secondary or primary catch-up) where we are not
- // accepting writes, we should read at lastApplied. If this node can accept writes, then no
- // conflicting replication batches are being applied and we can read from the default snapshot.
- if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
- return false;
- }
-
- // Non-replicated collections do not need to read at lastApplied, as those collections are not
- // written by the replication system. However, the oplog is special, as it *is* written by the
- // replication system.
- if (!nss.isReplicated() && !nss.isOplog()) {
- return false;
- }
-
- return true;
-}
-
-bool AutoGetCollectionForRead::_conflictingCatalogChanges(
- OperationContext* opCtx,
- boost::optional<Timestamp> minSnapshot,
- boost::optional<Timestamp> lastAppliedTimestamp) const {
- // This is the timestamp of the most recent catalog changes to this collection. If this is
- // greater than any point in time read timestamps, we should either wait or return an error.
- if (!minSnapshot) {
- return false;
- }
-
- // If we are reading from the lastAppliedTimestamp and it is up-to-date with any catalog
- // changes, we can return.
- if (lastAppliedTimestamp &&
- (lastAppliedTimestamp->isNull() || *lastAppliedTimestamp >= *minSnapshot)) {
- return false;
- }
-
- // This can be set when readConcern is "snapshot" or "majority".
- auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
-
- // If we do not have a point in time to conflict with minSnapshot, return.
- if (!mySnapshot && !lastAppliedTimestamp) {
- return false;
- }
-
- // Return if there are no conflicting catalog changes with mySnapshot.
- if (mySnapshot && *mySnapshot >= *minSnapshot) {
- return false;
- }
-
- return true;
-}
AutoGetCollectionForReadCommand::AutoGetCollectionForReadCommand(
OperationContext* opCtx,