diff options
Diffstat (limited to 'src/mongo/db/db_raii.cpp')
-rw-r--r-- | src/mongo/db/db_raii.cpp | 213 |
1 files changed, 62 insertions, 151 deletions
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp index 34775dd7d01..53dda3ca490 100644 --- a/src/mongo/db/db_raii.cpp +++ b/src/mongo/db/db_raii.cpp @@ -39,6 +39,7 @@ #include "mongo/db/db_raii_gen.h" #include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/s/collection_sharding_state.h" +#include "mongo/db/storage/snapshot_helper.h" #include "mongo/logv2/log.h" namespace mongo { @@ -112,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, _autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline); // If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do - // not consider reading at last applied (e.g. FTDC needs that). + // not consider reading at the no-overlap point (e.g. FTDC needs that). if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp) return; @@ -123,10 +124,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, // need to check for pending catalog changes. while (auto coll = _autoColl->getCollection()) { - auto readSource = opCtx->recoveryUnit()->getTimestampReadSource(); - auto minSnapshot = coll->getMinimumVisibleSnapshot(); - auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); - // TODO(SERVER-47824): Also ban transaction snapshot reads on capped collections. uassert(ErrorCodes::SnapshotUnavailable, "Reading from capped collections with readConcern snapshot is not supported " @@ -135,77 +132,85 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern || opCtx->inMultiDocumentTransaction()); - // If we are reading at a provided timestamp earlier than the latest catalog changes, - // then we must return an error. - if (readSource == RecoveryUnit::ReadSource::kProvided && minSnapshot && - (*mySnapshot < *minSnapshot)) { - uasserted(ErrorCodes::SnapshotUnavailable, - str::stream() - << "Unable to read from a snapshot due to pending collection catalog " - "changes; please retry the operation. Snapshot timestamp is " - << mySnapshot->toString() << ". Collection minimum is " - << minSnapshot->toString()); - } - // During batch application on secondaries, there is a potential to read inconsistent states // that would normally be protected by the PBWM lock. In order to serve secondary reads // during this period, we default to not acquiring the lock (by setting // _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a // consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to - // guarantee we read at a consistent state, so we must read at the last applied timestamp, - // which is set after each complete batch. + // guarantee we read at a consistent state, so we must read at the no-overlap timestamp, + // which is a function of the lastApplied timestamp, which is set after each complete batch. // - // If an attempt to read at the last applied timestamp is unsuccessful because there are - // pending catalog changes that occur after the last applied timestamp, we release our locks + // If an attempt to read at the no-overlap timestamp is unsuccessful because there are + // pending catalog changes that occur after the no-overlap timestamp, we release our locks // and try again with the PBWM lock (by unsetting // _shouldNotConflictWithSecondaryBatchApplicationBlock). const NamespaceString nss = coll->ns(); + auto readSource = opCtx->recoveryUnit()->getTimestampReadSource(); - bool readAtLastAppliedTimestamp = - _shouldReadAtLastAppliedTimestamp(opCtx, nss, readConcernLevel); - - if (readAtLastAppliedTimestamp) { - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied); - readSource = opCtx->recoveryUnit()->getTimestampReadSource(); + // Once we have our locks, check whether or not we should override the ReadSource that was + // set before acquiring locks. + if (auto newReadSource = SnapshotHelper::getNewReadSource(opCtx, nss)) { + opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource); + readSource = *newReadSource; } - // This timestamp could be earlier than the timestamp seen when the transaction is opened - // because it is set asynchonously. This is not problematic because holding the collection - // lock guarantees no metadata changes will occur in that time. - auto lastAppliedTimestamp = readAtLastAppliedTimestamp - ? boost::optional<Timestamp>(replCoord->getMyLastAppliedOpTime().getTimestamp()) - : boost::none; + const auto readTimestamp = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); + const auto afterClusterTime = repl::ReadConcernArgs::get(opCtx).getArgsAfterClusterTime(); + if (readTimestamp && afterClusterTime) { + // Readers that use afterClusterTime have already waited at a higher level for the + // lastApplied time to advance to a specified optime, and they assume the read timestamp + // of the operation is at least that waited-for timestamp. For kNoOverlap, which is the + // minimum of lastApplied and all_durable, this invariant ensures that afterClusterTime + // reads do not choose a read timestamp older than the one requested. + invariant(*readTimestamp >= afterClusterTime->asTimestamp(), + str::stream() << "read timestamp " << readTimestamp->toString() + << "was less than afterClusterTime: " + << afterClusterTime->asTimestamp().toString()); + } - if (!_conflictingCatalogChanges(opCtx, minSnapshot, lastAppliedTimestamp)) { + auto minSnapshot = coll->getMinimumVisibleSnapshot(); + if (!SnapshotHelper::collectionChangesConflictWithRead(minSnapshot, readTimestamp)) { return; } - invariant(lastAppliedTimestamp || - // The kMajorityCommitted and kNoOverlap read sources already read from timestamps - // that are safe with respect to concurrent secondary batch application. - readSource == RecoveryUnit::ReadSource::kMajorityCommitted || - readSource == RecoveryUnit::ReadSource::kNoOverlap); + // If we are reading at a provided timestamp earlier than the latest catalog changes, + // then we must return an error. + if (readSource == RecoveryUnit::ReadSource::kProvided) { + uasserted(ErrorCodes::SnapshotUnavailable, + str::stream() + << "Unable to read from a snapshot due to pending collection catalog " + "changes; please retry the operation. Snapshot timestamp is " + << readTimestamp->toString() << ". Collection minimum is " + << minSnapshot->toString()); + } + + invariant( + // The kMajorityCommitted and kNoOverlap read sources already read from timestamps + // that are safe with respect to concurrent secondary batch application, and are + // eligible for retrying. + readSource == RecoveryUnit::ReadSource::kMajorityCommitted || + readSource == RecoveryUnit::ReadSource::kNoOverlap); invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern); // Yield locks in order to do the blocking call below. _autoColl = boost::none; - // If there are pending catalog changes, we should conflict with any in-progress batches (by - // taking the PBWM lock) and choose not to read from the last applied timestamp by unsetting - // _shouldNotConflictWithSecondaryBatchApplicationBlock. Index builds on secondaries can - // complete at timestamps later than the lastAppliedTimestamp during initial sync. After - // initial sync finishes, if we waited instead of retrying, readers would block indefinitely - // waiting for the lastAppliedTimestamp to move forward. Instead we force the reader take - // the PBWM lock and retry. - if (lastAppliedTimestamp) { + // If there are pending catalog changes when using a no-overlap read source, we choose to + // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly + // spinning in this loop trying to get a new read timestamp ahead of the minimum visible + // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read + // timestamp) but should not be necessary for correctness. After initial sync finishes, if + // we waited instead of retrying, readers would block indefinitely waiting for the + // noOverlap time to move forward. Instead we force the reader take the PBWM lock and retry. + if (readSource == RecoveryUnit::ReadSource::kNoOverlap) { + invariant(readTimestamp); LOGV2(20576, - "tried reading at last-applied time: {lastAppliedTimestamp} on ns: {nss_ns}, but " - "future catalog changes are pending at time {minSnapshot}. Trying again without " - "reading at last-applied time.", - "lastAppliedTimestamp"_attr = *lastAppliedTimestamp, - "nss_ns"_attr = nss.ns(), - "minSnapshot"_attr = *minSnapshot); + "Tried reading at no-overlap time, but future catalog changes are pending. " + "Trying again without reading at no-overlap time.", + "noOverlapTimestamp"_attr = *readTimestamp, + "collection"_attr = nss.ns(), + "collectionMinSnapshot"_attr = *minSnapshot); // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the // previous value. If the previous value is false (because there is another // shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this @@ -220,28 +225,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, ErrorCodes::SnapshotUnavailable, str::stream() << "Unable to read from a snapshot due to pending collection catalog " "changes; please retry the operation. Snapshot timestamp is " - << (mySnapshot ? mySnapshot->toString() : "(none)") - << ". Collection minimum is " << minSnapshot->toString(), + << readTimestamp->toString() << ". Collection minimum is " + << minSnapshot->toString(), opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()); - // Cannot change ReadSource while a RecoveryUnit is active, which may result from - // calling getPointInTimeReadTimestamp(). - opCtx->recoveryUnit()->abandonSnapshot(); - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); - } - - // If there are pending catalog changes when using a no-overlap read source, we choose to - // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly - // spinning in this loop trying to get a new read timestamp ahead of the minimum visible - // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read - // timestamp) but should not be necessary for correctness. - if (readSource == RecoveryUnit::ReadSource::kNoOverlap) { - invariant(!lastAppliedTimestamp); // no-overlap read source selects its own timestamp. - _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none; - invariant(opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()); - - // Abandon our snapshot but don't change our read source, so that we can select a new - // read timestamp on the next loop iteration. + // Abandon our snapshot. We may select a new read timestamp or ReadSource in the next + // loop iteration. opCtx->recoveryUnit()->abandonSnapshot(); } @@ -259,84 +248,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, } } -bool AutoGetCollectionForRead::_shouldReadAtLastAppliedTimestamp( - OperationContext* opCtx, - const NamespaceString& nss, - repl::ReadConcernLevel readConcernLevel) const { - - // If this block is unset, then the operation did not opt-out of the PBWM lock, implying that it - // cannot read at lastApplied. It's important to note that it is possible for this to be set, - // but still be holding the PBWM lock, explained below. - if (!_shouldNotConflictWithSecondaryBatchApplicationBlock) { - return false; - } - - // If we are already holding the PBWM lock, do not read at last-applied. This is because once an - // operation reads without a timestamp (effectively seeing all writes), it is no longer safe to - // start reading at a timestamp, as writes or catalog operations may appear to vanish. - // This may occur when multiple collection locks are held concurrently, which is often the case - // when DBDirectClient is used. - if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) { - LOGV2_DEBUG(20577, 1, "not reading at last-applied because the PBWM lock is held"); - return false; - } - - // Majority and snapshot readConcern levels should not read from lastApplied; these read - // concerns already have a designated timestamp to read from. - if (readConcernLevel != repl::ReadConcernLevel::kLocalReadConcern && - readConcernLevel != repl::ReadConcernLevel::kAvailableReadConcern) { - return false; - } - - // If we are in a replication state (like secondary or primary catch-up) where we are not - // accepting writes, we should read at lastApplied. If this node can accept writes, then no - // conflicting replication batches are being applied and we can read from the default snapshot. - if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) { - return false; - } - - // Non-replicated collections do not need to read at lastApplied, as those collections are not - // written by the replication system. However, the oplog is special, as it *is* written by the - // replication system. - if (!nss.isReplicated() && !nss.isOplog()) { - return false; - } - - return true; -} - -bool AutoGetCollectionForRead::_conflictingCatalogChanges( - OperationContext* opCtx, - boost::optional<Timestamp> minSnapshot, - boost::optional<Timestamp> lastAppliedTimestamp) const { - // This is the timestamp of the most recent catalog changes to this collection. If this is - // greater than any point in time read timestamps, we should either wait or return an error. - if (!minSnapshot) { - return false; - } - - // If we are reading from the lastAppliedTimestamp and it is up-to-date with any catalog - // changes, we can return. - if (lastAppliedTimestamp && - (lastAppliedTimestamp->isNull() || *lastAppliedTimestamp >= *minSnapshot)) { - return false; - } - - // This can be set when readConcern is "snapshot" or "majority". - auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); - - // If we do not have a point in time to conflict with minSnapshot, return. - if (!mySnapshot && !lastAppliedTimestamp) { - return false; - } - - // Return if there are no conflicting catalog changes with mySnapshot. - if (mySnapshot && *mySnapshot >= *minSnapshot) { - return false; - } - - return true; -} AutoGetCollectionForReadCommand::AutoGetCollectionForReadCommand( OperationContext* opCtx, |