summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHenrik Edin <henrik.edin@mongodb.com>2022-07-07 12:50:14 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-08-01 13:05:12 +0000
commitec54462dba6610bbd18bb4e676c7ef5637aab947 (patch)
treef15743a80d768badc0552587e8dc1cb303883133
parentf5d28a6e731319f7ef6c2745df65e62fdce885f6 (diff)
downloadmongo-ec54462dba6610bbd18bb4e676c7ef5637aab947.tar.gz
SERVER-67305 Fix lock-free read timestamp order during repl stepup
Eliminate the race between setting read source to lastApplied and getting the lastApplied timestamp for storage snapshot. When transitioning out of primary catchup mode we may get a snapshot at lastApplied concurrent with out-of-order writers. Changed so the lastApplied timestamp is determined when calling setTimestampReadSource to allow us to re-verify that we need to read at lastApplied before opening the storage snapshot. (cherry picked from commit 381f698f0e5ba9099ed2bb1d0f51cd7e5e92aea0)
-rw-r--r--src/mongo/db/catalog/collection_catalog.cpp5
-rw-r--r--src/mongo/db/db_raii.cpp15
-rw-r--r--src/mongo/db/storage/snapshot_helper.cpp127
-rw-r--r--src/mongo/db/storage/snapshot_helper.h15
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp33
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h7
6 files changed, 130 insertions, 72 deletions
diff --git a/src/mongo/db/catalog/collection_catalog.cpp b/src/mongo/db/catalog/collection_catalog.cpp
index 2ca8ce0d020..d7f6cb14e9d 100644
--- a/src/mongo/db/catalog/collection_catalog.cpp
+++ b/src/mongo/db/catalog/collection_catalog.cpp
@@ -1217,10 +1217,7 @@ const Collection* LookupCollectionForYieldRestore::operator()(OperationContext*
// state. After a query yields its locks, the replication state may have changed, invalidating
// our current choice of ReadSource. Using the same preconditions, change our ReadSource if
// necessary.
- auto [newReadSource, _] = SnapshotHelper::shouldChangeReadSource(opCtx, collection->ns());
- if (newReadSource) {
- opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource);
- }
+ SnapshotHelper::changeReadSourceIfNeeded(opCtx, collection->ns());
return collection.get();
}
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 7a0eda9b506..a6e3d3e04bb 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -288,12 +288,9 @@ AutoGetCollectionForReadBase<AutoGetCollectionType, EmplaceAutoCollFunc>::
// Once we have our locks, check whether or not we should override the ReadSource that was
// set before acquiring locks.
- auto [newReadSource, shouldReadAtLastApplied] =
- SnapshotHelper::shouldChangeReadSource(opCtx, nss);
- if (newReadSource) {
- opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource);
- readSource = *newReadSource;
- }
+ const bool shouldReadAtLastApplied = SnapshotHelper::changeReadSourceIfNeeded(opCtx, nss);
+ // Update readSource in case it was updated.
+ readSource = opCtx->recoveryUnit()->getTimestampReadSource();
const auto readTimestamp = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(opCtx);
if (readTimestamp && afterClusterTime) {
@@ -476,11 +473,7 @@ void AutoGetCollectionForReadLockFree::EmplaceHelper::emplace(
// replication state may have changed, invalidating our current choice of
// ReadSource. Using the same preconditions, change our ReadSource if necessary.
if (coll) {
- auto [newReadSource, _] =
- SnapshotHelper::shouldChangeReadSource(opCtx, coll->ns());
- if (newReadSource) {
- opCtx->recoveryUnit()->setTimestampReadSource(*newReadSource);
- }
+ SnapshotHelper::changeReadSourceIfNeeded(opCtx, coll->ns());
}
return coll;
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
index db0d1e3c207..f4502834964 100644
--- a/src/mongo/db/storage/snapshot_helper.cpp
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -128,61 +128,120 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
return true;
}
+
} // namespace
namespace SnapshotHelper {
-ReadSourceChange shouldChangeReadSource(OperationContext* opCtx, const NamespaceString& nss) {
+bool changeReadSourceIfNeeded(OperationContext* opCtx, const NamespaceString& nss) {
std::string reason;
- const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason);
+ // Write to the reason string if debug logging is enabled. This avoids writing this string every
+ // time we check if we should read at last applied. This string itself is only used in logging
+ // with the same debug level as this check.
+ std::string* reasonWriter =
+ shouldLog(MONGO_LOGV2_DEFAULT_COMPONENT, logv2::LogSeverity::Debug(2)) ? &reason : nullptr;
+ bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, reasonWriter);
if (!canReadAtLastApplied(opCtx)) {
- return {boost::none, readAtLastApplied};
+ return readAtLastApplied;
}
- const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
+ const auto originalReadSource = opCtx->recoveryUnit()->getTimestampReadSource();
if (opCtx->recoveryUnit()->isReadSourcePinned()) {
LOGV2_DEBUG(5863601,
2,
"Not changing readSource as it is pinned",
- "current"_attr = RecoveryUnit::toString(existing),
+ "current"_attr = RecoveryUnit::toString(originalReadSource),
"rejected"_attr = readAtLastApplied
? RecoveryUnit::toString(RecoveryUnit::ReadSource::kLastApplied)
: RecoveryUnit::toString(RecoveryUnit::ReadSource::kNoTimestamp));
- return {boost::none, false};
+ return false;
+ }
+
+ // We may only change to kLastApplied if we were reading without a timestamp (or if kLastApplied
+ // is already set)
+ if (originalReadSource != RecoveryUnit::ReadSource::kNoTimestamp &&
+ originalReadSource != RecoveryUnit::ReadSource::kLastApplied) {
+ return readAtLastApplied;
}
- if (existing == RecoveryUnit::ReadSource::kNoTimestamp) {
- // Shifting from reading without a timestamp to reading with a timestamp can be dangerous
- // because writes will appear to vanish. This case is intended for new reads on secondaries
- // and query yield recovery after state transitions from primary to secondary.
+ // Helper to set read source to the recovery unit and remember our current setting
+ auto currentReadSource = originalReadSource;
+ auto setReadSource = [&](RecoveryUnit::ReadSource readSource) {
+ opCtx->recoveryUnit()->setTimestampReadSource(readSource);
+ currentReadSource = readSource;
+ };
- // If a query recovers from a yield and the node is no longer primary, it must start reading
- // at the lastApplied point because reading without a timestamp is not safe.
- if (readAtLastApplied) {
- LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kLastApplied", logAttrs(nss));
- return {RecoveryUnit::ReadSource::kLastApplied, readAtLastApplied};
- }
- } else if (existing == RecoveryUnit::ReadSource::kLastApplied) {
- // For some reason, we can no longer read at lastApplied.
- // An operation that yields a timestamped snapshot must restore a snapshot with at least as
- // large of a timestamp, or with proper consideration of rollback scenarios, no timestamp.
- // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to
- // reading without one. More writes will become visible.
- if (!readAtLastApplied) {
- LOGV2_DEBUG(4452902,
- 2,
- "Changing ReadSource to kNoTimestamp",
- logAttrs(nss),
- "reason"_attr = reason);
- // This shift to kNoTimestamp assumes that callers will not make future attempts to
- // manipulate their ReadSources after performing reads at an un-timetamped snapshot. The
- // only exception is callers of this function that may need to change from kNoTimestamp
- // to kLastApplied in the event of a catalog conflict or query yield.
- return {RecoveryUnit::ReadSource::kNoTimestamp, readAtLastApplied};
+ // Set read source based on current setting and readAtLastApplied decision.
+ if (originalReadSource == RecoveryUnit::ReadSource::kLastApplied && !readAtLastApplied) {
+ setReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ } else if (readAtLastApplied) {
+ // Shifting from reading without a timestamp to reading with a timestamp can be
+ // dangerous because writes will appear to vanish.
+ //
+ // If a query recovers from a yield and the node is no longer primary, it must start
+ // reading at the lastApplied point because reading without a timestamp is not safe.
+ //
+ // An operation that yields a timestamped snapshot must restore a snapshot with at least
+ // as large of a timestamp, or with proper consideration of rollback scenarios, no
+ // timestamp. Given readers do not survive rollbacks, it's okay to go from reading with
+ // a timestamp to reading without one. More writes will become visible.
+ //
+ // If we already had kLastApplied as our read source then this call will refresh the
+ // timestamp.
+ setReadSource(RecoveryUnit::ReadSource::kLastApplied);
+
+ // We need to make sure the decision if we need to read at last applied is not changing
+ // concurrently with setting the read source with its read timestamp to the recovery unit.
+ //
+ // When the timestamp is being selected we might have transitioned into PRIMARY that is
+ // accepting writes. The lastApplied timestamp can have oplog holes behind it, in PRIMARY
+ // mode, making it unsafe as a read timestamp as concurrent writes could commit at earlier
+ // timestamps.
+ //
+ // This is handled by re-verifying the conditions if we need to read at last applied after
+ // determining the timestamp but before opening the storage snapshot. If the conditions do
+ // not match what we recorded at the beginning of the operation, we set the read source back
+ // to kNoTimestamp and read without a timestamp.
+ //
+ // The above mainly applies for Lock-free reads that is not holding the RSTL which protects
+ // against state changes.
+ reason.clear();
+ if (!shouldReadAtLastApplied(opCtx, nss, reasonWriter)) {
+ // State changed concurrently with setting the read source and we should no longer read
+ // at lastApplied.
+ setReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
+ readAtLastApplied = false;
}
}
- return {boost::none, readAtLastApplied};
+
+ // All done, log if we made a change to the read source
+ if (originalReadSource == RecoveryUnit::ReadSource::kNoTimestamp &&
+ currentReadSource == RecoveryUnit::ReadSource::kLastApplied) {
+ LOGV2_DEBUG(4452901,
+ 2,
+ "Changed ReadSource to kLastApplied",
+ logAttrs(nss),
+ "ts"_attr = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(opCtx));
+ } else if (originalReadSource == RecoveryUnit::ReadSource::kLastApplied &&
+ currentReadSource == RecoveryUnit::ReadSource::kLastApplied) {
+ LOGV2_DEBUG(6730500,
+ 2,
+ "ReadSource kLastApplied updated timestamp",
+ logAttrs(nss),
+ "ts"_attr = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(opCtx));
+ } else if (originalReadSource == RecoveryUnit::ReadSource::kLastApplied &&
+ currentReadSource == RecoveryUnit::ReadSource::kNoTimestamp) {
+ LOGV2_DEBUG(4452902,
+ 2,
+ "Changed ReadSource to kNoTimestamp",
+ logAttrs(nss),
+ "reason"_attr = reason);
+ }
+
+ // Return if we need to read at last applied to the caller in case further checks need to be
+ // performed.
+ return readAtLastApplied;
}
bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h
index cc62cbec889..b2ab6773fc5 100644
--- a/src/mongo/db/storage/snapshot_helper.h
+++ b/src/mongo/db/storage/snapshot_helper.h
@@ -33,20 +33,15 @@
namespace mongo {
namespace SnapshotHelper {
-struct ReadSourceChange {
- boost::optional<RecoveryUnit::ReadSource> newReadSource;
- bool shouldReadAtLastApplied;
-};
/**
- * Returns a ReadSourceChange containing data necessary to decide if we need to change read source.
+ * Changes the read source in the recovery unit if needed depending on server state. If reading at
+ * last applied is needed and we already have that read source set, refresh the lastApplied
+ * timestamp.
*
- * For Lock-Free Reads, the decisions made within this function based on replication state may
- * become invalid after it returns and multiple calls may yield different answers. Higher level code
- * must validate the relevance of the outcome based on replication state before and after calling
- * this function.
+ * Returns true if the state is such that we should read at last applied, false otherwise.
*/
-ReadSourceChange shouldChangeReadSource(OperationContext* opCtx, const NamespaceString& nss);
+bool changeReadSourceIfNeeded(OperationContext* opCtx, const NamespaceString& nss);
bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
boost::optional<Timestamp> readTimestamp);
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 6e3370fecca..e637e906f32 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -515,11 +515,16 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp(
// The read timestamp is set by the user and does not require a transaction to be open.
invariant(!_readAtTimestamp.isNull());
return _readAtTimestamp;
-
+ case ReadSource::kLastApplied:
+ // The lastApplied timestamp is not always available if the system has not accepted
+ // writes, so it is not possible to invariant that it exists.
+ if (_readAtTimestamp.isNull()) {
+ return boost::none;
+ }
+ return _readAtTimestamp;
// The following ReadSources can only establish a read timestamp when a transaction is
// opened.
case ReadSource::kNoOverlap:
- case ReadSource::kLastApplied:
case ReadSource::kAllDurableSnapshot:
case ReadSource::kMajorityCommitted:
break;
@@ -580,7 +585,7 @@ void WiredTigerRecoveryUnit::_txnOpen() {
break;
}
case ReadSource::kLastApplied: {
- _readAtTimestamp = _beginTransactionAtLastAppliedTimestamp(session);
+ _beginTransactionAtLastAppliedTimestamp(session);
break;
}
case ReadSource::kNoOverlap: {
@@ -648,9 +653,8 @@ namespace {
Mutex _lastAppliedTxnMutex = MONGO_MAKE_LATCH("lastAppliedTxnMutex");
} // namespace
-Timestamp WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SESSION* session) {
- auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
- if (!lastApplied) {
+void WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SESSION* session) {
+ if (_readAtTimestamp.isNull()) {
// When there is not a lastApplied timestamp available, read without a timestamp. Do not
// round up the read timestamp to the oldest timestamp.
@@ -664,7 +668,7 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SES
session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
LOGV2_DEBUG(4847500, 2, "no read timestamp available for kLastApplied");
txnOpen.done();
- return Timestamp();
+ return;
}
{
@@ -673,14 +677,14 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SES
_prepareConflictBehavior,
_roundUpPreparedTimestamps,
RoundUpReadTimestamp::kRound);
- auto status = txnOpen.setReadSnapshot(*lastApplied);
+ auto status = txnOpen.setReadSnapshot(_readAtTimestamp);
fassert(4847501, status);
txnOpen.done();
}
// We might have rounded to oldest between calling getLastApplied and setReadSnapshot. We
// need to get the actual read timestamp we used.
- return _getTransactionReadTimestamp(session);
+ _readAtTimestamp = _getTransactionReadTimestamp(session);
}
Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) {
@@ -918,7 +922,16 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource,
invariant(!(provided && provided->isNull()));
_timestampReadSource = readSource;
- _readAtTimestamp = (provided) ? *provided : Timestamp();
+ if (readSource == kLastApplied) {
+ // The lastApplied timestamp is not always available if the system has not accepted writes.
+ if (auto lastApplied = _sessionCache->snapshotManager().getLastApplied()) {
+ _readAtTimestamp = *lastApplied;
+ } else {
+ _readAtTimestamp = Timestamp();
+ }
+ } else {
+ _readAtTimestamp = (provided) ? *provided : Timestamp();
+ }
}
RecoveryUnit::ReadSource WiredTigerRecoveryUnit::getTimestampReadSource() const {
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 4afe11ac614..1826eb56385 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -254,10 +254,11 @@ private:
Timestamp _beginTransactionAtNoOverlapTimestamp(WT_SESSION* session);
/**
- * Starts a transaction at the lastApplied timestamp. Returns the timestamp at which the
- * transaction was started.
+ * Starts a transaction at the lastApplied timestamp stored in '_readAtTimestamp'. Sets
+ * '_readAtTimestamp' to the actual timestamp used by the storage engine in case rounding
+ * occured.
*/
- Timestamp _beginTransactionAtLastAppliedTimestamp(WT_SESSION* session);
+ void _beginTransactionAtLastAppliedTimestamp(WT_SESSION* session);
/**
* Returns the timestamp at which the current transaction is reading.