summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Louis Williams <louis.williams@mongodb.com> 2020-06-05 11:41:22 -0400
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-06-05 19:16:38 +0000
commit 4e3f3b96826772baa777b7563b38574c1e11c2e4 (patch)
tree 35b660384f5b6bcc918f66c7cfca9e1b43302e3d
parent c1bc1b6d6b7b7d216b8243a609f1c7231045e5be (diff)
download mongo-4e3f3b96826772baa777b7563b38574c1e11c2e4.tar.gz
SERVER-48475 Reimplement lastApplied for secondary reads
This partially reverts work to use the kNoOverlap ReadSource on secondaries since the all_durable calculation is unnecessary and expensive. (cherry picked from commit ff92d4435fa75c2b947c49cffaf48805b320a5ae)
-rw-r--r-- src/mongo/db/db_raii.cpp 41
-rw-r--r-- src/mongo/db/db_raii_test.cpp 34
-rw-r--r-- src/mongo/db/storage/recovery_unit.h 6
-rw-r--r-- src/mongo/db/storage/snapshot_helper.cpp 42
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp 38
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h 6
6 files changed, 106 insertions, 61 deletions
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 6064e3b9b6c..a1b684d94c0 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -113,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
_autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline);
// If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do
- // not consider reading at the no-overlap point (e.g. FTDC needs that).
+ // not consider changing our ReadSource (e.g. FTDC needs that).
if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp)
return;
@@ -129,12 +129,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
// during this period, we default to not acquiring the lock (by setting
// _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a
// consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
- // guarantee we read at a consistent state, so we must read at the no-overlap timestamp,
- // which is a function of the lastApplied timestamp, which is set after each complete batch.
+ // guarantee we read at a consistent state, so we must read at the lastApplied timestamp,
+ // which is set after each complete batch.
//
- // If an attempt to read at the no-overlap timestamp is unsuccessful because there are
- // pending catalog changes that occur after the no-overlap timestamp, we release our locks
- // and try again with the PBWM lock (by unsetting
+ // If an attempt to read at the lastApplied timestamp is unsuccessful because there are
+ // pending catalog changes that occur after that timestamp, we release our locks and try
+ // again with the PBWM lock (by unsetting
// _shouldNotConflictWithSecondaryBatchApplicationBlock).
const NamespaceString nss = coll->ns();
@@ -165,29 +165,32 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
}
invariant(
- // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
+ // The kMajorityCommitted and kLastApplied read sources already read from timestamps
// that are safe with respect to concurrent secondary batch application, and are
// eligible for retrying.
readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
- readSource == RecoveryUnit::ReadSource::kNoOverlap);
+ readSource == RecoveryUnit::ReadSource::kNoOverlap ||
+ readSource == RecoveryUnit::ReadSource::kLastApplied);
invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern);
// Yield locks in order to do the blocking call below.
_autoColl = boost::none;
- // If there are pending catalog changes when using a no-overlap read source, we choose to
- // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
- // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
- // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
- // timestamp) but should not be necessary for correctness. After initial sync finishes, if
- // we waited instead of retrying, readers would block indefinitely waiting for the
- // noOverlap time to move forward. Instead we force the reader take the PBWM lock and retry.
- if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
+ // If there are pending catalog changes when using a no-overlap or lastApplied read source,
+ // we choose to take the PBWM lock to conflict with any in-progress batches. This prevents
+ // us from idly spinning in this loop trying to get a new read timestamp ahead of the
+ // minimum visible snapshot. This helps us guarantee liveness (i.e. we can eventually get a
+ // suitable read timestamp) but should not be necessary for correctness. After initial sync
+ // finishes, if we waited instead of retrying, readers would block indefinitely waiting for
+ // their read timestamp to move forward. Instead we force the reader to take the PBWM lock
+ // and retry.
+ if (readSource == RecoveryUnit::ReadSource::kLastApplied ||
+ readSource == RecoveryUnit::ReadSource::kNoOverlap) {
invariant(readTimestamp);
LOGV2(20576,
- "Tried reading at no-overlap time, but future catalog changes are pending. "
- "Trying again without reading at no-overlap time.",
- "noOverlapTimestamp"_attr = *readTimestamp,
+ "Tried reading at a timestamp, but future catalog changes are pending. "
+ "Trying again without reading at a timestamp",
+ "readTimestamp"_attr = *readTimestamp,
"collection"_attr = nss.ns(),
"collectionMinSnapshot"_attr = *minSnapshot);
// Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp
index 460d4acec10..77874604c86 100644
--- a/src/mongo/db/db_raii_test.cpp
+++ b/src/mongo/db/db_raii_test.cpp
@@ -267,11 +267,11 @@ TEST_F(DBRAIITestFixture,
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
AutoGetCollectionForRead coll(client2.second.get(), NamespaceString("local.system.js"));
- // Reading from an unreplicated collection does not change the ReadSource to kNoOverlap.
+ // Reading from an unreplicated collection does not change the ReadSource to kLastApplied.
ASSERT_EQ(client2.second.get()->recoveryUnit()->getTimestampReadSource(),
RecoveryUnit::ReadSource::kUnset);
- // Reading from a replicated collection will try to switch to kNoOverlap. Because we are
+ // Reading from a replicated collection will try to switch to kLastApplied. Because we are
// already reading without a timestamp and we can't reacquire the PBWM lock to continue reading
// without a timestamp, we uassert in this situation.
ASSERT_THROWS_CODE(AutoGetCollectionForRead(client2.second.get(), nss),
@@ -280,9 +280,9 @@ TEST_F(DBRAIITestFixture,
}
TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
- // This test simulates a situation where AutoGetCollectionForRead cant read at the no-overlap
- // point (minimum of all_durable and lastApplied) because it is set to a point earlier than the
- // catalog change. We expect to read without a timestamp and hold the PBWM lock.
+ // This test simulates a situation where AutoGetCollectionForRead can't read at lastApplied
+ // because it is set to a point earlier than the catalog change. We expect to read without a
+ // timestamp and hold the PBWM lock.
auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
CollectionOptions defaultCollectionOptions;
ASSERT_OK(
@@ -302,7 +302,7 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
snapshotManager->setLastApplied(opTime.getTimestamp());
AutoGetCollectionForRead coll(client1.second.get(), nss);
- // We can't read from kNoOverlap in this scenario because there is a catalog conflict. Resort
+ // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort
// to taking the PBWM lock and reading without a timestamp.
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
RecoveryUnit::ReadSource::kUnset);
@@ -311,8 +311,8 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
}
TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
- // This test simulates a situation where AutoGetCollectionForRead reads at the no-overlap
- // point (minimum of all_durable and lastApplied) even though lastApplied is not available.
+ // This test simulates a situation where AutoGetCollectionForRead reads without a timestamp
+ // because lastApplied is not available.
auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
CollectionOptions defaultCollectionOptions;
ASSERT_OK(
@@ -320,23 +320,21 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY));
// Note that when the collection was created, above, the system chooses a minimum snapshot time
- // for the collection. Since last-applied isn't available, we default to all_durable, which is
- // available, and is greater than the collection minimum snapshot.
+ // for the collection. Since last-applied isn't available, we default to read without a
+ // timestamp.
auto snapshotManager =
client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
ASSERT_FALSE(snapshotManager->getLastApplied());
AutoGetCollectionForRead coll(client1.second.get(), nss);
- // Even though lastApplied isn't available, the ReadSource is set to kNoOverlap, which reads
- // at the all_durable time.
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
- RecoveryUnit::ReadSource::kNoOverlap);
- ASSERT_TRUE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp());
+ RecoveryUnit::ReadSource::kLastApplied);
+ ASSERT_FALSE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp());
ASSERT_FALSE(client1.second.get()->lockState()->isLockHeldForMode(
resourceIdParallelBatchWriterMode, MODE_IS));
}
-TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) {
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) {
auto opCtx = client1.second.get();
// Use a tailable query on a capped collection because we can anticipate it automatically
@@ -358,9 +356,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) {
auto state = exec->getNext(&unused, nullptr);
ASSERT_EQ(state, PlanExecutor::ExecState::IS_EOF);
- // After restoring, the collection scan should now be reading with kNoOverlap, the default on
+ // After restoring, the collection scan should now be reading with kLastApplied, the default on
// secondaries.
- ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap,
+ ASSERT_EQ(RecoveryUnit::ReadSource::kLastApplied,
opCtx->recoveryUnit()->getTimestampReadSource());
ASSERT_EQUALS(PlanExecutor::IS_EOF, exec->getNext(&unused, nullptr));
}
@@ -379,7 +377,7 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp)
auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection());
// The collection scan should use the default ReadSource on a secondary.
- ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap,
+ ASSERT_EQ(RecoveryUnit::ReadSource::kLastApplied,
opCtx->recoveryUnit()->getTimestampReadSource());
// When the tailable query recovers from its yield, it should discover that the node is primary
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index a2767c661c7..0fd44b18b42 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -398,6 +398,10 @@ public:
*/
kNoOverlap,
/**
+ * Read from the lastApplied timestamp.
+ */
+ kLastApplied,
+ /**
* Read from the all_durable timestamp. New transactions will always read from the same
* timestamp and never advance.
*/
@@ -418,6 +422,8 @@ public:
return "kMajorityCommitted";
case ReadSource::kNoOverlap:
return "kNoOverlap";
+ case ReadSource::kLastApplied:
+ return "kLastApplied";
case ReadSource::kAllDurableSnapshot:
return "kAllDurableSnapshot";
case ReadSource::kProvided:
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
index 51d7f0327c1..57a1c3b99e6 100644
--- a/src/mongo/db/storage/snapshot_helper.cpp
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -52,19 +52,19 @@ bool canSwitchReadSource(OperationContext* opCtx) {
return false;
}
-bool shouldReadAtNoOverlap(OperationContext* opCtx,
- const NamespaceString& nss,
- std::string* reason) {
+bool shouldReadAtLastApplied(OperationContext* opCtx,
+ const NamespaceString& nss,
+ std::string* reason) {
- // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot read
- // at no-overlap. It's important to note that it is possible for this to be false, but still be
+ // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change
+ // its ReadSource. It's important to note that it is possible for this to be false, but still be
// holding the PBWM lock, explained below.
if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) {
*reason = "conflicts with batch application";
return false;
}
- // If we are already holding the PBWM lock, do not read at no-overlap. Snapshots acquired by an
+ // If we are already holding the PBWM lock, do not change ReadSource. Snapshots acquired by an
// operation after a yield/restore must see all writes in the pre-yield snapshot. Once a
// snapshot is reading without a timestamp, we choose to continue acquiring snapshots without a
// timestamp. This is done in lieu of determining a timestamp far enough in the future that's
@@ -72,19 +72,19 @@ bool shouldReadAtNoOverlap(OperationContext* opCtx,
// held concurrently, which is often the case when DBDirectClient is used.
if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
*reason = "PBWM lock is held";
- LOGV2_DEBUG(20577, 1, "not reading at no-overlap because the PBWM lock is held");
+ LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held");
return false;
}
// If we are in a replication state (like secondary or primary catch-up) where we are not
- // accepting writes, we should read at no-overlap. If this node can accept writes, then no
+ // accepting writes, we should read at lastApplied. If this node can accept writes, then no
// conflicting replication batches are being applied and we can read from the default snapshot.
if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
*reason = "primary";
return false;
}
- // Non-replicated collections do not need to read at no-overlap, as those collections are not
+ // Non-replicated collections do not need to read at lastApplied, as those collections are not
// written by the replication system. However, the oplog is special, as it *is* written by the
// replication system.
if (!nss.isReplicated() && !nss.isOplog()) {
@@ -103,39 +103,33 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC
const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
std::string reason;
- const bool readAtNoOverlap = shouldReadAtNoOverlap(opCtx, nss, &reason);
+ const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason);
if (existing == RecoveryUnit::ReadSource::kUnset) {
// Shifting from reading without a timestamp to reading with a timestamp can be dangerous
// because writes will appear to vanish. This case is intended for new reads on secondaries
// and query yield recovery after state transitions from primary to secondary.
// If a query recovers from a yield and the node is no longer primary, it must start reading
- // at the no-overlap point because reading without a timestamp is not safe. The no-overlap
- // point (the minimum of WT all_durable and lastApplied) is safe to use because step-down
- // closes all oplog holes, which hold back all_durable. The all_durable timestamp will then
- // be at least equal to the timestamp of the last write before the state transition, meaning
- // a reader performing an untimestamped read before the state transition will see all of the
- // same writes after the state transition.
- if (readAtNoOverlap) {
- LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kNoOverlap", "collection"_attr = nss);
- return RecoveryUnit::ReadSource::kNoOverlap;
+ // at the lastApplied point because reading without a timestamp is not safe.
+ if (readAtLastApplied) {
+ LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kLastApplied", "collection"_attr = nss);
+ return RecoveryUnit::ReadSource::kLastApplied;
}
- } else if (existing == RecoveryUnit::ReadSource::kNoOverlap) {
- // For some reason, we can no longer read at kNoOverlap.
+ } else if (existing == RecoveryUnit::ReadSource::kLastApplied) {
+ // For some reason, we can no longer read at lastApplied.
// An operation that yields a timestamped snapshot must restore a snapshot with at least as
// large of a timestamp, or with proper consideration of rollback scenarios, no timestamp.
// Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to
// reading without one. More writes will become visible.
- if (!readAtNoOverlap) {
+ if (!readAtLastApplied) {
LOGV2_DEBUG(4452902,
2,
"Changing ReadSource to kUnset",
"collection"_attr = nss,
"reason"_attr = reason);
- // This shift to kUnset assumes that callers will not make future attempts to manipulate
// their ReadSources after performing reads at an un-timetamped snapshot. The only
// exception is callers of this function that may need to change from kUnset to
- // kNoOverlap in the event of a catalog conflict or query yield.
+ // kLastApplied in the event of a catalog conflict or query yield.
return RecoveryUnit::ReadSource::kUnset;
}
}
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 1577c27111b..a54fd288176 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -454,6 +454,7 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
// The following ReadSources can only establish a read timestamp when a transaction is
// opened.
case ReadSource::kNoOverlap:
+ case ReadSource::kLastApplied:
case ReadSource::kAllDurableSnapshot:
break;
}
@@ -462,6 +463,7 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
getSession();
switch (_timestampReadSource) {
+ case ReadSource::kLastApplied:
case ReadSource::kNoOverlap:
// The lastApplied and allDurable timestamps are not always available if the system has
// not accepted writes, so it is not possible to invariant that it exists as other
@@ -515,6 +517,10 @@ void WiredTigerRecoveryUnit::_txnOpen() {
session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
break;
}
+ case ReadSource::kLastApplied: {
+ _readAtTimestamp = _beginTransactionAtLastAppliedTimestamp(session);
+ break;
+ }
case ReadSource::kNoOverlap: {
_readAtTimestamp = _beginTransactionAtNoOverlapTimestamp(session);
break;
@@ -566,6 +572,38 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESS
return readTimestamp;
}
+Timestamp WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SESSION* session) {
+ auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
+ if (!lastApplied) {
+ // When there is not a lastApplied timestamp available, read without a timestamp. Do not
+ // round up the read timestamp to the oldest timestamp.
+
+ // There is a race that allows new transactions to start between the time we check for a
+ // read timestamp and start our transaction, which can temporarily violate the contract of
+ // kLastApplied. That is, writes will be visible that occur after the lastApplied time. This
+ // is only possible for readers that start immediately after an initial sync that did not
+ // replicate any oplog entries. Future transactions will start reading at a timestamp once
+ // timestamped writes have been made.
+ WiredTigerBeginTxnBlock txnOpen(
+ session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
+ LOGV2_DEBUG(4847500, 2, "no read timestamp available for kLastApplied");
+ txnOpen.done();
+ return Timestamp();
+ }
+
+ WiredTigerBeginTxnBlock txnOpen(session,
+ _prepareConflictBehavior,
+ _roundUpPreparedTimestamps,
+ RoundUpReadTimestamp::kRound);
+ auto status = txnOpen.setReadSnapshot(*lastApplied);
+ fassert(4847501, status);
+ txnOpen.done();
+
+ // We might have rounded to oldest between calling getLastApplied and setReadSnapshot. We
+ // need to get the actual read timestamp we used.
+ return _getTransactionReadTimestamp(session);
+}
+
Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) {
auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 19c67fcfe62..c61d80205ac 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -228,6 +228,12 @@ private:
Timestamp _beginTransactionAtNoOverlapTimestamp(WT_SESSION* session);
/**
+ * Starts a transaction at the lastApplied timestamp. Returns the timestamp at which the
+ * transaction was started.
+ */
+ Timestamp _beginTransactionAtLastAppliedTimestamp(WT_SESSION* session);
+
+ /**
* Returns the timestamp at which the current transaction is reading.
*/
Timestamp _getTransactionReadTimestamp(WT_SESSION* session);