SERVER-48475 Reimplement lastApplied for secondary reads

This partially reverts work to use the kNoOverlap ReadSource on secondaries since the all_durable calculation is unnecessary and expensive. (cherry picked from commit ff92d4435fa75c2b947c49cffaf48805b320a5ae)
author: Louis Williams <louis.williams@mongodb.com> 2020-06-05 11:41:22 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-06-05 19:16:38 +0000
commit: 4e3f3b96826772baa777b7563b38574c1e11c2e4 (patch)
tree: 35b660384f5b6bcc918f66c7cfca9e1b43302e3d
parent: c1bc1b6d6b7b7d216b8243a609f1c7231045e5be (diff)
download: mongo-4e3f3b96826772baa777b7563b38574c1e11c2e4.tar.gz
6 files changed, 106 insertions, 61 deletions
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 6064e3b9b6c..a1b684d94c0 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -113,7 +113,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
     _autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline);
 
     // If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do
-    // not consider reading at the no-overlap point (e.g. FTDC needs that).
+    // not consider changing our ReadSource (e.g. FTDC needs that).
     if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp)
         return;
 
@@ -129,12 +129,12 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
         // during this period, we default to not acquiring the lock (by setting
         // _shouldNotConflictWithSecondaryBatchApplicationBlock). On primaries, we always read at a
         // consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
-        // guarantee we read at a consistent state, so we must read at the no-overlap timestamp,
-        // which is a function of the lastApplied timestamp, which is set after each complete batch.
+        // guarantee we read at a consistent state, so we must read at the lastApplied timestamp,
+        // which is set after each complete batch.
         //
-        // If an attempt to read at the no-overlap timestamp is unsuccessful because there are
-        // pending catalog changes that occur after the no-overlap timestamp, we release our locks
-        // and try again with the PBWM lock (by unsetting
+        // If an attempt to read at the lastApplied timestamp is unsuccessful because there are
+        // pending catalog changes that occur after that timestamp, we release our locks and try
+        // again with the PBWM lock (by unsetting
         // _shouldNotConflictWithSecondaryBatchApplicationBlock).
 
         const NamespaceString nss = coll->ns();
@@ -165,29 +165,32 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
         }
 
         invariant(
-            // The kMajorityCommitted and kNoOverlap read sources already read from timestamps
+            // The kMajorityCommitted and kLastApplied read sources already read from timestamps
             // that are safe with respect to concurrent secondary batch application, and are
             // eligible for retrying.
             readSource == RecoveryUnit::ReadSource::kMajorityCommitted ||
-            readSource == RecoveryUnit::ReadSource::kNoOverlap);
+            readSource == RecoveryUnit::ReadSource::kNoOverlap ||
+            readSource == RecoveryUnit::ReadSource::kLastApplied);
         invariant(readConcernLevel != repl::ReadConcernLevel::kSnapshotReadConcern);
 
         // Yield locks in order to do the blocking call below.
         _autoColl = boost::none;
 
-        // If there are pending catalog changes when using a no-overlap read source, we choose to
-        // take the PBWM lock to conflict with any in-progress batches. This prevents us from idly
-        // spinning in this loop trying to get a new read timestamp ahead of the minimum visible
-        // snapshot. This helps us guarantee liveness (i.e. we can eventually get a suitable read
-        // timestamp) but should not be necessary for correctness. After initial sync finishes, if
-        // we waited instead of retrying, readers would block indefinitely waiting for the
-        // noOverlap time to move forward. Instead we force the reader take the PBWM lock and retry.
-        if (readSource == RecoveryUnit::ReadSource::kNoOverlap) {
+        // If there are pending catalog changes when using a no-overlap or lastApplied read source,
+        // we choose to take the PBWM lock to conflict with any in-progress batches. This prevents
+        // us from idly spinning in this loop trying to get a new read timestamp ahead of the
+        // minimum visible snapshot. This helps us guarantee liveness (i.e. we can eventually get a
+        // suitable read timestamp) but should not be necessary for correctness. After initial sync
+        // finishes, if we waited instead of retrying, readers would block indefinitely waiting for
+        // their read timestamp to move forward. Instead we force the reader to take the PBWM lock
+        // and retry.
+        if (readSource == RecoveryUnit::ReadSource::kLastApplied ||
+            readSource == RecoveryUnit::ReadSource::kNoOverlap) {
             invariant(readTimestamp);
             LOGV2(20576,
-                  "Tried reading at no-overlap time, but future catalog changes are pending. "
-                  "Trying again without reading at no-overlap time.",
-                  "noOverlapTimestamp"_attr = *readTimestamp,
+                  "Tried reading at a timestamp, but future catalog changes are pending. "
+                  "Trying again without reading at a timestamp",
+                  "readTimestamp"_attr = *readTimestamp,
                   "collection"_attr = nss.ns(),
                   "collectionMinSnapshot"_attr = *minSnapshot);
             // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp
index 460d4acec10..77874604c86 100644
--- a/src/mongo/db/db_raii_test.cpp
+++ b/src/mongo/db/db_raii_test.cpp
@@ -267,11 +267,11 @@ TEST_F(DBRAIITestFixture,
     ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
 
     AutoGetCollectionForRead coll(client2.second.get(), NamespaceString("local.system.js"));
-    // Reading from an unreplicated collection does not change the ReadSource to kNoOverlap.
+    // Reading from an unreplicated collection does not change the ReadSource to kLastApplied.
     ASSERT_EQ(client2.second.get()->recoveryUnit()->getTimestampReadSource(),
               RecoveryUnit::ReadSource::kUnset);
 
-    // Reading from a replicated collection will try to switch to kNoOverlap. Because we are
+    // Reading from a replicated collection will try to switch to kLastApplied. Because we are
     // already reading without a timestamp and we can't reacquire the PBWM lock to continue reading
     // without a timestamp, we uassert in this situation.
     ASSERT_THROWS_CODE(AutoGetCollectionForRead(client2.second.get(), nss),
@@ -280,9 +280,9 @@ TEST_F(DBRAIITestFixture,
 }
 
 TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
-    // This test simulates a situation where AutoGetCollectionForRead cant read at the no-overlap
-    // point (minimum of all_durable and lastApplied) because it is set to a point earlier than the
-    // catalog change. We expect to read without a timestamp and hold the PBWM lock.
+    // This test simulates a situation where AutoGetCollectionForRead can't read at lastApplied
+    // because it is set to a point earlier than the catalog change. We expect to read without a
+    // timestamp and hold the PBWM lock.
     auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
     CollectionOptions defaultCollectionOptions;
     ASSERT_OK(
@@ -302,7 +302,7 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
     snapshotManager->setLastApplied(opTime.getTimestamp());
     AutoGetCollectionForRead coll(client1.second.get(), nss);
 
-    // We can't read from kNoOverlap in this scenario because there is a catalog conflict. Resort
+    // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort
     // to taking the PBWM lock and reading without a timestamp.
     ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
               RecoveryUnit::ReadSource::kUnset);
@@ -311,8 +311,8 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
 }
 
 TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
-    // This test simulates a situation where AutoGetCollectionForRead reads at the no-overlap
-    // point (minimum of all_durable and lastApplied) even though lastApplied is not available.
+    // This test simulates a situation where AutoGetCollectionForRead reads without a timestamp
+    // because lastApplied is not available.
     auto replCoord = repl::ReplicationCoordinator::get(client1.second.get());
     CollectionOptions defaultCollectionOptions;
     ASSERT_OK(
@@ -320,23 +320,21 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
     ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY));
 
     // Note that when the collection was created, above, the system chooses a minimum snapshot time
-    // for the collection. Since last-applied isn't available, we default to all_durable, which is
-    // available, and is greater than the collection minimum snapshot.
+    // for the collection. Since last-applied isn't available, we default to reading without a
+    // timestamp.
     auto snapshotManager =
         client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
     ASSERT_FALSE(snapshotManager->getLastApplied());
     AutoGetCollectionForRead coll(client1.second.get(), nss);
 
-    // Even though lastApplied isn't available, the ReadSource is set to kNoOverlap, which reads
-    // at the all_durable time.
     ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
-              RecoveryUnit::ReadSource::kNoOverlap);
-    ASSERT_TRUE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp());
+              RecoveryUnit::ReadSource::kLastApplied);
+    ASSERT_FALSE(client1.second.get()->recoveryUnit()->getPointInTimeReadTimestamp());
     ASSERT_FALSE(client1.second.get()->lockState()->isLockHeldForMode(
         resourceIdParallelBatchWriterMode, MODE_IS));
 }
 
-TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) {
+TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) {
     auto opCtx = client1.second.get();
 
     // Use a tailable query on a capped collection because we can anticipate it automatically
@@ -358,9 +356,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesNoOverlapOnSecondary) {
     auto state = exec->getNext(&unused, nullptr);
     ASSERT_EQ(state, PlanExecutor::ExecState::IS_EOF);
 
-    // After restoring, the collection scan should now be reading with kNoOverlap, the default on
+    // After restoring, the collection scan should now be reading with kLastApplied, the default on
     // secondaries.
-    ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap,
+    ASSERT_EQ(RecoveryUnit::ReadSource::kLastApplied,
               opCtx->recoveryUnit()->getTimestampReadSource());
     ASSERT_EQUALS(PlanExecutor::IS_EOF, exec->getNext(&unused, nullptr));
 }
@@ -379,7 +377,7 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp)
     auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection());
 
     // The collection scan should use the default ReadSource on a secondary.
-    ASSERT_EQ(RecoveryUnit::ReadSource::kNoOverlap,
+    ASSERT_EQ(RecoveryUnit::ReadSource::kLastApplied,
               opCtx->recoveryUnit()->getTimestampReadSource());
 
     // When the tailable query recovers from its yield, it should discover that the node is primary
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index a2767c661c7..0fd44b18b42 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -398,6 +398,10 @@ public:
          */
         kNoOverlap,
         /**
+         * Read from the lastApplied timestamp.
+         */
+        kLastApplied,
+        /**
          * Read from the all_durable timestamp. New transactions will always read from the same
          * timestamp and never advance.
          */
@@ -418,6 +422,8 @@ public:
                 return "kMajorityCommitted";
             case ReadSource::kNoOverlap:
                 return "kNoOverlap";
+            case ReadSource::kLastApplied:
+                return "kLastApplied";
             case ReadSource::kAllDurableSnapshot:
                 return "kAllDurableSnapshot";
             case ReadSource::kProvided:
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
index 51d7f0327c1..57a1c3b99e6 100644
--- a/src/mongo/db/storage/snapshot_helper.cpp
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -52,19 +52,19 @@ bool canSwitchReadSource(OperationContext* opCtx) {
     return false;
 }
 
-bool shouldReadAtNoOverlap(OperationContext* opCtx,
-                           const NamespaceString& nss,
-                           std::string* reason) {
+bool shouldReadAtLastApplied(OperationContext* opCtx,
+                             const NamespaceString& nss,
+                             std::string* reason) {
 
-    // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot read
-    // at no-overlap. It's important to note that it is possible for this to be false, but still be
+    // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change
+    // its ReadSource. It's important to note that it is possible for this to be false, but still be
     // holding the PBWM lock, explained below.
     if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) {
         *reason = "conflicts with batch application";
         return false;
     }
 
-    // If we are already holding the PBWM lock, do not read at no-overlap. Snapshots acquired by an
+    // If we are already holding the PBWM lock, do not change ReadSource. Snapshots acquired by an
     // operation after a yield/restore must see all writes in the pre-yield snapshot. Once a
     // snapshot is reading without a timestamp, we choose to continue acquiring snapshots without a
     // timestamp. This is done in lieu of determining a timestamp far enough in the future that's
@@ -72,19 +72,19 @@ bool shouldReadAtNoOverlap(OperationContext* opCtx,
     // held concurrently, which is often the case when DBDirectClient is used.
     if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
         *reason = "PBWM lock is held";
-        LOGV2_DEBUG(20577, 1, "not reading at no-overlap because the PBWM lock is held");
+        LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held");
         return false;
     }
 
     // If we are in a replication state (like secondary or primary catch-up) where we are not
-    // accepting writes, we should read at no-overlap. If this node can accept writes, then no
+    // accepting writes, we should read at lastApplied. If this node can accept writes, then no
     // conflicting replication batches are being applied and we can read from the default snapshot.
     if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
         *reason = "primary";
         return false;
     }
 
-    // Non-replicated collections do not need to read at no-overlap, as those collections are not
+    // Non-replicated collections do not need to read at lastApplied, as those collections are not
     // written by the replication system.  However, the oplog is special, as it *is* written by the
     // replication system.
     if (!nss.isReplicated() && !nss.isOplog()) {
@@ -103,39 +103,33 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC
 
     const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
     std::string reason;
-    const bool readAtNoOverlap = shouldReadAtNoOverlap(opCtx, nss, &reason);
+    const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason);
     if (existing == RecoveryUnit::ReadSource::kUnset) {
         // Shifting from reading without a timestamp to reading with a timestamp can be dangerous
         // because writes will appear to vanish. This case is intended for new reads on secondaries
         // and query yield recovery after state transitions from primary to secondary.
 
         // If a query recovers from a yield and the node is no longer primary, it must start reading
-        // at the no-overlap point because reading without a timestamp is not safe. The no-overlap
-        // point (the minimum of WT all_durable and lastApplied) is safe to use because step-down
-        // closes all oplog holes, which hold back all_durable. The all_durable timestamp will then
-        // be at least equal to the timestamp of the last write before the state transition, meaning
-        // a reader performing an untimestamped read before the state transition will see all of the
-        // same writes after the state transition.
-        if (readAtNoOverlap) {
-            LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kNoOverlap", "collection"_attr = nss);
-            return RecoveryUnit::ReadSource::kNoOverlap;
+        // at the lastApplied point because reading without a timestamp is not safe.
+        if (readAtLastApplied) {
+            LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kLastApplied", "collection"_attr = nss);
+            return RecoveryUnit::ReadSource::kLastApplied;
         }
-    } else if (existing == RecoveryUnit::ReadSource::kNoOverlap) {
-        // For some reason, we can no longer read at kNoOverlap.
+    } else if (existing == RecoveryUnit::ReadSource::kLastApplied) {
+        // For some reason, we can no longer read at lastApplied.
         // An operation that yields a timestamped snapshot must restore a snapshot with at least as
         // large of a timestamp, or with proper consideration of rollback scenarios, no timestamp.
         // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to
         // reading without one. More writes will become visible.
-        if (!readAtNoOverlap) {
+        if (!readAtLastApplied) {
             LOGV2_DEBUG(4452902,
                         2,
                         "Changing ReadSource to kUnset",
                         "collection"_attr = nss,
                         "reason"_attr = reason);
-            // This shift to kUnset assumes that callers will not make future attempts to manipulate
             // their ReadSources after performing reads at an un-timetamped snapshot. The only
             // exception is callers of this function that may need to change from kUnset to
-            // kNoOverlap in the event of a catalog conflict or query yield.
+            // kLastApplied in the event of a catalog conflict or query yield.
             return RecoveryUnit::ReadSource::kUnset;
         }
     }
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 1577c27111b..a54fd288176 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -454,6 +454,7 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
         // The following ReadSources can only establish a read timestamp when a transaction is
         // opened.
         case ReadSource::kNoOverlap:
+        case ReadSource::kLastApplied:
         case ReadSource::kAllDurableSnapshot:
             break;
     }
@@ -462,6 +463,7 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
     getSession();
 
     switch (_timestampReadSource) {
+        case ReadSource::kLastApplied:
         case ReadSource::kNoOverlap:
             // The lastApplied and allDurable timestamps are not always available if the system has
             // not accepted writes, so it is not possible to invariant that it exists as other
@@ -515,6 +517,10 @@ void WiredTigerRecoveryUnit::_txnOpen() {
                     session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
             break;
         }
+        case ReadSource::kLastApplied: {
+            _readAtTimestamp = _beginTransactionAtLastAppliedTimestamp(session);
+            break;
+        }
         case ReadSource::kNoOverlap: {
             _readAtTimestamp = _beginTransactionAtNoOverlapTimestamp(session);
             break;
@@ -566,6 +572,38 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESS
     return readTimestamp;
 }
 
+Timestamp WiredTigerRecoveryUnit::_beginTransactionAtLastAppliedTimestamp(WT_SESSION* session) {
+    auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
+    if (!lastApplied) {
+        // When there is not a lastApplied timestamp available, read without a timestamp. Do not
+        // round up the read timestamp to the oldest timestamp.
+
+        // There is a race that allows new transactions to start between the time we check for a
+        // read timestamp and start our transaction, which can temporarily violate the contract of
+        // kLastApplied. That is, writes will be visible that occur after the lastApplied time. This
+        // is only possible for readers that start immediately after an initial sync that did not
+        // replicate any oplog entries. Future transactions will start reading at a timestamp once
+        // timestamped writes have been made.
+        WiredTigerBeginTxnBlock txnOpen(
+            session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
+        LOGV2_DEBUG(4847500, 2, "no read timestamp available for kLastApplied");
+        txnOpen.done();
+        return Timestamp();
+    }
+
+    WiredTigerBeginTxnBlock txnOpen(session,
+                                    _prepareConflictBehavior,
+                                    _roundUpPreparedTimestamps,
+                                    RoundUpReadTimestamp::kRound);
+    auto status = txnOpen.setReadSnapshot(*lastApplied);
+    fassert(4847501, status);
+    txnOpen.done();
+
+    // We might have rounded to oldest between calling getLastApplied and setReadSnapshot. We
+    // need to get the actual read timestamp we used.
+    return _getTransactionReadTimestamp(session);
+}
+
 Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) {
 
     auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 19c67fcfe62..c61d80205ac 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -228,6 +228,12 @@ private:
     Timestamp _beginTransactionAtNoOverlapTimestamp(WT_SESSION* session);
 
     /**
+     * Starts a transaction at the lastApplied timestamp. Returns the timestamp at which the
+     * transaction was started.
+     */
+    Timestamp _beginTransactionAtLastAppliedTimestamp(WT_SESSION* session);
+
+    /**
      * Returns the timestamp at which the current transaction is reading.
      */
     Timestamp _getTransactionReadTimestamp(WT_SESSION* session);
author	Louis Williams <louis.williams@mongodb.com>	2020-06-05 11:41:22 -0400
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-06-05 19:16:38 +0000
commit	4e3f3b96826772baa777b7563b38574c1e11c2e4 (patch)
tree	35b660384f5b6bcc918f66c7cfca9e1b43302e3d
parent	c1bc1b6d6b7b7d216b8243a609f1c7231045e5be (diff)
download	mongo-4e3f3b96826772baa777b7563b38574c1e11c2e4.tar.gz