SERVER-47866 Secondary readers do not need to reacquire PBWM lock if there are catalog conflicts

author: Louis Williams <louis.williams@mongodb.com> 2020-09-22 16:03:46 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-09-23 16:49:17 +0000
commit: 669e5650151733738ce8270e5bdf3c5759665316 (patch)
tree: 40e8d2b26c236fd3effe64d5fd04b7a05162f1ec
parent: 0e044981649f6712b3cc3f39bdb41c6e06546815 (diff)
download: mongo-669e5650151733738ce8270e5bdf3c5759665316.tar.gz
21 files changed, 202 insertions, 162 deletions
diff --git a/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js b/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js
index 07d1b578109..c6bf1e808bd 100644
--- a/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js
+++ b/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js
@@ -2,8 +2,7 @@
 
 /**
  * Perform point-in-time snapshot reads that span a 'find' and multiple 'getmore's concurrently with
- * CRUD operations. Index operations running concurrently with the snapshot read may cause
- * the read to fail with a SnapshotUnavailable error.
+ * CRUD operations.
  *
  * @tags: [creates_background_indexes, requires_fcv_47, requires_replication,
  * does_not_support_causal_consistency, requires_majority_read_concern]
@@ -17,7 +16,6 @@ var $config = (function() {
 
         snapshotScan: function snapshotScan(db, collName) {
             const readErrorCodes = [
-                ErrorCodes.SnapshotUnavailable,
                 ErrorCodes.ShutdownInProgress,
                 ErrorCodes.CursorNotFound,
                 ErrorCodes.QueryPlanKilled,
diff --git a/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js b/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js
index 87a0c2f0515..5b89d01d29f 100644
--- a/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js
+++ b/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js
@@ -5,8 +5,7 @@
  * snapshot reads and CRUD operations will all contend for locks on db and collName. Since the
  * snapshot read does not release its locks until the transaction is committed, it is expected that
  * once the read has begun, catalog operations with conflicting locks will block until the read is
- * finished. Additionally, index operations running concurrently with the snapshot read may cause
- * the read to fail with a SnapshotUnavailable error.
+ * finished.
  *
  * @tags: [creates_background_indexes, uses_transactions]
  */
@@ -32,7 +31,6 @@ var $config = (function() {
             const sortByAscending = sortOptions[Random.randInt(2)];
             const readErrorCodes = [
                 ErrorCodes.NoSuchTransaction,
-                ErrorCodes.SnapshotUnavailable,
                 ErrorCodes.SnapshotTooOld,
                 ErrorCodes.StaleChunkHistory,
                 ErrorCodes.LockTimeout,
diff --git a/jstests/libs/global_snapshot_reads_util.js b/jstests/libs/global_snapshot_reads_util.js
index 10f8f9ff74d..658141ebdbd 100644
--- a/jstests/libs/global_snapshot_reads_util.js
+++ b/jstests/libs/global_snapshot_reads_util.js
@@ -138,9 +138,17 @@ function SnapshotReadsTest({primaryDB, secondaryDB, awaitCommittedFn}) {
                     // This update is not visible to reads at insertTimestamp.
                     res = assert.commandWorked(primaryDB.runCommand(
                         {update: collName, updates: [{q: {}, u: {$set: {x: true}}, multi: true}]}));
-
                     jsTestLog(`Updated collection "${collName}" at timestamp ${
                         tojson(res.operationTime)}`);
+
+                    awaitCommittedFn(db, res.operationTime);
+
+                    // This index is not visible to reads at insertTimestamp and does not cause the
+                    // operation to fail.
+                    res = assert.commandWorked(primaryDB.runCommand(
+                        {createIndexes: collName, indexes: [{key: {x: 1}, name: 'x_1'}]}));
+                    jsTestLog(`Created an index on collection "${collName}" at timestamp ${
+                        tojson(res.operationTime)}`);
                     awaitCommittedFn(db, res.operationTime);
 
                     // Retrieve the rest of the read command's result set.
diff --git a/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js b/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js
index c47922aeb68..6b207866ed7 100644
--- a/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js
+++ b/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js
@@ -100,6 +100,7 @@ assert.commandWorked(newSession.abortTransaction_forTesting());
 
 IndexBuildTest.waitForIndexBuildToStop(newPrimary.getDB(dbName), collName, indexName);
 IndexBuildTest.waitForIndexBuildToStop(primary.getDB(dbName), collName, indexName);
+rst.awaitReplication();
 
 IndexBuildTest.assertIndexes(newPrimary.getDB(dbName).getCollection(collName), 1, ["_id_"], []);
 IndexBuildTest.assertIndexes(primaryColl, 1, ["_id_"], []);
diff --git a/jstests/noPassthrough/out_majority_read_replset.js b/jstests/noPassthrough/out_majority_read_replset.js
index 0ffba741b3e..4ca9bd83e0a 100644
--- a/jstests/noPassthrough/out_majority_read_replset.js
+++ b/jstests/noPassthrough/out_majority_read_replset.js
@@ -28,12 +28,22 @@ rst.awaitLastOpCommitted();
 
 stopReplicationOnSecondaries(rst);
 
-// Create the index that is not majority commited
-// This test create indexes with majority of nodes not available for replication. So, disabling
-// index build commit quorum.
+// Rename the collection temporarily and then back to its original name. This advances the minimum
+// visible snapshot and forces the $out to block until its snapshot advances.
+const tempColl = db.getName() + '.temp';
+assert.commandWorked(db.adminCommand({
+    renameCollection: sourceColl.getFullName(),
+    to: tempColl,
+}));
+assert.commandWorked(db.adminCommand({
+    renameCollection: tempColl,
+    to: sourceColl.getFullName(),
+}));
+
+// Create the index that is not majority committed
 assert.commandWorked(sourceColl.createIndex({state: 1}, {name: "secondIndex"}, 0));
 
-// Run the $out in the parallel shell as it will block in the metadata until the shapshot is
+// Run the $out in the parallel shell as it will block in the metadata until the snapshot is
 // advanced.
 const awaitShell = startParallelShell(`{
         const testDB = db.getSiblingDB("${name}");
@@ -58,7 +68,7 @@ assert.soon(function() {
     return assert.commandWorked(db.currentOp(filter)).inprog.length === 1;
 });
 
-// Restart data replicaiton and wait until the new write becomes visible.
+// Restart data replication and wait until the new write becomes visible.
 restartReplicationOnSecondaries(rst);
 rst.awaitLastOpCommitted();
 
diff --git a/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js b/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js
index b6cc5d4b21d..fb296a9d29e 100644
--- a/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js
+++ b/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js
@@ -42,13 +42,19 @@ function testCommand(cmd, curOpFilter) {
 
     waitForCurOpByFailPointNoNS(testDB, "hangAfterPreallocateSnapshot", curOpFilter);
 
-    // Create an index on the collection the command was executed against. This will move the
-    // collection's minimum visible timestamp to a point later than the point-in-time referenced
-    // by the transaction snapshot.
-    assert.commandWorked(testDB.runCommand({
-        createIndexes: kCollName,
-        indexes: [{key: {x: 1}, name: "x_1"}],
-        writeConcern: {w: "majority"}
+    // Rename the collection the command was executed against and then back to its original name.
+    // This will move the collection's minimum visible timestamp to a point later than the
+    // point-in-time referenced by the transaction snapshot.
+    const tempColl = testDB.getName() + '.temp';
+    assert.commandWorked(testDB.adminCommand({
+        renameCollection: testDB.getName() + '.' + kCollName,
+        to: tempColl,
+        writeConcern: {w: "majority"},
+    }));
+    assert.commandWorked(testDB.adminCommand({
+        renameCollection: tempColl,
+        to: testDB.getName() + '.' + kCollName,
+        writeConcern: {w: "majority"},
     }));
 
     // Disable the hang and check for parallel shell success. Success indicates that the command
diff --git a/jstests/noPassthrough/read_majority.js b/jstests/noPassthrough/read_majority.js
index 12a2c991641..227e04e7dc4 100644
--- a/jstests/noPassthrough/read_majority.js
+++ b/jstests/noPassthrough/read_majority.js
@@ -53,6 +53,13 @@ function testReadConcernLevel(level) {
         assert.eq(res.code, ErrorCodes.MaxTimeMSExpired);
     }
 
+    function assertNoSnapshotAvailableForReadConcernLevelByUUID(uuid) {
+        var res =
+            db.runCommand({find: uuid, batchSize: 2, readConcern: {level: level}, maxTimeMS: 1000});
+        assert.commandFailed(res);
+        assert.eq(res.code, ErrorCodes.MaxTimeMSExpired);
+    }
+
     function getCursorForReadConcernLevel() {
         var res = t.runCommand('find', {batchSize: 2, readConcern: {level: level}});
         assert.commandWorked(res);
@@ -145,30 +152,77 @@ function testReadConcernLevel(level) {
     assert.eq(cursor.next().version, 4);
     assert.eq(cursor.next().version, 4);
 
-    // Adding an index bumps the min snapshot for a collection as of SERVER-20260. This may
-    // change to just filter that index out from query planning as part of SERVER-20439.
+    // Adding an index does not bump the min snapshot for a collection. Collection scans are
+    // possible, however the index is not guaranteed to be usable until the majority-committed
+    // snapshot advances.
     t.createIndex({version: 1}, {}, 0);
-    assertNoSnapshotAvailableForReadConcernLevel();
+    assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+    assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
 
     // To use the index, a snapshot created after the index was completed must be marked
     // committed.
     var newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name;
-    assertNoSnapshotAvailableForReadConcernLevel();
     assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot}));
     assert.eq(getCursorForReadConcernLevel().itcount(), 10);
     assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
     assert(isIxscan(db, getExplainPlan({version: 1})));
 
-    // Dropping an index does bump the min snapshot.
+    // Dropping an index does not bump the min snapshot, so the query should succeed.
     t.dropIndex({version: 1});
-    assertNoSnapshotAvailableForReadConcernLevel();
+    assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+    assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
+    assert(isCollscan(db, getExplainPlan({version: 1})));
 
-    // To use the collection again, a snapshot created after the dropIndex must be marked
-    // committed.
     newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name;
+    assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot}));
+    assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+    assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
+    assert(isCollscan(db, getExplainPlan({version: 1})));
+
+    // Get the UUID before renaming.
+    const collUuid = (() => {
+        const collectionInfos =
+            assert.commandWorked(db.runCommand({listCollections: 1})).cursor.firstBatch;
+        assert.eq(1, collectionInfos.length);
+        const info = collectionInfos[0];
+        assert.eq(t.getName(), info.name);
+        return info.info.uuid;
+    })();
+    assert(collUuid);
+
+    // Get a cursor before renaming.
+    cursor = getCursorForReadConcernLevel();  // Note: uses batchsize=2.
+    assert.eq(cursor.next().version, 4);
+    assert.eq(cursor.next().version, 4);
+    assert(!cursor.objsLeftInBatch());
+
+    // Even though renaming advances the minimum visible snapshot, we're querying by a namespace
+    // that no longer exists. Because of this, the query surprisingly returns no results instead of
+    // timing out. This violates read-committed semantics but is allowed by the current
+    // specification.
+    const tempNs = db.getName() + '.temp';
+    assert.commandWorked(db.adminCommand({renameCollection: t.getFullName(), to: tempNs}));
+    assert.eq(getCursorForReadConcernLevel().itcount(), 0);
+
+    // Trigger a getMore that should fail due to the rename.
+    let error = assert.throws(() => {
+        cursor.next();
+    });
+    assert.eq(error.code, ErrorCodes.QueryPlanKilled);
+
+    // Starting a new query by UUID will block because the minimum visible timestamp is ahead of the
+    // majority-committed snapshot.
+    assertNoSnapshotAvailableForReadConcernLevelByUUID(collUuid);
+
+    // Renaming back will cause queries to block again because the original namespace exists, and
+    // its minimum visible timestamp is ahead of the current majority-committed snapshot.
+    assert.commandWorked(db.adminCommand({renameCollection: tempNs, to: t.getFullName()}));
     assertNoSnapshotAvailableForReadConcernLevel();
+
+    newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name;
     assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot}));
     assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+    assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
 
     // Dropping the collection is visible in the committed snapshot, even though it hasn't been
     // marked committed yet. This is allowed by the current specification even though it
diff --git a/jstests/replsets/minimum_visible_with_cluster_time.js b/jstests/replsets/minimum_visible_with_cluster_time.js
index 58797e9e47f..7bbbff02522 100644
--- a/jstests/replsets/minimum_visible_with_cluster_time.js
+++ b/jstests/replsets/minimum_visible_with_cluster_time.js
@@ -73,23 +73,38 @@ for (let i = 0; i < 10; i++) {
     assert.commandWorked(
         coll.createIndex({x: 1}, {'name': 'x_1', 'expireAfterSeconds': 60 * 60 * 23}));
 
-    doMajorityRead(coll, 1);
+    // Majority read should eventually see new documents because it will not block on the index
+    // build.
+    assert.soonNoExcept(() => {
+        doMajorityRead(coll, 1);
+        return true;
+    });
 
     assert.commandWorked(coll.insert({x: 7, y: 2}));
     assert.commandWorked(coll.runCommand(
         'collMod', {'index': {'keyPattern': {x: 1}, 'expireAfterSeconds': 60 * 60 * 24}}));
-    doMajorityRead(coll, 2);
+    // Majority read should eventually see new documents because it will not block on the index
+    // build.
+    assert.soonNoExcept(() => {
+        doMajorityRead(coll, 2);
+        return true;
+    });
 
     assert.commandWorked(coll.insert({x: 7, y: 3}));
     assert.commandWorked(coll.dropIndexes());
 
-    doMajorityRead(coll, 3);
+    // Majority read should eventually see new documents because it will not block on the drop.
+    assert.soonNoExcept(() => {
+        doMajorityRead(coll, 3);
+        return true;
+    });
 
     assert.commandWorked(coll.insert({x: 7, y: 4}));
     const newCollNameI = collNameI + '_new';
     assert.commandWorked(coll.renameCollection(newCollNameI));
 
     coll = primary.getDB(dbName).getCollection(newCollNameI);
+    // Majority read should immediately see new documents because it blocks on the rename.
     doMajorityRead(coll, 4);
 }
 
diff --git a/jstests/replsets/read_committed_with_catalog_changes.js b/jstests/replsets/read_committed_with_catalog_changes.js
index 1afbaa40f10..d314187e561 100644
--- a/jstests/replsets/read_committed_with_catalog_changes.js
+++ b/jstests/replsets/read_committed_with_catalog_changes.js
@@ -21,7 +21,10 @@
  *  - reindex collection
  *  - compact collection
  *
- * @tags: [requires_majority_read_concern]
+ * @tags: [
+ *   requires_fcv_47,
+ *   requires_majority_read_concern,
+ * ]
  */
 
 load("jstests/libs/parallelTester.js");  // For Thread.
@@ -140,8 +143,22 @@ const testCases = {
             // So, disabling index build commit quorum.
             assert.commandWorked(db.coll.createIndex({x: 1}, {}, 0));
         },
-        blockedCollections: ['coll'],
-        unblockedCollections: ['other'],
+        blockedCollections: [],
+        unblockedCollections: ['coll', 'other'],
+    },
+    collMod: {
+        prepare: function(db) {
+            // This test create indexes with majority of nodes not available for replication.
+            // So, disabling index build commit quorum.
+            assert.commandWorked(db.coll.createIndex({x: 1}, {expireAfterSeconds: 60 * 60}, 0));
+            assert.commandWorked(db.coll.insert({_id: 1, x: 1}));
+        },
+        performOp: function(db) {
+            assert.commandWorked(db.coll.runCommand(
+                'collMod', {index: {keyPattern: {x: 1}, expireAfterSeconds: 60 * 61}}));
+        },
+        blockedCollections: [],
+        unblockedCollections: ['coll'],
     },
     dropIndex: {
         prepare: function(db) {
@@ -155,8 +172,8 @@ const testCases = {
         performOp: function(db) {
             assert.commandWorked(db.coll.dropIndex({x: 1}));
         },
-        blockedCollections: ['coll'],
-        unblockedCollections: ['other'],
+        blockedCollections: [],
+        unblockedCollections: ['coll', 'other'],
     },
 
     // Remaining case is a local-only operation.
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 66aa6d8cf13..2ad4e518b49 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -612,7 +612,6 @@ env.Library(
     target='db_raii',
     source=[
         'db_raii.cpp',
-        env.Idlc('db_raii.idl')[0]
     ],
     LIBDEPS=[
         'catalog_raii',
diff --git a/src/mongo/db/catalog/catalog_control.cpp b/src/mongo/db/catalog/catalog_control.cpp
index a81bf952f4c..2b243b53c08 100644
--- a/src/mongo/db/catalog/catalog_control.cpp
+++ b/src/mongo/db/catalog/catalog_control.cpp
@@ -104,7 +104,9 @@ MinVisibleTimestampMap closeCatalog(OperationContext* opCtx) {
     return minVisibleTimestampMap;
 }
 
-void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisibleTimestampMap) {
+void openCatalog(OperationContext* opCtx,
+                 const MinVisibleTimestampMap& minVisibleTimestampMap,
+                 Timestamp stableTimestamp) {
     invariant(opCtx->lockState()->isW());
 
     // Load the catalog in the storage engine.
@@ -197,8 +199,18 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib
                           << "failed to get valid collection pointer for namespace " << collNss);
 
             if (minVisibleTimestampMap.count(collection->uuid()) > 0) {
-                collection->setMinimumVisibleSnapshot(
-                    minVisibleTimestampMap.find(collection->uuid())->second);
+                // After rolling back to a stable timestamp T, the minimum visible timestamp for
+                // each collection must be reset to (at least) its value at T. Additionally, there
+                // cannot exist a minimum visible timestamp greater than lastApplied. This allows us
+                // to upper bound what the minimum visible timestamp can be coming out of rollback.
+                //
+                // Because we only save the latest minimum visible timestamp for each collection, we
+                // bound the minimum visible timestamp (where necessary) to the stable timestamp.
+                // The benefit of fine grained tracking is assumed to be low-value compared to the
+                // cost/effort.
+                auto minVisible = std::min(stableTimestamp,
+                                           minVisibleTimestampMap.find(collection->uuid())->second);
+                collection->setMinimumVisibleSnapshot(minVisible);
             }
 
             // If this is the oplog collection, re-establish the replication system's cached pointer
diff --git a/src/mongo/db/catalog/catalog_control.h b/src/mongo/db/catalog/catalog_control.h
index ab6ee87e8f4..527e1725c07 100644
--- a/src/mongo/db/catalog/catalog_control.h
+++ b/src/mongo/db/catalog/catalog_control.h
@@ -50,6 +50,8 @@ MinVisibleTimestampMap closeCatalog(OperationContext* opCtx);
  *
  * Must be called with the global lock acquired in exclusive mode.
  */
-void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& catalogState);
+void openCatalog(OperationContext* opCtx,
+                 const MinVisibleTimestampMap& catalogState,
+                 Timestamp stableTimestamp);
 }  // namespace catalog
 }  // namespace mongo
diff --git a/src/mongo/db/catalog/catalog_control_test.cpp b/src/mongo/db/catalog/catalog_control_test.cpp
index b0877a70def..602b8c26a2b 100644
--- a/src/mongo/db/catalog/catalog_control_test.cpp
+++ b/src/mongo/db/catalog/catalog_control_test.cpp
@@ -75,7 +75,7 @@ TEST_F(CatalogControlTest, CloseAndOpenCatalog) {
     OperationContextNoop opCtx(&cc(), 0);
     auto map = catalog::closeCatalog(&opCtx);
     ASSERT_EQUALS(0U, map.size());
-    catalog::openCatalog(&opCtx, {});
+    catalog::openCatalog(&opCtx, {}, Timestamp());
 }
 
 }  // namespace
diff --git a/src/mongo/db/catalog/index_build_block.cpp b/src/mongo/db/catalog/index_build_block.cpp
index 8226c415c4d..d49a71bbc7e 100644
--- a/src/mongo/db/catalog/index_build_block.cpp
+++ b/src/mongo/db/catalog/index_build_block.cpp
@@ -159,7 +159,6 @@ Status IndexBuildBlock::init(OperationContext* opCtx, Collection* collection) {
                 // This will prevent the unfinished index from being visible on index iterators.
                 if (commitTime) {
                     entry->setMinimumVisibleSnapshot(commitTime.get());
-                    coll->setMinimumVisibleSnapshot(commitTime.get());
                 }
             });
     }
@@ -223,15 +222,6 @@ void IndexBuildBlock::success(OperationContext* opCtx, Collection* collection) {
             // Note: this runs after the WUOW commits but before we release our X lock on the
             // collection. This means that any snapshot created after this must include the full
             // index, and no one can try to read this index before we set the visibility.
-            if (!commitTime) {
-                // The end of background index builds on secondaries does not get a commit
-                // timestamp. We use the cluster time since it's guaranteed to be greater than the
-                // time of the index build. It is possible the cluster time could be in the future,
-                // and we will need to do another write to reach the minimum visible snapshot.
-                const auto currentTime = VectorClock::get(svcCtx)->getTime();
-                commitTime = currentTime.clusterTime().asTimestamp();
-            }
-
             LOGV2(20345,
                   "Index build: done building index {indexName} on ns {nss}",
                   "Index build: done building",
@@ -240,11 +230,9 @@ void IndexBuildBlock::success(OperationContext* opCtx, Collection* collection) {
                   "index"_attr = indexName,
                   "commitTimestamp"_attr = commitTime);
 
-            entry->setMinimumVisibleSnapshot(commitTime.get());
-            // We must also set the minimum visible snapshot on the collection like during init().
-            // This prevents reads in the past from reading inconsistent metadata. We should be
-            // able to remove this when the catalog is versioned.
-            coll->setMinimumVisibleSnapshot(commitTime.get());
+            if (commitTime) {
+                entry->setMinimumVisibleSnapshot(commitTime.get());
+            }
 
             // Add the index to the TTLCollectionCache upon successfully committing the index build.
             if (spec.hasField(IndexDescriptor::kExpireAfterSecondsFieldName)) {
diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp
index 0a1254805c8..8874008e553 100644
--- a/src/mongo/db/catalog/index_catalog_impl.cpp
+++ b/src/mongo/db/catalog/index_catalog_impl.cpp
@@ -1020,17 +1020,7 @@ public:
         : _opCtx(opCtx), _collection(collection), _entries(entries), _entry(std::move(entry)) {}
 
     void commit(boost::optional<Timestamp> commitTime) final {
-        // Ban reading from this collection on committed reads on snapshots before now.
-        if (!commitTime) {
-            // This is called when we refresh the index catalog entry, which does not always have
-            // a commit timestamp. We use the cluster time since it's guaranteed to be greater
-            // than the time of the index removal. It is possible the cluster time could be in the
-            // future, and we will need to do another write to reach the minimum visible snapshot.
-            const auto currentTime = VectorClock::get(_opCtx)->getTime();
-            commitTime = currentTime.clusterTime().asTimestamp();
-        }
         _entry->setDropped();
-        _collection->setMinimumVisibleSnapshot(commitTime.get());
     }
 
     void rollback() final {
@@ -1318,11 +1308,16 @@ const IndexDescriptor* IndexCatalogImpl::refreshEntry(OperationContext* opCtx,
     // to the CollectionQueryInfo.
     auto newDesc =
         std::make_unique<IndexDescriptor>(_collection, _getAccessMethodName(keyPattern), spec);
-    const IndexCatalogEntry* newEntry =
-        createIndexEntry(opCtx, std::move(newDesc), CreateIndexEntryFlags::kIsReady);
+    auto newEntry = createIndexEntry(opCtx, std::move(newDesc), CreateIndexEntryFlags::kIsReady);
     invariant(newEntry->isReady(opCtx));
     CollectionQueryInfo::get(_collection).addedIndex(opCtx, _collection, newEntry->descriptor());
 
+    opCtx->recoveryUnit()->onCommit([newEntry](auto commitTime) {
+        if (commitTime) {
+            newEntry->setMinimumVisibleSnapshot(*commitTime);
+        }
+    });
+
     // Return the new descriptor.
     return newEntry->descriptor();
 }
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 22a9181f157..5e294acefe4 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -36,7 +36,6 @@
 #include "mongo/db/catalog/database_holder.h"
 #include "mongo/db/concurrency/locker.h"
 #include "mongo/db/curop.h"
-#include "mongo/db/db_raii_gen.h"
 #include "mongo/db/repl/replication_coordinator.h"
 #include "mongo/db/s/collection_sharding_state.h"
 #include "mongo/db/storage/snapshot_helper.h"
@@ -49,7 +48,6 @@ const boost::optional<int> kDoNotChangeProfilingLevel = boost::none;
 
 // TODO: SERVER-44105 remove
 // If set to false, secondary reads should wait behind the PBW lock.
-// Does nothing if gAllowSecondaryReadsDuringBatchApplication setting is false.
 const auto allowSecondaryReadsDuringBatchApplication_DONT_USE =
     OperationContext::declareDecoration<boost::optional<bool>>();
 
@@ -94,10 +92,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
     // i.e. the caller does not currently have a ShouldNotConflict... block in scope.
     bool callerWasConflicting = opCtx->lockState()->shouldConflictWithSecondaryBatchApplication();
 
-    // Don't take the ParallelBatchWriterMode lock when the server parameter is set and our
-    // storage engine supports snapshot reads.
-    if (gAllowSecondaryReadsDuringBatchApplication.load() &&
-        allowSecondaryReadsDuringBatchApplication_DONT_USE(opCtx).value_or(true) &&
+    if (allowSecondaryReadsDuringBatchApplication_DONT_USE(opCtx).value_or(true) &&
         opCtx->getServiceContext()->getStorageEngine()->supportsReadConcernSnapshot()) {
         _shouldNotConflictWithSecondaryBatchApplicationBlock.emplace(opCtx->lockState());
     }
@@ -123,11 +118,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
         // consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
         // guarantee we read at a consistent state, so we must read at the lastApplied timestamp,
         // which is set after each complete batch.
-        //
-        // If an attempt to read at the lastApplied timestamp is unsuccessful because there are
-        // pending catalog changes that occur after that timestamp, we release our locks and try
-        // again with the PBWM lock (by unsetting
-        // _shouldNotConflictWithSecondaryBatchApplicationBlock).
 
         const NamespaceString nss = coll->ns();
         auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
@@ -175,7 +165,8 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
             !nss.mustBeAppliedInOwnOplogBatch() &&
             SnapshotHelper::shouldReadAtLastApplied(opCtx, nss)) {
             LOGV2_FATAL(4728700,
-                        "Reading from replicated collection without read timestamp or PBWM lock",
+                        "Reading from replicated collection on a secondary without read timestamp "
+                        "or PBWM lock",
                         "collection"_attr = nss);
         }
 
@@ -208,39 +199,28 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
         _autoColl = boost::none;
 
         // If there are pending catalog changes when using a no-overlap or lastApplied read source,
-        // we choose to take the PBWM lock to conflict with any in-progress batches. This prevents
-        // us from idly spinning in this loop trying to get a new read timestamp ahead of the
-        // minimum visible snapshot. This helps us guarantee liveness (i.e. we can eventually get a
-        // suitable read timestamp) but should not be necessary for correctness. After initial sync
-        // finishes, if we waited instead of retrying, readers would block indefinitely waiting for
-        // their read timstamp to move forward. Instead we force the reader take the PBWM lock and
-        // retry.
+        // we yield to get a new read timestamp ahead of the minimum visible snapshot.
         if (readSource == RecoveryUnit::ReadSource::kLastApplied ||
             readSource == RecoveryUnit::ReadSource::kNoOverlap) {
             invariant(readTimestamp);
             LOGV2(20576,
                   "Tried reading at a timestamp, but future catalog changes are pending. "
-                  "Trying again without reading at a timestamp",
+                  "Trying again",
                   "readTimestamp"_attr = *readTimestamp,
                   "collection"_attr = nss.ns(),
                   "collectionMinSnapshot"_attr = *minSnapshot);
-            // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
-            // previous value. If the previous value is false (because there is another
-            // shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this
-            // does not take the PBWM lock.
-            _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none;
-
-            // As alluded to above, if we are AutoGetting multiple collections, it
-            // is possible that our "reaquire the PBWM" trick doesn't work, since we've already done
+
+            // If we are AutoGetting multiple collections, it is possible that we've already done
             // some reads and locked in our snapshot.  At this point, the only way out is to fail
             // the operation. The client application will need to retry.
             uassert(
                 ErrorCodes::SnapshotUnavailable,
                 str::stream() << "Unable to read from a snapshot due to pending collection catalog "
-                                 "changes; please retry the operation. Snapshot timestamp is "
+                                 "changes and holding multiple collection locks; please retry the "
+                                 "operation. Snapshot timestamp is "
                               << readTimestamp->toString() << ". Collection minimum is "
                               << minSnapshot->toString(),
-                opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
+                !opCtx->lockState()->isLocked());
 
             // Abandon our snapshot. We may select a new read timestamp or ReadSource in the next
             // loop iteration.
diff --git a/src/mongo/db/db_raii.idl b/src/mongo/db/db_raii.idl
deleted file mode 100644
index d04759f18bc..00000000000
--- a/src/mongo/db/db_raii.idl
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (C) 2019-present MongoDB, Inc.
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the Server Side Public License, version 1,
-# as published by MongoDB, Inc.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# Server Side Public License for more details.
-#
-# You should have received a copy of the Server Side Public License
-# along with this program. If not, see
-# <http://www.mongodb.com/licensing/server-side-public-license>.
-#
-# As a special exception, the copyright holders give permission to link the
-# code of portions of this program with the OpenSSL library under certain
-# conditions as described in each individual source file and distribute
-# linked combinations including the program with the OpenSSL library. You
-# must comply with the Server Side Public License in all respects for
-# all of the code used other than as permitted herein. If you modify file(s)
-# with this exception, you may extend this exception to your version of the
-# file(s), but you are not obligated to do so. If you do not wish to do so,
-# delete this exception statement from your version. If you delete this
-# exception statement from all source files in the program, then also delete
-# it in the license file.
-#
-
-global:
-    cpp_namespace: "mongo"
-
-server_parameters:
-    allowSecondaryReadsDuringBatchApplication:
-        description: 'If true, do not take PBWM lock in AutoGetCollectionForRead on secondaries during batch application.'
-        set_at: [startup, runtime]
-        cpp_vartype: AtomicWord<bool>
-        cpp_varname: gAllowSecondaryReadsDuringBatchApplication
-        default: true
diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp
index eba322c5581..a9239f0a92e 100644
--- a/src/mongo/db/db_raii_test.cpp
+++ b/src/mongo/db/db_raii_test.cpp
@@ -235,10 +235,9 @@ TEST_F(DBRAIITestFixture,
 
     // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock
     // class and does not update the correct state in the SnapshotManager.
-    repl::OpTime opTime(Timestamp(200, 1), 1);
     auto snapshotManager =
         client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
-    snapshotManager->setLastApplied(opTime.getTimestamp());
+    snapshotManager->setLastApplied(replCoord->getMyLastAppliedOpTime().getTimestamp());
     Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
     ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
 
@@ -310,14 +309,24 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
 
     // Simulate using a DBDirectClient to test this behavior for user reads.
     client1.first->setInDirectClient(true);
-    AutoGetCollectionForRead coll(client1.second.get(), nss);
 
-    // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort
-    // to taking the PBWM lock and reading without a timestamp.
+    auto timeoutError = client1.second->getTimeoutError();
+    auto waitForLock = [&] {
+        auto deadline = Date_t::now() + Milliseconds(10);
+        client1.second->runWithDeadline(deadline, timeoutError, [&] {
+            AutoGetCollectionForRead coll(client1.second.get(), nss);
+        });
+    };
+
+    // Expect that the lock acquisition eventually times out because lastApplied is not advancing.
+    ASSERT_THROWS_CODE(waitForLock(), DBException, timeoutError);
+
+    // Advance lastApplied and ensure the lock acquisition succeeds.
+    snapshotManager->setLastApplied(replCoord->getMyLastAppliedOpTime().getTimestamp());
+
+    AutoGetCollectionForRead coll(client1.second.get(), nss);
     ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
-              RecoveryUnit::ReadSource::kNoTimestamp);
-    ASSERT_TRUE(client1.second.get()->lockState()->isLockHeldForMode(
-        resourceIdParallelBatchWriterMode, MODE_IS));
+              RecoveryUnit::ReadSource::kLastApplied);
 }
 
 TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
diff --git a/src/mongo/db/repair.cpp b/src/mongo/db/repair.cpp
index 0b3dfca8ebb..950e0b8d9aa 100644
--- a/src/mongo/db/repair.cpp
+++ b/src/mongo/db/repair.cpp
@@ -152,7 +152,7 @@ Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std:
     databaseHolder->close(opCtx, dbName);
 
     // Reopening db is necessary for repairCollections.
-    auto db = databaseHolder->openDb(opCtx, dbName);
+    databaseHolder->openDb(opCtx, dbName);
 
     auto status = repairCollections(opCtx, engine, dbName);
     if (!status.isOK()) {
@@ -167,19 +167,6 @@ Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std:
         // Ensure that we don't trigger an exception when attempting to take locks.
         UninterruptibleLockGuard noInterrupt(opCtx->lockState());
 
-        // Set the minimum snapshot for all Collections in this db. This ensures that readers
-        // using majority readConcern level can only use the collections after their repaired
-        // versions are in the committed view.
-        const auto currentTime = VectorClock::get(opCtx)->getTime();
-
-        for (auto collIt = db->begin(opCtx); collIt != db->end(opCtx); ++collIt) {
-            auto collection =
-                collIt.getWritableCollection(opCtx, CollectionCatalog::LifetimeMode::kInplace);
-            if (collection) {
-                collection->setMinimumVisibleSnapshot(currentTime.clusterTime().asTimestamp());
-            }
-        }
-
         // Restore oplog Collection pointer cache.
         repl::acquireOplogCollectionForLogging(opCtx);
     } catch (const ExceptionFor<ErrorCodes::MustDowngrade>&) {
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index ea2e13f3a0f..c8c87173421 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -956,7 +956,7 @@ StatusWith<Timestamp> StorageEngineImpl::recoverToStableTimestamp(OperationConte
         return swTimestamp;
     }
 
-    catalog::openCatalog(opCtx, state);
+    catalog::openCatalog(opCtx, state, swTimestamp.getValue());
 
     LOGV2(22259,
           "recoverToStableTimestamp successful",
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index e7c007b5835..a70b5b42402 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -1618,9 +1618,8 @@ public:
 
         AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
 
-        // It is not valid to read the multikey state earlier than the 'minimumVisibleTimestamp',
-        // so at least assert that it has been updated due to the index creation.
-        ASSERT_GT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(),
+        // Ensure minimumVisible has not been updated due to the index creation.
+        ASSERT_LT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(),
                   pastTime.asTimestamp());
 
         // Reading the multikey state before 'insertTime0' is not valid or reliable to test. If the
author	Louis Williams <louis.williams@mongodb.com>	2020-09-22 16:03:46 -0400
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-09-23 16:49:17 +0000
commit	669e5650151733738ce8270e5bdf3c5759665316 (patch)
tree	40e8d2b26c236fd3effe64d5fd04b7a05162f1ec
parent	0e044981649f6712b3cc3f39bdb41c6e06546815 (diff)
download	mongo-669e5650151733738ce8270e5bdf3c5759665316.tar.gz