From 669e5650151733738ce8270e5bdf3c5759665316 Mon Sep 17 00:00:00 2001 From: Louis Williams Date: Tue, 22 Sep 2020 16:03:46 -0400 Subject: SERVER-47866 Secondary readers do not need to reacquire PBWM lock if there are catalog conflicts --- ...snapshot_read_at_cluster_time_ddl_operations.js | 4 +- .../snapshot_read_catalog_operations.js | 4 +- jstests/libs/global_snapshot_reads_util.js | 10 +++- .../index_stepdown_abort_prepare_conflict.js | 1 + jstests/noPassthrough/out_majority_read_replset.js | 20 +++++-- .../read_concern_snapshot_catalog_invalidation.js | 20 ++++--- jstests/noPassthrough/read_majority.js | 70 +++++++++++++++++++--- .../replsets/minimum_visible_with_cluster_time.js | 21 ++++++- .../read_committed_with_catalog_changes.js | 27 +++++++-- src/mongo/db/SConscript | 1 - src/mongo/db/catalog/catalog_control.cpp | 18 +++++- src/mongo/db/catalog/catalog_control.h | 4 +- src/mongo/db/catalog/catalog_control_test.cpp | 2 +- src/mongo/db/catalog/index_build_block.cpp | 18 +----- src/mongo/db/catalog/index_catalog_impl.cpp | 19 +++--- src/mongo/db/db_raii.cpp | 40 ++++--------- src/mongo/db/db_raii.idl | 38 ------------ src/mongo/db/db_raii_test.cpp | 25 +++++--- src/mongo/db/repair.cpp | 15 +---- src/mongo/db/storage/storage_engine_impl.cpp | 2 +- src/mongo/dbtests/storage_timestamp_tests.cpp | 5 +- 21 files changed, 202 insertions(+), 162 deletions(-) delete mode 100644 src/mongo/db/db_raii.idl diff --git a/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js b/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js index 07d1b578109..c6bf1e808bd 100644 --- a/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js +++ b/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js @@ -2,8 +2,7 @@ /** * Perform point-in-time snapshot reads that span a 'find' and multiple 'getmore's concurrently with - * CRUD operations. Index operations running concurrently with the snapshot read may cause - * the read to fail with a SnapshotUnavailable error. + * CRUD operations. * * @tags: [creates_background_indexes, requires_fcv_47, requires_replication, * does_not_support_causal_consistency, requires_majority_read_concern] @@ -17,7 +16,6 @@ var $config = (function() { snapshotScan: function snapshotScan(db, collName) { const readErrorCodes = [ - ErrorCodes.SnapshotUnavailable, ErrorCodes.ShutdownInProgress, ErrorCodes.CursorNotFound, ErrorCodes.QueryPlanKilled, diff --git a/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js b/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js index 87a0c2f0515..5b89d01d29f 100644 --- a/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js +++ b/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js @@ -5,8 +5,7 @@ * snapshot reads and CRUD operations will all contend for locks on db and collName. Since the * snapshot read does not release its locks until the transaction is committed, it is expected that * once the read has begun, catalog operations with conflicting locks will block until the read is - * finished. Additionally, index operations running concurrently with the snapshot read may cause - * the read to fail with a SnapshotUnavailable error. + * finished. * * @tags: [creates_background_indexes, uses_transactions] */ @@ -32,7 +31,6 @@ var $config = (function() { const sortByAscending = sortOptions[Random.randInt(2)]; const readErrorCodes = [ ErrorCodes.NoSuchTransaction, - ErrorCodes.SnapshotUnavailable, ErrorCodes.SnapshotTooOld, ErrorCodes.StaleChunkHistory, ErrorCodes.LockTimeout, diff --git a/jstests/libs/global_snapshot_reads_util.js b/jstests/libs/global_snapshot_reads_util.js index 10f8f9ff74d..658141ebdbd 100644 --- a/jstests/libs/global_snapshot_reads_util.js +++ b/jstests/libs/global_snapshot_reads_util.js @@ -138,9 +138,17 @@ function SnapshotReadsTest({primaryDB, secondaryDB, awaitCommittedFn}) { // This update is not visible to reads at insertTimestamp. res = assert.commandWorked(primaryDB.runCommand( {update: collName, updates: [{q: {}, u: {$set: {x: true}}, multi: true}]})); - jsTestLog(`Updated collection "${collName}" at timestamp ${ tojson(res.operationTime)}`); + + awaitCommittedFn(db, res.operationTime); + + // This index is not visible to reads at insertTimestamp and does not cause the + // operation to fail. + res = assert.commandWorked(primaryDB.runCommand( + {createIndexes: collName, indexes: [{key: {x: 1}, name: 'x_1'}]})); + jsTestLog(`Created an index on collection "${collName}" at timestamp ${ + tojson(res.operationTime)}`); awaitCommittedFn(db, res.operationTime); // Retrieve the rest of the read command's result set. diff --git a/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js b/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js index c47922aeb68..6b207866ed7 100644 --- a/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js +++ b/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js @@ -100,6 +100,7 @@ assert.commandWorked(newSession.abortTransaction_forTesting()); IndexBuildTest.waitForIndexBuildToStop(newPrimary.getDB(dbName), collName, indexName); IndexBuildTest.waitForIndexBuildToStop(primary.getDB(dbName), collName, indexName); +rst.awaitReplication(); IndexBuildTest.assertIndexes(newPrimary.getDB(dbName).getCollection(collName), 1, ["_id_"], []); IndexBuildTest.assertIndexes(primaryColl, 1, ["_id_"], []); diff --git a/jstests/noPassthrough/out_majority_read_replset.js b/jstests/noPassthrough/out_majority_read_replset.js index 0ffba741b3e..4ca9bd83e0a 100644 --- a/jstests/noPassthrough/out_majority_read_replset.js +++ b/jstests/noPassthrough/out_majority_read_replset.js @@ -28,12 +28,22 @@ rst.awaitLastOpCommitted(); stopReplicationOnSecondaries(rst); -// Create the index that is not majority commited -// This test create indexes with majority of nodes not available for replication. So, disabling -// index build commit quorum. +// Rename the collection temporarily and then back to its original name. This advances the minimum +// visible snapshot and forces the $out to block until its snapshot advances. +const tempColl = db.getName() + '.temp'; +assert.commandWorked(db.adminCommand({ + renameCollection: sourceColl.getFullName(), + to: tempColl, +})); +assert.commandWorked(db.adminCommand({ + renameCollection: tempColl, + to: sourceColl.getFullName(), +})); + +// Create the index that is not majority committed assert.commandWorked(sourceColl.createIndex({state: 1}, {name: "secondIndex"}, 0)); -// Run the $out in the parallel shell as it will block in the metadata until the shapshot is +// Run the $out in the parallel shell as it will block in the metadata until the snapshot is // advanced. const awaitShell = startParallelShell(`{ const testDB = db.getSiblingDB("${name}"); @@ -58,7 +68,7 @@ assert.soon(function() { return assert.commandWorked(db.currentOp(filter)).inprog.length === 1; }); -// Restart data replicaiton and wait until the new write becomes visible. +// Restart data replication and wait until the new write becomes visible. restartReplicationOnSecondaries(rst); rst.awaitLastOpCommitted(); diff --git a/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js b/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js index b6cc5d4b21d..fb296a9d29e 100644 --- a/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js +++ b/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js @@ -42,13 +42,19 @@ function testCommand(cmd, curOpFilter) { waitForCurOpByFailPointNoNS(testDB, "hangAfterPreallocateSnapshot", curOpFilter); - // Create an index on the collection the command was executed against. This will move the - // collection's minimum visible timestamp to a point later than the point-in-time referenced - // by the transaction snapshot. - assert.commandWorked(testDB.runCommand({ - createIndexes: kCollName, - indexes: [{key: {x: 1}, name: "x_1"}], - writeConcern: {w: "majority"} + // Rename the collection the command was executed against and then back to its original name. + // This will move the collection's minimum visible timestamp to a point later than the + // point-in-time referenced by the transaction snapshot. + const tempColl = testDB.getName() + '.temp'; + assert.commandWorked(testDB.adminCommand({ + renameCollection: testDB.getName() + '.' + kCollName, + to: tempColl, + writeConcern: {w: "majority"}, + })); + assert.commandWorked(testDB.adminCommand({ + renameCollection: tempColl, + to: testDB.getName() + '.' + kCollName, + writeConcern: {w: "majority"}, })); // Disable the hang and check for parallel shell success. Success indicates that the command diff --git a/jstests/noPassthrough/read_majority.js b/jstests/noPassthrough/read_majority.js index 12a2c991641..227e04e7dc4 100644 --- a/jstests/noPassthrough/read_majority.js +++ b/jstests/noPassthrough/read_majority.js @@ -53,6 +53,13 @@ function testReadConcernLevel(level) { assert.eq(res.code, ErrorCodes.MaxTimeMSExpired); } + function assertNoSnapshotAvailableForReadConcernLevelByUUID(uuid) { + var res = + db.runCommand({find: uuid, batchSize: 2, readConcern: {level: level}, maxTimeMS: 1000}); + assert.commandFailed(res); + assert.eq(res.code, ErrorCodes.MaxTimeMSExpired); + } + function getCursorForReadConcernLevel() { var res = t.runCommand('find', {batchSize: 2, readConcern: {level: level}}); assert.commandWorked(res); @@ -145,30 +152,77 @@ function testReadConcernLevel(level) { assert.eq(cursor.next().version, 4); assert.eq(cursor.next().version, 4); - // Adding an index bumps the min snapshot for a collection as of SERVER-20260. This may - // change to just filter that index out from query planning as part of SERVER-20439. + // Adding an index does not bump the min snapshot for a collection. Collection scans are + // possible, however the index is not guaranteed to be usable until the majority-committed + // snapshot advances. t.createIndex({version: 1}, {}, 0); - assertNoSnapshotAvailableForReadConcernLevel(); + assert.eq(getCursorForReadConcernLevel().itcount(), 10); + assert.eq(getAggCursorForReadConcernLevel().itcount(), 10); // To use the index, a snapshot created after the index was completed must be marked // committed. var newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name; - assertNoSnapshotAvailableForReadConcernLevel(); assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot})); assert.eq(getCursorForReadConcernLevel().itcount(), 10); assert.eq(getAggCursorForReadConcernLevel().itcount(), 10); assert(isIxscan(db, getExplainPlan({version: 1}))); - // Dropping an index does bump the min snapshot. + // Dropping an index does not bump the min snapshot, so the query should succeed. t.dropIndex({version: 1}); - assertNoSnapshotAvailableForReadConcernLevel(); + assert.eq(getCursorForReadConcernLevel().itcount(), 10); + assert.eq(getAggCursorForReadConcernLevel().itcount(), 10); + assert(isCollscan(db, getExplainPlan({version: 1}))); - // To use the collection again, a snapshot created after the dropIndex must be marked - // committed. newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name; + assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot})); + assert.eq(getCursorForReadConcernLevel().itcount(), 10); + assert.eq(getAggCursorForReadConcernLevel().itcount(), 10); + assert(isCollscan(db, getExplainPlan({version: 1}))); + + // Get the UUID before renaming. + const collUuid = (() => { + const collectionInfos = + assert.commandWorked(db.runCommand({listCollections: 1})).cursor.firstBatch; + assert.eq(1, collectionInfos.length); + const info = collectionInfos[0]; + assert.eq(t.getName(), info.name); + return info.info.uuid; + })(); + assert(collUuid); + + // Get a cursor before renaming. + cursor = getCursorForReadConcernLevel(); // Note: uses batchsize=2. + assert.eq(cursor.next().version, 4); + assert.eq(cursor.next().version, 4); + assert(!cursor.objsLeftInBatch()); + + // Even though renaming advances the minimum visible snapshot, we're querying by a namespace + // that no longer exists. Because of this, the query surprisingly returns no results instead of + // timing out. This violates read-committed semantics but is allowed by the current + // specification. + const tempNs = db.getName() + '.temp'; + assert.commandWorked(db.adminCommand({renameCollection: t.getFullName(), to: tempNs})); + assert.eq(getCursorForReadConcernLevel().itcount(), 0); + + // Trigger a getMore that should fail due to the rename. + let error = assert.throws(() => { + cursor.next(); + }); + assert.eq(error.code, ErrorCodes.QueryPlanKilled); + + // Starting a new query by UUID will block because the minimum visible timestamp is ahead of the + // majority-committed snapshot. + assertNoSnapshotAvailableForReadConcernLevelByUUID(collUuid); + + // Renaming back will cause queries to block again because the original namespace exists, and + // its minimum visible timestamp is ahead of the current majority-committed snapshot. + assert.commandWorked(db.adminCommand({renameCollection: tempNs, to: t.getFullName()})); assertNoSnapshotAvailableForReadConcernLevel(); + + newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name; assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot})); assert.eq(getCursorForReadConcernLevel().itcount(), 10); + assert.eq(getAggCursorForReadConcernLevel().itcount(), 10); // Dropping the collection is visible in the committed snapshot, even though it hasn't been // marked committed yet. This is allowed by the current specification even though it diff --git a/jstests/replsets/minimum_visible_with_cluster_time.js b/jstests/replsets/minimum_visible_with_cluster_time.js index 58797e9e47f..7bbbff02522 100644 --- a/jstests/replsets/minimum_visible_with_cluster_time.js +++ b/jstests/replsets/minimum_visible_with_cluster_time.js @@ -73,23 +73,38 @@ for (let i = 0; i < 10; i++) { assert.commandWorked( coll.createIndex({x: 1}, {'name': 'x_1', 'expireAfterSeconds': 60 * 60 * 23})); - doMajorityRead(coll, 1); + // Majority read should eventually see new documents because it will not block on the index + // build. + assert.soonNoExcept(() => { + doMajorityRead(coll, 1); + return true; + }); assert.commandWorked(coll.insert({x: 7, y: 2})); assert.commandWorked(coll.runCommand( 'collMod', {'index': {'keyPattern': {x: 1}, 'expireAfterSeconds': 60 * 60 * 24}})); - doMajorityRead(coll, 2); + // Majority read should eventually see new documents because it will not block on the index + // build. + assert.soonNoExcept(() => { + doMajorityRead(coll, 2); + return true; + }); assert.commandWorked(coll.insert({x: 7, y: 3})); assert.commandWorked(coll.dropIndexes()); - doMajorityRead(coll, 3); + // Majority read should eventually see new documents because it will not block on the drop. + assert.soonNoExcept(() => { + doMajorityRead(coll, 3); + return true; + }); assert.commandWorked(coll.insert({x: 7, y: 4})); const newCollNameI = collNameI + '_new'; assert.commandWorked(coll.renameCollection(newCollNameI)); coll = primary.getDB(dbName).getCollection(newCollNameI); + // Majority read should immediately see new documents because it blocks on the rename. doMajorityRead(coll, 4); } diff --git a/jstests/replsets/read_committed_with_catalog_changes.js b/jstests/replsets/read_committed_with_catalog_changes.js index 1afbaa40f10..d314187e561 100644 --- a/jstests/replsets/read_committed_with_catalog_changes.js +++ b/jstests/replsets/read_committed_with_catalog_changes.js @@ -21,7 +21,10 @@ * - reindex collection * - compact collection * - * @tags: [requires_majority_read_concern] + * @tags: [ + * requires_fcv_47, + * requires_majority_read_concern, + * ] */ load("jstests/libs/parallelTester.js"); // For Thread. @@ -140,8 +143,22 @@ const testCases = { // So, disabling index build commit quorum. assert.commandWorked(db.coll.createIndex({x: 1}, {}, 0)); }, - blockedCollections: ['coll'], - unblockedCollections: ['other'], + blockedCollections: [], + unblockedCollections: ['coll', 'other'], + }, + collMod: { + prepare: function(db) { + // This test create indexes with majority of nodes not available for replication. + // So, disabling index build commit quorum. + assert.commandWorked(db.coll.createIndex({x: 1}, {expireAfterSeconds: 60 * 60}, 0)); + assert.commandWorked(db.coll.insert({_id: 1, x: 1})); + }, + performOp: function(db) { + assert.commandWorked(db.coll.runCommand( + 'collMod', {index: {keyPattern: {x: 1}, expireAfterSeconds: 60 * 61}})); + }, + blockedCollections: [], + unblockedCollections: ['coll'], }, dropIndex: { prepare: function(db) { @@ -155,8 +172,8 @@ const testCases = { performOp: function(db) { assert.commandWorked(db.coll.dropIndex({x: 1})); }, - blockedCollections: ['coll'], - unblockedCollections: ['other'], + blockedCollections: [], + unblockedCollections: ['coll', 'other'], }, // Remaining case is a local-only operation. diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index 66aa6d8cf13..2ad4e518b49 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -612,7 +612,6 @@ env.Library( target='db_raii', source=[ 'db_raii.cpp', - env.Idlc('db_raii.idl')[0] ], LIBDEPS=[ 'catalog_raii', diff --git a/src/mongo/db/catalog/catalog_control.cpp b/src/mongo/db/catalog/catalog_control.cpp index a81bf952f4c..2b243b53c08 100644 --- a/src/mongo/db/catalog/catalog_control.cpp +++ b/src/mongo/db/catalog/catalog_control.cpp @@ -104,7 +104,9 @@ MinVisibleTimestampMap closeCatalog(OperationContext* opCtx) { return minVisibleTimestampMap; } -void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisibleTimestampMap) { +void openCatalog(OperationContext* opCtx, + const MinVisibleTimestampMap& minVisibleTimestampMap, + Timestamp stableTimestamp) { invariant(opCtx->lockState()->isW()); // Load the catalog in the storage engine. @@ -197,8 +199,18 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib << "failed to get valid collection pointer for namespace " << collNss); if (minVisibleTimestampMap.count(collection->uuid()) > 0) { - collection->setMinimumVisibleSnapshot( - minVisibleTimestampMap.find(collection->uuid())->second); + // After rolling back to a stable timestamp T, the minimum visible timestamp for + // each collection must be reset to (at least) its value at T. Additionally, there + // cannot exist a minimum visible timestamp greater than lastApplied. This allows us + // to upper bound what the minimum visible timestamp can be coming out of rollback. + // + // Because we only save the latest minimum visible timestamp for each collection, we + // bound the minimum visible timestamp (where necessary) to the stable timestamp. + // The benefit of fine grained tracking is assumed to be low-value compared to the + // cost/effort. + auto minVisible = std::min(stableTimestamp, + minVisibleTimestampMap.find(collection->uuid())->second); + collection->setMinimumVisibleSnapshot(minVisible); } // If this is the oplog collection, re-establish the replication system's cached pointer diff --git a/src/mongo/db/catalog/catalog_control.h b/src/mongo/db/catalog/catalog_control.h index ab6ee87e8f4..527e1725c07 100644 --- a/src/mongo/db/catalog/catalog_control.h +++ b/src/mongo/db/catalog/catalog_control.h @@ -50,6 +50,8 @@ MinVisibleTimestampMap closeCatalog(OperationContext* opCtx); * * Must be called with the global lock acquired in exclusive mode. */ -void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& catalogState); +void openCatalog(OperationContext* opCtx, + const MinVisibleTimestampMap& catalogState, + Timestamp stableTimestamp); } // namespace catalog } // namespace mongo diff --git a/src/mongo/db/catalog/catalog_control_test.cpp b/src/mongo/db/catalog/catalog_control_test.cpp index b0877a70def..602b8c26a2b 100644 --- a/src/mongo/db/catalog/catalog_control_test.cpp +++ b/src/mongo/db/catalog/catalog_control_test.cpp @@ -75,7 +75,7 @@ TEST_F(CatalogControlTest, CloseAndOpenCatalog) { OperationContextNoop opCtx(&cc(), 0); auto map = catalog::closeCatalog(&opCtx); ASSERT_EQUALS(0U, map.size()); - catalog::openCatalog(&opCtx, {}); + catalog::openCatalog(&opCtx, {}, Timestamp()); } } // namespace diff --git a/src/mongo/db/catalog/index_build_block.cpp b/src/mongo/db/catalog/index_build_block.cpp index 8226c415c4d..d49a71bbc7e 100644 --- a/src/mongo/db/catalog/index_build_block.cpp +++ b/src/mongo/db/catalog/index_build_block.cpp @@ -159,7 +159,6 @@ Status IndexBuildBlock::init(OperationContext* opCtx, Collection* collection) { // This will prevent the unfinished index from being visible on index iterators. if (commitTime) { entry->setMinimumVisibleSnapshot(commitTime.get()); - coll->setMinimumVisibleSnapshot(commitTime.get()); } }); } @@ -223,15 +222,6 @@ void IndexBuildBlock::success(OperationContext* opCtx, Collection* collection) { // Note: this runs after the WUOW commits but before we release our X lock on the // collection. This means that any snapshot created after this must include the full // index, and no one can try to read this index before we set the visibility. - if (!commitTime) { - // The end of background index builds on secondaries does not get a commit - // timestamp. We use the cluster time since it's guaranteed to be greater than the - // time of the index build. It is possible the cluster time could be in the future, - // and we will need to do another write to reach the minimum visible snapshot. - const auto currentTime = VectorClock::get(svcCtx)->getTime(); - commitTime = currentTime.clusterTime().asTimestamp(); - } - LOGV2(20345, "Index build: done building index {indexName} on ns {nss}", "Index build: done building", @@ -240,11 +230,9 @@ void IndexBuildBlock::success(OperationContext* opCtx, Collection* collection) { "index"_attr = indexName, "commitTimestamp"_attr = commitTime); - entry->setMinimumVisibleSnapshot(commitTime.get()); - // We must also set the minimum visible snapshot on the collection like during init(). - // This prevents reads in the past from reading inconsistent metadata. We should be - // able to remove this when the catalog is versioned. - coll->setMinimumVisibleSnapshot(commitTime.get()); + if (commitTime) { + entry->setMinimumVisibleSnapshot(commitTime.get()); + } // Add the index to the TTLCollectionCache upon successfully committing the index build. if (spec.hasField(IndexDescriptor::kExpireAfterSecondsFieldName)) { diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp index 0a1254805c8..8874008e553 100644 --- a/src/mongo/db/catalog/index_catalog_impl.cpp +++ b/src/mongo/db/catalog/index_catalog_impl.cpp @@ -1020,17 +1020,7 @@ public: : _opCtx(opCtx), _collection(collection), _entries(entries), _entry(std::move(entry)) {} void commit(boost::optional commitTime) final { - // Ban reading from this collection on committed reads on snapshots before now. - if (!commitTime) { - // This is called when we refresh the index catalog entry, which does not always have - // a commit timestamp. We use the cluster time since it's guaranteed to be greater - // than the time of the index removal. It is possible the cluster time could be in the - // future, and we will need to do another write to reach the minimum visible snapshot. - const auto currentTime = VectorClock::get(_opCtx)->getTime(); - commitTime = currentTime.clusterTime().asTimestamp(); - } _entry->setDropped(); - _collection->setMinimumVisibleSnapshot(commitTime.get()); } void rollback() final { @@ -1318,11 +1308,16 @@ const IndexDescriptor* IndexCatalogImpl::refreshEntry(OperationContext* opCtx, // to the CollectionQueryInfo. auto newDesc = std::make_unique(_collection, _getAccessMethodName(keyPattern), spec); - const IndexCatalogEntry* newEntry = - createIndexEntry(opCtx, std::move(newDesc), CreateIndexEntryFlags::kIsReady); + auto newEntry = createIndexEntry(opCtx, std::move(newDesc), CreateIndexEntryFlags::kIsReady); invariant(newEntry->isReady(opCtx)); CollectionQueryInfo::get(_collection).addedIndex(opCtx, _collection, newEntry->descriptor()); + opCtx->recoveryUnit()->onCommit([newEntry](auto commitTime) { + if (commitTime) { + newEntry->setMinimumVisibleSnapshot(*commitTime); + } + }); + // Return the new descriptor. return newEntry->descriptor(); } diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp index 22a9181f157..5e294acefe4 100644 --- a/src/mongo/db/db_raii.cpp +++ b/src/mongo/db/db_raii.cpp @@ -36,7 +36,6 @@ #include "mongo/db/catalog/database_holder.h" #include "mongo/db/concurrency/locker.h" #include "mongo/db/curop.h" -#include "mongo/db/db_raii_gen.h" #include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/s/collection_sharding_state.h" #include "mongo/db/storage/snapshot_helper.h" @@ -49,7 +48,6 @@ const boost::optional kDoNotChangeProfilingLevel = boost::none; // TODO: SERVER-44105 remove // If set to false, secondary reads should wait behind the PBW lock. -// Does nothing if gAllowSecondaryReadsDuringBatchApplication setting is false. const auto allowSecondaryReadsDuringBatchApplication_DONT_USE = OperationContext::declareDecoration>(); @@ -94,10 +92,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, // i.e. the caller does not currently have a ShouldNotConflict... block in scope. bool callerWasConflicting = opCtx->lockState()->shouldConflictWithSecondaryBatchApplication(); - // Don't take the ParallelBatchWriterMode lock when the server parameter is set and our - // storage engine supports snapshot reads. - if (gAllowSecondaryReadsDuringBatchApplication.load() && - allowSecondaryReadsDuringBatchApplication_DONT_USE(opCtx).value_or(true) && + if (allowSecondaryReadsDuringBatchApplication_DONT_USE(opCtx).value_or(true) && opCtx->getServiceContext()->getStorageEngine()->supportsReadConcernSnapshot()) { _shouldNotConflictWithSecondaryBatchApplicationBlock.emplace(opCtx->lockState()); } @@ -123,11 +118,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, // consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to // guarantee we read at a consistent state, so we must read at the lastApplied timestamp, // which is set after each complete batch. - // - // If an attempt to read at the lastApplied timestamp is unsuccessful because there are - // pending catalog changes that occur after that timestamp, we release our locks and try - // again with the PBWM lock (by unsetting - // _shouldNotConflictWithSecondaryBatchApplicationBlock). const NamespaceString nss = coll->ns(); auto readSource = opCtx->recoveryUnit()->getTimestampReadSource(); @@ -175,7 +165,8 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, !nss.mustBeAppliedInOwnOplogBatch() && SnapshotHelper::shouldReadAtLastApplied(opCtx, nss)) { LOGV2_FATAL(4728700, - "Reading from replicated collection without read timestamp or PBWM lock", + "Reading from replicated collection on a secondary without read timestamp " + "or PBWM lock", "collection"_attr = nss); } @@ -208,39 +199,28 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, _autoColl = boost::none; // If there are pending catalog changes when using a no-overlap or lastApplied read source, - // we choose to take the PBWM lock to conflict with any in-progress batches. This prevents - // us from idly spinning in this loop trying to get a new read timestamp ahead of the - // minimum visible snapshot. This helps us guarantee liveness (i.e. we can eventually get a - // suitable read timestamp) but should not be necessary for correctness. After initial sync - // finishes, if we waited instead of retrying, readers would block indefinitely waiting for - // their read timstamp to move forward. Instead we force the reader take the PBWM lock and - // retry. + // we yield to get a new read timestamp ahead of the minimum visible snapshot. if (readSource == RecoveryUnit::ReadSource::kLastApplied || readSource == RecoveryUnit::ReadSource::kNoOverlap) { invariant(readTimestamp); LOGV2(20576, "Tried reading at a timestamp, but future catalog changes are pending. " - "Trying again without reading at a timestamp", + "Trying again", "readTimestamp"_attr = *readTimestamp, "collection"_attr = nss.ns(), "collectionMinSnapshot"_attr = *minSnapshot); - // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the - // previous value. If the previous value is false (because there is another - // shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this - // does not take the PBWM lock. - _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none; - - // As alluded to above, if we are AutoGetting multiple collections, it - // is possible that our "reaquire the PBWM" trick doesn't work, since we've already done + + // If we are AutoGetting multiple collections, it is possible that we've already done // some reads and locked in our snapshot. At this point, the only way out is to fail // the operation. The client application will need to retry. uassert( ErrorCodes::SnapshotUnavailable, str::stream() << "Unable to read from a snapshot due to pending collection catalog " - "changes; please retry the operation. Snapshot timestamp is " + "changes and holding multiple collection locks; please retry the " + "operation. Snapshot timestamp is " << readTimestamp->toString() << ". Collection minimum is " << minSnapshot->toString(), - opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()); + !opCtx->lockState()->isLocked()); // Abandon our snapshot. We may select a new read timestamp or ReadSource in the next // loop iteration. diff --git a/src/mongo/db/db_raii.idl b/src/mongo/db/db_raii.idl deleted file mode 100644 index d04759f18bc..00000000000 --- a/src/mongo/db/db_raii.idl +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2019-present MongoDB, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the Server Side Public License, version 1, -# as published by MongoDB, Inc. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Server Side Public License for more details. -# -# You should have received a copy of the Server Side Public License -# along with this program. If not, see -# . -# -# As a special exception, the copyright holders give permission to link the -# code of portions of this program with the OpenSSL library under certain -# conditions as described in each individual source file and distribute -# linked combinations including the program with the OpenSSL library. You -# must comply with the Server Side Public License in all respects for -# all of the code used other than as permitted herein. If you modify file(s) -# with this exception, you may extend this exception to your version of the -# file(s), but you are not obligated to do so. If you do not wish to do so, -# delete this exception statement from your version. If you delete this -# exception statement from all source files in the program, then also delete -# it in the license file. -# - -global: - cpp_namespace: "mongo" - -server_parameters: - allowSecondaryReadsDuringBatchApplication: - description: 'If true, do not take PBWM lock in AutoGetCollectionForRead on secondaries during batch application.' - set_at: [startup, runtime] - cpp_vartype: AtomicWord - cpp_varname: gAllowSecondaryReadsDuringBatchApplication - default: true diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp index eba322c5581..a9239f0a92e 100644 --- a/src/mongo/db/db_raii_test.cpp +++ b/src/mongo/db/db_raii_test.cpp @@ -235,10 +235,9 @@ TEST_F(DBRAIITestFixture, // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock // class and does not update the correct state in the SnapshotManager. - repl::OpTime opTime(Timestamp(200, 1), 1); auto snapshotManager = client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager(); - snapshotManager->setLastApplied(opTime.getTimestamp()); + snapshotManager->setLastApplied(replCoord->getMyLastAppliedOpTime().getTimestamp()); Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX); ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX)); @@ -310,14 +309,24 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) { // Simulate using a DBDirectClient to test this behavior for user reads. client1.first->setInDirectClient(true); - AutoGetCollectionForRead coll(client1.second.get(), nss); - // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort - // to taking the PBWM lock and reading without a timestamp. + auto timeoutError = client1.second->getTimeoutError(); + auto waitForLock = [&] { + auto deadline = Date_t::now() + Milliseconds(10); + client1.second->runWithDeadline(deadline, timeoutError, [&] { + AutoGetCollectionForRead coll(client1.second.get(), nss); + }); + }; + + // Expect that the lock acquisition eventually times out because lastApplied is not advancing. + ASSERT_THROWS_CODE(waitForLock(), DBException, timeoutError); + + // Advance lastApplied and ensure the lock acquisition succeeds. + snapshotManager->setLastApplied(replCoord->getMyLastAppliedOpTime().getTimestamp()); + + AutoGetCollectionForRead coll(client1.second.get(), nss); ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), - RecoveryUnit::ReadSource::kNoTimestamp); - ASSERT_TRUE(client1.second.get()->lockState()->isLockHeldForMode( - resourceIdParallelBatchWriterMode, MODE_IS)); + RecoveryUnit::ReadSource::kLastApplied); } TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) { diff --git a/src/mongo/db/repair.cpp b/src/mongo/db/repair.cpp index 0b3dfca8ebb..950e0b8d9aa 100644 --- a/src/mongo/db/repair.cpp +++ b/src/mongo/db/repair.cpp @@ -152,7 +152,7 @@ Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std: databaseHolder->close(opCtx, dbName); // Reopening db is necessary for repairCollections. - auto db = databaseHolder->openDb(opCtx, dbName); + databaseHolder->openDb(opCtx, dbName); auto status = repairCollections(opCtx, engine, dbName); if (!status.isOK()) { @@ -167,19 +167,6 @@ Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std: // Ensure that we don't trigger an exception when attempting to take locks. UninterruptibleLockGuard noInterrupt(opCtx->lockState()); - // Set the minimum snapshot for all Collections in this db. This ensures that readers - // using majority readConcern level can only use the collections after their repaired - // versions are in the committed view. - const auto currentTime = VectorClock::get(opCtx)->getTime(); - - for (auto collIt = db->begin(opCtx); collIt != db->end(opCtx); ++collIt) { - auto collection = - collIt.getWritableCollection(opCtx, CollectionCatalog::LifetimeMode::kInplace); - if (collection) { - collection->setMinimumVisibleSnapshot(currentTime.clusterTime().asTimestamp()); - } - } - // Restore oplog Collection pointer cache. repl::acquireOplogCollectionForLogging(opCtx); } catch (const ExceptionFor&) { diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index ea2e13f3a0f..c8c87173421 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -956,7 +956,7 @@ StatusWith StorageEngineImpl::recoverToStableTimestamp(OperationConte return swTimestamp; } - catalog::openCatalog(opCtx, state); + catalog::openCatalog(opCtx, state, swTimestamp.getValue()); LOGV2(22259, "recoverToStableTimestamp successful", diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index e7c007b5835..a70b5b42402 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -1618,9 +1618,8 @@ public: AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX); - // It is not valid to read the multikey state earlier than the 'minimumVisibleTimestamp', - // so at least assert that it has been updated due to the index creation. - ASSERT_GT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(), + // Ensure minimumVisible has not been updated due to the index creation. + ASSERT_LT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(), pastTime.asTimestamp()); // Reading the multikey state before 'insertTime0' is not valid or reliable to test. If the -- cgit v1.2.1