summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Williams <louis.williams@mongodb.com>2020-09-22 16:03:46 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-09-23 16:49:17 +0000
commit669e5650151733738ce8270e5bdf3c5759665316 (patch)
tree40e8d2b26c236fd3effe64d5fd04b7a05162f1ec
parent0e044981649f6712b3cc3f39bdb41c6e06546815 (diff)
downloadmongo-669e5650151733738ce8270e5bdf3c5759665316.tar.gz
SERVER-47866 Secondary readers do not need to reacquire PBWM lock if there are catalog conflicts
-rw-r--r--jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js4
-rw-r--r--jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js4
-rw-r--r--jstests/libs/global_snapshot_reads_util.js10
-rw-r--r--jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js1
-rw-r--r--jstests/noPassthrough/out_majority_read_replset.js20
-rw-r--r--jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js20
-rw-r--r--jstests/noPassthrough/read_majority.js70
-rw-r--r--jstests/replsets/minimum_visible_with_cluster_time.js21
-rw-r--r--jstests/replsets/read_committed_with_catalog_changes.js27
-rw-r--r--src/mongo/db/SConscript1
-rw-r--r--src/mongo/db/catalog/catalog_control.cpp18
-rw-r--r--src/mongo/db/catalog/catalog_control.h4
-rw-r--r--src/mongo/db/catalog/catalog_control_test.cpp2
-rw-r--r--src/mongo/db/catalog/index_build_block.cpp18
-rw-r--r--src/mongo/db/catalog/index_catalog_impl.cpp19
-rw-r--r--src/mongo/db/db_raii.cpp40
-rw-r--r--src/mongo/db/db_raii.idl38
-rw-r--r--src/mongo/db/db_raii_test.cpp25
-rw-r--r--src/mongo/db/repair.cpp15
-rw-r--r--src/mongo/db/storage/storage_engine_impl.cpp2
-rw-r--r--src/mongo/dbtests/storage_timestamp_tests.cpp5
21 files changed, 202 insertions, 162 deletions
diff --git a/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js b/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js
index 07d1b578109..c6bf1e808bd 100644
--- a/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js
+++ b/jstests/concurrency/fsm_workloads/snapshot_read_at_cluster_time_ddl_operations.js
@@ -2,8 +2,7 @@
/**
* Perform point-in-time snapshot reads that span a 'find' and multiple 'getmore's concurrently with
- * CRUD operations. Index operations running concurrently with the snapshot read may cause
- * the read to fail with a SnapshotUnavailable error.
+ * CRUD operations.
*
* @tags: [creates_background_indexes, requires_fcv_47, requires_replication,
* does_not_support_causal_consistency, requires_majority_read_concern]
@@ -17,7 +16,6 @@ var $config = (function() {
snapshotScan: function snapshotScan(db, collName) {
const readErrorCodes = [
- ErrorCodes.SnapshotUnavailable,
ErrorCodes.ShutdownInProgress,
ErrorCodes.CursorNotFound,
ErrorCodes.QueryPlanKilled,
diff --git a/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js b/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js
index 87a0c2f0515..5b89d01d29f 100644
--- a/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js
+++ b/jstests/concurrency/fsm_workloads/snapshot_read_catalog_operations.js
@@ -5,8 +5,7 @@
* snapshot reads and CRUD operations will all contend for locks on db and collName. Since the
* snapshot read does not release its locks until the transaction is committed, it is expected that
* once the read has begun, catalog operations with conflicting locks will block until the read is
- * finished. Additionally, index operations running concurrently with the snapshot read may cause
- * the read to fail with a SnapshotUnavailable error.
+ * finished.
*
* @tags: [creates_background_indexes, uses_transactions]
*/
@@ -32,7 +31,6 @@ var $config = (function() {
const sortByAscending = sortOptions[Random.randInt(2)];
const readErrorCodes = [
ErrorCodes.NoSuchTransaction,
- ErrorCodes.SnapshotUnavailable,
ErrorCodes.SnapshotTooOld,
ErrorCodes.StaleChunkHistory,
ErrorCodes.LockTimeout,
diff --git a/jstests/libs/global_snapshot_reads_util.js b/jstests/libs/global_snapshot_reads_util.js
index 10f8f9ff74d..658141ebdbd 100644
--- a/jstests/libs/global_snapshot_reads_util.js
+++ b/jstests/libs/global_snapshot_reads_util.js
@@ -138,9 +138,17 @@ function SnapshotReadsTest({primaryDB, secondaryDB, awaitCommittedFn}) {
// This update is not visible to reads at insertTimestamp.
res = assert.commandWorked(primaryDB.runCommand(
{update: collName, updates: [{q: {}, u: {$set: {x: true}}, multi: true}]}));
-
jsTestLog(`Updated collection "${collName}" at timestamp ${
tojson(res.operationTime)}`);
+
+ awaitCommittedFn(db, res.operationTime);
+
+ // This index is not visible to reads at insertTimestamp and does not cause the
+ // operation to fail.
+ res = assert.commandWorked(primaryDB.runCommand(
+ {createIndexes: collName, indexes: [{key: {x: 1}, name: 'x_1'}]}));
+ jsTestLog(`Created an index on collection "${collName}" at timestamp ${
+ tojson(res.operationTime)}`);
awaitCommittedFn(db, res.operationTime);
// Retrieve the rest of the read command's result set.
diff --git a/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js b/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js
index c47922aeb68..6b207866ed7 100644
--- a/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js
+++ b/jstests/noPassthrough/index_stepdown_abort_prepare_conflict.js
@@ -100,6 +100,7 @@ assert.commandWorked(newSession.abortTransaction_forTesting());
IndexBuildTest.waitForIndexBuildToStop(newPrimary.getDB(dbName), collName, indexName);
IndexBuildTest.waitForIndexBuildToStop(primary.getDB(dbName), collName, indexName);
+rst.awaitReplication();
IndexBuildTest.assertIndexes(newPrimary.getDB(dbName).getCollection(collName), 1, ["_id_"], []);
IndexBuildTest.assertIndexes(primaryColl, 1, ["_id_"], []);
diff --git a/jstests/noPassthrough/out_majority_read_replset.js b/jstests/noPassthrough/out_majority_read_replset.js
index 0ffba741b3e..4ca9bd83e0a 100644
--- a/jstests/noPassthrough/out_majority_read_replset.js
+++ b/jstests/noPassthrough/out_majority_read_replset.js
@@ -28,12 +28,22 @@ rst.awaitLastOpCommitted();
stopReplicationOnSecondaries(rst);
-// Create the index that is not majority commited
-// This test create indexes with majority of nodes not available for replication. So, disabling
-// index build commit quorum.
+// Rename the collection temporarily and then back to its original name. This advances the minimum
+// visible snapshot and forces the $out to block until its snapshot advances.
+const tempColl = db.getName() + '.temp';
+assert.commandWorked(db.adminCommand({
+ renameCollection: sourceColl.getFullName(),
+ to: tempColl,
+}));
+assert.commandWorked(db.adminCommand({
+ renameCollection: tempColl,
+ to: sourceColl.getFullName(),
+}));
+
+// Create the index that is not majority committed
assert.commandWorked(sourceColl.createIndex({state: 1}, {name: "secondIndex"}, 0));
-// Run the $out in the parallel shell as it will block in the metadata until the shapshot is
+// Run the $out in the parallel shell as it will block in the metadata until the snapshot is
// advanced.
const awaitShell = startParallelShell(`{
const testDB = db.getSiblingDB("${name}");
@@ -58,7 +68,7 @@ assert.soon(function() {
return assert.commandWorked(db.currentOp(filter)).inprog.length === 1;
});
-// Restart data replicaiton and wait until the new write becomes visible.
+// Restart data replication and wait until the new write becomes visible.
restartReplicationOnSecondaries(rst);
rst.awaitLastOpCommitted();
diff --git a/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js b/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js
index b6cc5d4b21d..fb296a9d29e 100644
--- a/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js
+++ b/jstests/noPassthrough/read_concern_snapshot_catalog_invalidation.js
@@ -42,13 +42,19 @@ function testCommand(cmd, curOpFilter) {
waitForCurOpByFailPointNoNS(testDB, "hangAfterPreallocateSnapshot", curOpFilter);
- // Create an index on the collection the command was executed against. This will move the
- // collection's minimum visible timestamp to a point later than the point-in-time referenced
- // by the transaction snapshot.
- assert.commandWorked(testDB.runCommand({
- createIndexes: kCollName,
- indexes: [{key: {x: 1}, name: "x_1"}],
- writeConcern: {w: "majority"}
+ // Rename the collection the command was executed against and then back to its original name.
+ // This will move the collection's minimum visible timestamp to a point later than the
+ // point-in-time referenced by the transaction snapshot.
+ const tempColl = testDB.getName() + '.temp';
+ assert.commandWorked(testDB.adminCommand({
+ renameCollection: testDB.getName() + '.' + kCollName,
+ to: tempColl,
+ writeConcern: {w: "majority"},
+ }));
+ assert.commandWorked(testDB.adminCommand({
+ renameCollection: tempColl,
+ to: testDB.getName() + '.' + kCollName,
+ writeConcern: {w: "majority"},
}));
// Disable the hang and check for parallel shell success. Success indicates that the command
diff --git a/jstests/noPassthrough/read_majority.js b/jstests/noPassthrough/read_majority.js
index 12a2c991641..227e04e7dc4 100644
--- a/jstests/noPassthrough/read_majority.js
+++ b/jstests/noPassthrough/read_majority.js
@@ -53,6 +53,13 @@ function testReadConcernLevel(level) {
assert.eq(res.code, ErrorCodes.MaxTimeMSExpired);
}
+ function assertNoSnapshotAvailableForReadConcernLevelByUUID(uuid) {
+ var res =
+ db.runCommand({find: uuid, batchSize: 2, readConcern: {level: level}, maxTimeMS: 1000});
+ assert.commandFailed(res);
+ assert.eq(res.code, ErrorCodes.MaxTimeMSExpired);
+ }
+
function getCursorForReadConcernLevel() {
var res = t.runCommand('find', {batchSize: 2, readConcern: {level: level}});
assert.commandWorked(res);
@@ -145,30 +152,77 @@ function testReadConcernLevel(level) {
assert.eq(cursor.next().version, 4);
assert.eq(cursor.next().version, 4);
- // Adding an index bumps the min snapshot for a collection as of SERVER-20260. This may
- // change to just filter that index out from query planning as part of SERVER-20439.
+ // Adding an index does not bump the min snapshot for a collection. Collection scans are
+ // possible, however the index is not guaranteed to be usable until the majority-committed
+ // snapshot advances.
t.createIndex({version: 1}, {}, 0);
- assertNoSnapshotAvailableForReadConcernLevel();
+ assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+ assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
// To use the index, a snapshot created after the index was completed must be marked
// committed.
var newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name;
- assertNoSnapshotAvailableForReadConcernLevel();
assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot}));
assert.eq(getCursorForReadConcernLevel().itcount(), 10);
assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
assert(isIxscan(db, getExplainPlan({version: 1})));
- // Dropping an index does bump the min snapshot.
+ // Dropping an index does not bump the min snapshot, so the query should succeed.
t.dropIndex({version: 1});
- assertNoSnapshotAvailableForReadConcernLevel();
+ assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+ assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
+ assert(isCollscan(db, getExplainPlan({version: 1})));
- // To use the collection again, a snapshot created after the dropIndex must be marked
- // committed.
newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name;
+ assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot}));
+ assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+ assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
+ assert(isCollscan(db, getExplainPlan({version: 1})));
+
+ // Get the UUID before renaming.
+ const collUuid = (() => {
+ const collectionInfos =
+ assert.commandWorked(db.runCommand({listCollections: 1})).cursor.firstBatch;
+ assert.eq(1, collectionInfos.length);
+ const info = collectionInfos[0];
+ assert.eq(t.getName(), info.name);
+ return info.info.uuid;
+ })();
+ assert(collUuid);
+
+ // Get a cursor before renaming.
+ cursor = getCursorForReadConcernLevel(); // Note: uses batchsize=2.
+ assert.eq(cursor.next().version, 4);
+ assert.eq(cursor.next().version, 4);
+ assert(!cursor.objsLeftInBatch());
+
+ // Even though renaming advances the minimum visible snapshot, we're querying by a namespace
+ // that no longer exists. Because of this, the query surprisingly returns no results instead of
+ // timing out. This violates read-committed semantics but is allowed by the current
+ // specification.
+ const tempNs = db.getName() + '.temp';
+ assert.commandWorked(db.adminCommand({renameCollection: t.getFullName(), to: tempNs}));
+ assert.eq(getCursorForReadConcernLevel().itcount(), 0);
+
+ // Trigger a getMore that should fail due to the rename.
+ let error = assert.throws(() => {
+ cursor.next();
+ });
+ assert.eq(error.code, ErrorCodes.QueryPlanKilled);
+
+ // Starting a new query by UUID will block because the minimum visible timestamp is ahead of the
+ // majority-committed snapshot.
+ assertNoSnapshotAvailableForReadConcernLevelByUUID(collUuid);
+
+ // Renaming back will cause queries to block again because the original namespace exists, and
+ // its minimum visible timestamp is ahead of the current majority-committed snapshot.
+ assert.commandWorked(db.adminCommand({renameCollection: tempNs, to: t.getFullName()}));
assertNoSnapshotAvailableForReadConcernLevel();
+
+ newSnapshot = assert.commandWorked(db.adminCommand("makeSnapshot")).name;
assert.commandWorked(db.adminCommand({"setCommittedSnapshot": newSnapshot}));
assert.eq(getCursorForReadConcernLevel().itcount(), 10);
+ assert.eq(getAggCursorForReadConcernLevel().itcount(), 10);
// Dropping the collection is visible in the committed snapshot, even though it hasn't been
// marked committed yet. This is allowed by the current specification even though it
diff --git a/jstests/replsets/minimum_visible_with_cluster_time.js b/jstests/replsets/minimum_visible_with_cluster_time.js
index 58797e9e47f..7bbbff02522 100644
--- a/jstests/replsets/minimum_visible_with_cluster_time.js
+++ b/jstests/replsets/minimum_visible_with_cluster_time.js
@@ -73,23 +73,38 @@ for (let i = 0; i < 10; i++) {
assert.commandWorked(
coll.createIndex({x: 1}, {'name': 'x_1', 'expireAfterSeconds': 60 * 60 * 23}));
- doMajorityRead(coll, 1);
+ // Majority read should eventually see new documents because it will not block on the index
+ // build.
+ assert.soonNoExcept(() => {
+ doMajorityRead(coll, 1);
+ return true;
+ });
assert.commandWorked(coll.insert({x: 7, y: 2}));
assert.commandWorked(coll.runCommand(
'collMod', {'index': {'keyPattern': {x: 1}, 'expireAfterSeconds': 60 * 60 * 24}}));
- doMajorityRead(coll, 2);
+ // Majority read should eventually see new documents because it will not block on the index
+ // build.
+ assert.soonNoExcept(() => {
+ doMajorityRead(coll, 2);
+ return true;
+ });
assert.commandWorked(coll.insert({x: 7, y: 3}));
assert.commandWorked(coll.dropIndexes());
- doMajorityRead(coll, 3);
+ // Majority read should eventually see new documents because it will not block on the drop.
+ assert.soonNoExcept(() => {
+ doMajorityRead(coll, 3);
+ return true;
+ });
assert.commandWorked(coll.insert({x: 7, y: 4}));
const newCollNameI = collNameI + '_new';
assert.commandWorked(coll.renameCollection(newCollNameI));
coll = primary.getDB(dbName).getCollection(newCollNameI);
+ // Majority read should immediately see new documents because it blocks on the rename.
doMajorityRead(coll, 4);
}
diff --git a/jstests/replsets/read_committed_with_catalog_changes.js b/jstests/replsets/read_committed_with_catalog_changes.js
index 1afbaa40f10..d314187e561 100644
--- a/jstests/replsets/read_committed_with_catalog_changes.js
+++ b/jstests/replsets/read_committed_with_catalog_changes.js
@@ -21,7 +21,10 @@
* - reindex collection
* - compact collection
*
- * @tags: [requires_majority_read_concern]
+ * @tags: [
+ * requires_fcv_47,
+ * requires_majority_read_concern,
+ * ]
*/
load("jstests/libs/parallelTester.js"); // For Thread.
@@ -140,8 +143,22 @@ const testCases = {
// So, disabling index build commit quorum.
assert.commandWorked(db.coll.createIndex({x: 1}, {}, 0));
},
- blockedCollections: ['coll'],
- unblockedCollections: ['other'],
+ blockedCollections: [],
+ unblockedCollections: ['coll', 'other'],
+ },
+ collMod: {
+ prepare: function(db) {
+ // This test create indexes with majority of nodes not available for replication.
+ // So, disabling index build commit quorum.
+ assert.commandWorked(db.coll.createIndex({x: 1}, {expireAfterSeconds: 60 * 60}, 0));
+ assert.commandWorked(db.coll.insert({_id: 1, x: 1}));
+ },
+ performOp: function(db) {
+ assert.commandWorked(db.coll.runCommand(
+ 'collMod', {index: {keyPattern: {x: 1}, expireAfterSeconds: 60 * 61}}));
+ },
+ blockedCollections: [],
+ unblockedCollections: ['coll'],
},
dropIndex: {
prepare: function(db) {
@@ -155,8 +172,8 @@ const testCases = {
performOp: function(db) {
assert.commandWorked(db.coll.dropIndex({x: 1}));
},
- blockedCollections: ['coll'],
- unblockedCollections: ['other'],
+ blockedCollections: [],
+ unblockedCollections: ['coll', 'other'],
},
// Remaining case is a local-only operation.
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 66aa6d8cf13..2ad4e518b49 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -612,7 +612,6 @@ env.Library(
target='db_raii',
source=[
'db_raii.cpp',
- env.Idlc('db_raii.idl')[0]
],
LIBDEPS=[
'catalog_raii',
diff --git a/src/mongo/db/catalog/catalog_control.cpp b/src/mongo/db/catalog/catalog_control.cpp
index a81bf952f4c..2b243b53c08 100644
--- a/src/mongo/db/catalog/catalog_control.cpp
+++ b/src/mongo/db/catalog/catalog_control.cpp
@@ -104,7 +104,9 @@ MinVisibleTimestampMap closeCatalog(OperationContext* opCtx) {
return minVisibleTimestampMap;
}
-void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisibleTimestampMap) {
+void openCatalog(OperationContext* opCtx,
+ const MinVisibleTimestampMap& minVisibleTimestampMap,
+ Timestamp stableTimestamp) {
invariant(opCtx->lockState()->isW());
// Load the catalog in the storage engine.
@@ -197,8 +199,18 @@ void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& minVisib
<< "failed to get valid collection pointer for namespace " << collNss);
if (minVisibleTimestampMap.count(collection->uuid()) > 0) {
- collection->setMinimumVisibleSnapshot(
- minVisibleTimestampMap.find(collection->uuid())->second);
+ // After rolling back to a stable timestamp T, the minimum visible timestamp for
+ // each collection must be reset to (at least) its value at T. Additionally, there
+ // cannot exist a minimum visible timestamp greater than lastApplied. This allows us
+ // to upper bound what the minimum visible timestamp can be coming out of rollback.
+ //
+ // Because we only save the latest minimum visible timestamp for each collection, we
+ // bound the minimum visible timestamp (where necessary) to the stable timestamp.
+ // The benefit of fine grained tracking is assumed to be low-value compared to the
+ // cost/effort.
+ auto minVisible = std::min(stableTimestamp,
+ minVisibleTimestampMap.find(collection->uuid())->second);
+ collection->setMinimumVisibleSnapshot(minVisible);
}
// If this is the oplog collection, re-establish the replication system's cached pointer
diff --git a/src/mongo/db/catalog/catalog_control.h b/src/mongo/db/catalog/catalog_control.h
index ab6ee87e8f4..527e1725c07 100644
--- a/src/mongo/db/catalog/catalog_control.h
+++ b/src/mongo/db/catalog/catalog_control.h
@@ -50,6 +50,8 @@ MinVisibleTimestampMap closeCatalog(OperationContext* opCtx);
*
* Must be called with the global lock acquired in exclusive mode.
*/
-void openCatalog(OperationContext* opCtx, const MinVisibleTimestampMap& catalogState);
+void openCatalog(OperationContext* opCtx,
+ const MinVisibleTimestampMap& catalogState,
+ Timestamp stableTimestamp);
} // namespace catalog
} // namespace mongo
diff --git a/src/mongo/db/catalog/catalog_control_test.cpp b/src/mongo/db/catalog/catalog_control_test.cpp
index b0877a70def..602b8c26a2b 100644
--- a/src/mongo/db/catalog/catalog_control_test.cpp
+++ b/src/mongo/db/catalog/catalog_control_test.cpp
@@ -75,7 +75,7 @@ TEST_F(CatalogControlTest, CloseAndOpenCatalog) {
OperationContextNoop opCtx(&cc(), 0);
auto map = catalog::closeCatalog(&opCtx);
ASSERT_EQUALS(0U, map.size());
- catalog::openCatalog(&opCtx, {});
+ catalog::openCatalog(&opCtx, {}, Timestamp());
}
} // namespace
diff --git a/src/mongo/db/catalog/index_build_block.cpp b/src/mongo/db/catalog/index_build_block.cpp
index 8226c415c4d..d49a71bbc7e 100644
--- a/src/mongo/db/catalog/index_build_block.cpp
+++ b/src/mongo/db/catalog/index_build_block.cpp
@@ -159,7 +159,6 @@ Status IndexBuildBlock::init(OperationContext* opCtx, Collection* collection) {
// This will prevent the unfinished index from being visible on index iterators.
if (commitTime) {
entry->setMinimumVisibleSnapshot(commitTime.get());
- coll->setMinimumVisibleSnapshot(commitTime.get());
}
});
}
@@ -223,15 +222,6 @@ void IndexBuildBlock::success(OperationContext* opCtx, Collection* collection) {
// Note: this runs after the WUOW commits but before we release our X lock on the
// collection. This means that any snapshot created after this must include the full
// index, and no one can try to read this index before we set the visibility.
- if (!commitTime) {
- // The end of background index builds on secondaries does not get a commit
- // timestamp. We use the cluster time since it's guaranteed to be greater than the
- // time of the index build. It is possible the cluster time could be in the future,
- // and we will need to do another write to reach the minimum visible snapshot.
- const auto currentTime = VectorClock::get(svcCtx)->getTime();
- commitTime = currentTime.clusterTime().asTimestamp();
- }
-
LOGV2(20345,
"Index build: done building index {indexName} on ns {nss}",
"Index build: done building",
@@ -240,11 +230,9 @@ void IndexBuildBlock::success(OperationContext* opCtx, Collection* collection) {
"index"_attr = indexName,
"commitTimestamp"_attr = commitTime);
- entry->setMinimumVisibleSnapshot(commitTime.get());
- // We must also set the minimum visible snapshot on the collection like during init().
- // This prevents reads in the past from reading inconsistent metadata. We should be
- // able to remove this when the catalog is versioned.
- coll->setMinimumVisibleSnapshot(commitTime.get());
+ if (commitTime) {
+ entry->setMinimumVisibleSnapshot(commitTime.get());
+ }
// Add the index to the TTLCollectionCache upon successfully committing the index build.
if (spec.hasField(IndexDescriptor::kExpireAfterSecondsFieldName)) {
diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp
index 0a1254805c8..8874008e553 100644
--- a/src/mongo/db/catalog/index_catalog_impl.cpp
+++ b/src/mongo/db/catalog/index_catalog_impl.cpp
@@ -1020,17 +1020,7 @@ public:
: _opCtx(opCtx), _collection(collection), _entries(entries), _entry(std::move(entry)) {}
void commit(boost::optional<Timestamp> commitTime) final {
- // Ban reading from this collection on committed reads on snapshots before now.
- if (!commitTime) {
- // This is called when we refresh the index catalog entry, which does not always have
- // a commit timestamp. We use the cluster time since it's guaranteed to be greater
- // than the time of the index removal. It is possible the cluster time could be in the
- // future, and we will need to do another write to reach the minimum visible snapshot.
- const auto currentTime = VectorClock::get(_opCtx)->getTime();
- commitTime = currentTime.clusterTime().asTimestamp();
- }
_entry->setDropped();
- _collection->setMinimumVisibleSnapshot(commitTime.get());
}
void rollback() final {
@@ -1318,11 +1308,16 @@ const IndexDescriptor* IndexCatalogImpl::refreshEntry(OperationContext* opCtx,
// to the CollectionQueryInfo.
auto newDesc =
std::make_unique<IndexDescriptor>(_collection, _getAccessMethodName(keyPattern), spec);
- const IndexCatalogEntry* newEntry =
- createIndexEntry(opCtx, std::move(newDesc), CreateIndexEntryFlags::kIsReady);
+ auto newEntry = createIndexEntry(opCtx, std::move(newDesc), CreateIndexEntryFlags::kIsReady);
invariant(newEntry->isReady(opCtx));
CollectionQueryInfo::get(_collection).addedIndex(opCtx, _collection, newEntry->descriptor());
+ opCtx->recoveryUnit()->onCommit([newEntry](auto commitTime) {
+ if (commitTime) {
+ newEntry->setMinimumVisibleSnapshot(*commitTime);
+ }
+ });
+
// Return the new descriptor.
return newEntry->descriptor();
}
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index 22a9181f157..5e294acefe4 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -36,7 +36,6 @@
#include "mongo/db/catalog/database_holder.h"
#include "mongo/db/concurrency/locker.h"
#include "mongo/db/curop.h"
-#include "mongo/db/db_raii_gen.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/s/collection_sharding_state.h"
#include "mongo/db/storage/snapshot_helper.h"
@@ -49,7 +48,6 @@ const boost::optional<int> kDoNotChangeProfilingLevel = boost::none;
// TODO: SERVER-44105 remove
// If set to false, secondary reads should wait behind the PBW lock.
-// Does nothing if gAllowSecondaryReadsDuringBatchApplication setting is false.
const auto allowSecondaryReadsDuringBatchApplication_DONT_USE =
OperationContext::declareDecoration<boost::optional<bool>>();
@@ -94,10 +92,7 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
// i.e. the caller does not currently have a ShouldNotConflict... block in scope.
bool callerWasConflicting = opCtx->lockState()->shouldConflictWithSecondaryBatchApplication();
- // Don't take the ParallelBatchWriterMode lock when the server parameter is set and our
- // storage engine supports snapshot reads.
- if (gAllowSecondaryReadsDuringBatchApplication.load() &&
- allowSecondaryReadsDuringBatchApplication_DONT_USE(opCtx).value_or(true) &&
+ if (allowSecondaryReadsDuringBatchApplication_DONT_USE(opCtx).value_or(true) &&
opCtx->getServiceContext()->getStorageEngine()->supportsReadConcernSnapshot()) {
_shouldNotConflictWithSecondaryBatchApplicationBlock.emplace(opCtx->lockState());
}
@@ -123,11 +118,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
// consistent time, so not taking the PBWM lock is not a problem. On secondaries, we have to
// guarantee we read at a consistent state, so we must read at the lastApplied timestamp,
// which is set after each complete batch.
- //
- // If an attempt to read at the lastApplied timestamp is unsuccessful because there are
- // pending catalog changes that occur after that timestamp, we release our locks and try
- // again with the PBWM lock (by unsetting
- // _shouldNotConflictWithSecondaryBatchApplicationBlock).
const NamespaceString nss = coll->ns();
auto readSource = opCtx->recoveryUnit()->getTimestampReadSource();
@@ -175,7 +165,8 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
!nss.mustBeAppliedInOwnOplogBatch() &&
SnapshotHelper::shouldReadAtLastApplied(opCtx, nss)) {
LOGV2_FATAL(4728700,
- "Reading from replicated collection without read timestamp or PBWM lock",
+ "Reading from replicated collection on a secondary without read timestamp "
+ "or PBWM lock",
"collection"_attr = nss);
}
@@ -208,39 +199,28 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx,
_autoColl = boost::none;
// If there are pending catalog changes when using a no-overlap or lastApplied read source,
- // we choose to take the PBWM lock to conflict with any in-progress batches. This prevents
- // us from idly spinning in this loop trying to get a new read timestamp ahead of the
- // minimum visible snapshot. This helps us guarantee liveness (i.e. we can eventually get a
- // suitable read timestamp) but should not be necessary for correctness. After initial sync
- // finishes, if we waited instead of retrying, readers would block indefinitely waiting for
- // their read timstamp to move forward. Instead we force the reader take the PBWM lock and
- // retry.
+ // we yield to get a new read timestamp ahead of the minimum visible snapshot.
if (readSource == RecoveryUnit::ReadSource::kLastApplied ||
readSource == RecoveryUnit::ReadSource::kNoOverlap) {
invariant(readTimestamp);
LOGV2(20576,
"Tried reading at a timestamp, but future catalog changes are pending. "
- "Trying again without reading at a timestamp",
+ "Trying again",
"readTimestamp"_attr = *readTimestamp,
"collection"_attr = nss.ns(),
"collectionMinSnapshot"_attr = *minSnapshot);
- // Destructing the block sets _shouldConflictWithSecondaryBatchApplication back to the
- // previous value. If the previous value is false (because there is another
- // shouldNotConflictWithSecondaryBatchApplicationBlock outside of this function), this
- // does not take the PBWM lock.
- _shouldNotConflictWithSecondaryBatchApplicationBlock = boost::none;
-
- // As alluded to above, if we are AutoGetting multiple collections, it
- // is possible that our "reaquire the PBWM" trick doesn't work, since we've already done
+
+ // If we are AutoGetting multiple collections, it is possible that we've already done
// some reads and locked in our snapshot. At this point, the only way out is to fail
// the operation. The client application will need to retry.
uassert(
ErrorCodes::SnapshotUnavailable,
str::stream() << "Unable to read from a snapshot due to pending collection catalog "
- "changes; please retry the operation. Snapshot timestamp is "
+ "changes and holding multiple collection locks; please retry the "
+ "operation. Snapshot timestamp is "
<< readTimestamp->toString() << ". Collection minimum is "
<< minSnapshot->toString(),
- opCtx->lockState()->shouldConflictWithSecondaryBatchApplication());
+ !opCtx->lockState()->isLocked());
// Abandon our snapshot. We may select a new read timestamp or ReadSource in the next
// loop iteration.
diff --git a/src/mongo/db/db_raii.idl b/src/mongo/db/db_raii.idl
deleted file mode 100644
index d04759f18bc..00000000000
--- a/src/mongo/db/db_raii.idl
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (C) 2019-present MongoDB, Inc.
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the Server Side Public License, version 1,
-# as published by MongoDB, Inc.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# Server Side Public License for more details.
-#
-# You should have received a copy of the Server Side Public License
-# along with this program. If not, see
-# <http://www.mongodb.com/licensing/server-side-public-license>.
-#
-# As a special exception, the copyright holders give permission to link the
-# code of portions of this program with the OpenSSL library under certain
-# conditions as described in each individual source file and distribute
-# linked combinations including the program with the OpenSSL library. You
-# must comply with the Server Side Public License in all respects for
-# all of the code used other than as permitted herein. If you modify file(s)
-# with this exception, you may extend this exception to your version of the
-# file(s), but you are not obligated to do so. If you do not wish to do so,
-# delete this exception statement from your version. If you delete this
-# exception statement from all source files in the program, then also delete
-# it in the license file.
-#
-
-global:
- cpp_namespace: "mongo"
-
-server_parameters:
- allowSecondaryReadsDuringBatchApplication:
- description: 'If true, do not take PBWM lock in AutoGetCollectionForRead on secondaries during batch application.'
- set_at: [startup, runtime]
- cpp_vartype: AtomicWord<bool>
- cpp_varname: gAllowSecondaryReadsDuringBatchApplication
- default: true
diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp
index eba322c5581..a9239f0a92e 100644
--- a/src/mongo/db/db_raii_test.cpp
+++ b/src/mongo/db/db_raii_test.cpp
@@ -235,10 +235,9 @@ TEST_F(DBRAIITestFixture,
// Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock
// class and does not update the correct state in the SnapshotManager.
- repl::OpTime opTime(Timestamp(200, 1), 1);
auto snapshotManager =
client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager();
- snapshotManager->setLastApplied(opTime.getTimestamp());
+ snapshotManager->setLastApplied(replCoord->getMyLastAppliedOpTime().getTimestamp());
Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX);
ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX));
@@ -310,14 +309,24 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) {
// Simulate using a DBDirectClient to test this behavior for user reads.
client1.first->setInDirectClient(true);
- AutoGetCollectionForRead coll(client1.second.get(), nss);
- // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort
- // to taking the PBWM lock and reading without a timestamp.
+ auto timeoutError = client1.second->getTimeoutError();
+ auto waitForLock = [&] {
+ auto deadline = Date_t::now() + Milliseconds(10);
+ client1.second->runWithDeadline(deadline, timeoutError, [&] {
+ AutoGetCollectionForRead coll(client1.second.get(), nss);
+ });
+ };
+
+ // Expect that the lock acquisition eventually times out because lastApplied is not advancing.
+ ASSERT_THROWS_CODE(waitForLock(), DBException, timeoutError);
+
+ // Advance lastApplied and ensure the lock acquisition succeeds.
+ snapshotManager->setLastApplied(replCoord->getMyLastAppliedOpTime().getTimestamp());
+
+ AutoGetCollectionForRead coll(client1.second.get(), nss);
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(),
- RecoveryUnit::ReadSource::kNoTimestamp);
- ASSERT_TRUE(client1.second.get()->lockState()->isLockHeldForMode(
- resourceIdParallelBatchWriterMode, MODE_IS));
+ RecoveryUnit::ReadSource::kLastApplied);
}
TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) {
diff --git a/src/mongo/db/repair.cpp b/src/mongo/db/repair.cpp
index 0b3dfca8ebb..950e0b8d9aa 100644
--- a/src/mongo/db/repair.cpp
+++ b/src/mongo/db/repair.cpp
@@ -152,7 +152,7 @@ Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std:
databaseHolder->close(opCtx, dbName);
// Reopening db is necessary for repairCollections.
- auto db = databaseHolder->openDb(opCtx, dbName);
+ databaseHolder->openDb(opCtx, dbName);
auto status = repairCollections(opCtx, engine, dbName);
if (!status.isOK()) {
@@ -167,19 +167,6 @@ Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std:
// Ensure that we don't trigger an exception when attempting to take locks.
UninterruptibleLockGuard noInterrupt(opCtx->lockState());
- // Set the minimum snapshot for all Collections in this db. This ensures that readers
- // using majority readConcern level can only use the collections after their repaired
- // versions are in the committed view.
- const auto currentTime = VectorClock::get(opCtx)->getTime();
-
- for (auto collIt = db->begin(opCtx); collIt != db->end(opCtx); ++collIt) {
- auto collection =
- collIt.getWritableCollection(opCtx, CollectionCatalog::LifetimeMode::kInplace);
- if (collection) {
- collection->setMinimumVisibleSnapshot(currentTime.clusterTime().asTimestamp());
- }
- }
-
// Restore oplog Collection pointer cache.
repl::acquireOplogCollectionForLogging(opCtx);
} catch (const ExceptionFor<ErrorCodes::MustDowngrade>&) {
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index ea2e13f3a0f..c8c87173421 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -956,7 +956,7 @@ StatusWith<Timestamp> StorageEngineImpl::recoverToStableTimestamp(OperationConte
return swTimestamp;
}
- catalog::openCatalog(opCtx, state);
+ catalog::openCatalog(opCtx, state, swTimestamp.getValue());
LOGV2(22259,
"recoverToStableTimestamp successful",
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index e7c007b5835..a70b5b42402 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -1618,9 +1618,8 @@ public:
AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
- // It is not valid to read the multikey state earlier than the 'minimumVisibleTimestamp',
- // so at least assert that it has been updated due to the index creation.
- ASSERT_GT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(),
+ // Ensure minimumVisible has not been updated due to the index creation.
+ ASSERT_LT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(),
pastTime.asTimestamp());
// Reading the multikey state before 'insertTime0' is not valid or reliable to test. If the