SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.

The new $_internalReadAtClusterTime option replaces all usages of running the dbHash command inside of a multi-statement transaction. It can be used to read from a consistent snapshot in place of specifying an atClusterTime read concern. Unlike multi-statement transactions, the new $_internalReadAtClusterTime option doesn't cause locks to be left on the server after returning a network response. It instead restores the snapshot to read from as part of handling the request.
author: Max Hirschhorn <max.hirschhorn@mongodb.com> 2019-02-11 15:43:27 -0500
committer: Max Hirschhorn <max.hirschhorn@mongodb.com> 2019-02-11 15:43:27 -0500
commit: 9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch)
tree: 26b5750c5088d745ab1fb93596d010501b0b4cbe /jstests
parent: 691ab6da0c38f52f32c1028a8fa7447997ced255 (diff)
download: mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz
4 files changed, 313 insertions, 86 deletions
diff --git a/jstests/hooks/run_check_repl_dbhash_background.js b/jstests/hooks/run_check_repl_dbhash_background.js
index 8e5b942a2eb..8cd338b2d32 100644
--- a/jstests/hooks/run_check_repl_dbhash_background.js
+++ b/jstests/hooks/run_check_repl_dbhash_background.js
@@ -3,15 +3,17 @@
  *
  * Unlike run_check_repl_dbhash.js, this version of the hook doesn't require that all operations
  * have finished replicating, nor does it require that the test has finished running. The dbHash
- * command is run inside of a transaction specifying atClusterTime in order for an identical
- * snapshot to be used by all members of the replica set.
+ * command reads at a particular clusterTime in order for an identical snapshot to be used by all
+ * members of the replica set.
  *
- * The find and getMore commands used to generate the collection diff run as part of the same
- * transaction as the dbHash command. This ensures the diagnostics for a dbhash mismatch aren't
- * subjected to changes from any operations in flight.
+ * The find and getMore commands used to generate the collection diff read at the same clusterTime
+ * as the dbHash command. While this ensures the diagnostics for a dbhash mismatch aren't subjected
+ * to changes from any operations in flight, it is possible for the collection or an index on the
+ * collection to be dropped due to no locks being held.
  *
- * If a transient transaction error occurs, then the dbhash check is retried until it succeeds, or
- * until it fails with a non-transient error.
+ * If a transient error occurs, then the dbhash check is retried until it succeeds, or until it
+ * fails with a non-transient error. The most common case of a transient error is attempting to read
+ * from a collection after a catalog operation has been performed on the collection or database.
  */
 'use strict';
 
@@ -26,7 +28,7 @@
     let debugInfo = [];
 
     // We turn off printing the JavaScript stacktrace in doassert() to avoid generating an
-    // overwhelming amount of log messages when handling transient transaction errors.
+    // overwhelming amount of log messages when handling transient errors.
     TestData = TestData || {};
     TestData.traceExceptions = false;
 
@@ -69,13 +71,8 @@
     const kForeverSeconds = 1e9;
     const dbNames = new Set();
 
-    // We enable the "WTPreserveSnapshotHistoryIndefinitely" failpoint and extend the
-    // "transactionLifetimeLimitSeconds" server parameter to ensure that
-    //
-    //   (1) the same snapshot will be available to read at on the primary and secondaries, and
-    //
-    //   (2) the potentally long-running transaction isn't killed while we are part-way through
-    //       verifying data consistency.
+    // We enable the "WTPreserveSnapshotHistoryIndefinitely" failpoint to ensure that the same
+    // snapshot will be available to read at on the primary and secondaries.
     for (let session of sessions) {
         const db = session.getDatabase('admin');
 
@@ -96,19 +93,6 @@
                 mode: 'off',
             }));
         });
-
-        const res = assert.commandWorked(db.runCommand({
-            setParameter: 1,
-            transactionLifetimeLimitSeconds: kForeverSeconds,
-        }));
-
-        const transactionLifetimeLimitSecondsOriginal = res.was;
-        resetFns.push(() => {
-            assert.commandWorked(db.runCommand({
-                setParameter: 1,
-                transactionLifetimeLimitSeconds: transactionLifetimeLimitSecondsOriginal,
-            }));
-        });
     }
 
     for (let session of sessions) {
@@ -124,8 +108,8 @@
         });
     }
 
-    // Transactions cannot be run on the following databases. (The "local" database is also not
-    // replicated.)
+    // Transactions cannot be run on the following databases so we don't attempt to read at a
+    // clusterTime on them either. (The "local" database is also not replicated.)
     dbNames.delete('admin');
     dbNames.delete('config');
     dbNames.delete('local');
@@ -133,9 +117,8 @@
     const results = [];
 
     // The waitForSecondaries() function waits for all secondaries to have applied up to
-    // 'clusterTime' locally. This ensures that a later atClusterTime read inside a transaction
-    // doesn't stall as a result of a pending global X lock (e.g. from a dropDatabase command) on
-    // the primary preventing getMores on the oplog from receiving a response.
+    // 'clusterTime' locally. This ensures that a later $_internalReadAtClusterTime read doesn't
+    // fail as a result of the secondary's clusterTime being behind 'clusterTime'.
     const waitForSecondaries = (clusterTime, signedClusterTime) => {
         debugInfo.push({"waitForSecondaries": clusterTime, "signedClusterTime": signedClusterTime});
         for (let i = 1; i < sessions.length; ++i) {
@@ -198,9 +181,10 @@
     // isn't multi-versioned. Unlike with ReplSetTest#checkReplicatedDataHashes(), it is possible
     // for a collection catalog operation (e.g. a drop or rename) to have been applied on the
     // primary but not yet applied on the secondary.
-    const checkCollectionHashesForDB = (dbName) => {
+    const checkCollectionHashesForDB = (dbName, clusterTime) => {
         const result = [];
-        const hashes = rst.getHashesUsingSessions(sessions, dbName);
+        const hashes =
+            rst.getHashesUsingSessions(sessions, dbName, {readAtClusterTime: clusterTime});
         const hashesByUUID = hashes.map((response, i) => {
             const info = {};
 
@@ -281,9 +265,7 @@
             // ReplSetTest#getCollectionDiffUsingSessions() upon detecting a dbHash mismatch. It is
             // presumed to still useful to know that a bug exists even if we cannot get more
             // diagnostics for it.
-            if ((e.hasOwnProperty('errorLabels') &&
-                 e.errorLabels.includes('TransientTransactionError')) ||
-                e.code === ErrorCodes.Interrupted) {
+            if (e.code === ErrorCodes.Interrupted || e.code === ErrorCodes.SnapshotUnavailable) {
                 hasTransientError = true;
                 return true;
             }
@@ -310,23 +292,15 @@
                 debugInfo.push({
                     "node": session.getClient(),
                     "session": session,
-                    "startTransaction": clusterTime
+                    "readAtClusterTime": clusterTime
                 });
-                session.startTransaction(
-                    {readConcern: {level: 'snapshot', atClusterTime: clusterTime}});
             }
 
             hasTransientError = false;
 
             try {
-                result = checkCollectionHashesForDB(dbName);
+                result = checkCollectionHashesForDB(dbName, clusterTime);
             } catch (e) {
-                // We abort each of the transactions started on the nodes if one of them returns an
-                // error while running the dbHash check.
-                for (let session of sessions) {
-                    session.abortTransaction_forTesting();
-                }
-
                 if (isTransientError(e)) {
                     debugInfo.push({"transientError": e});
                     continue;
@@ -335,21 +309,6 @@
                 jsTestLog(debugInfo);
                 throw e;
             }
-
-            // We then attempt to commit each of the transactions started on the nodes to confirm
-            // the data we read was actually majority-committed. If one of them returns an error,
-            // then we still try to commit the transactions started on subsequent nodes in order to
-            // clear their transaction state.
-            for (let session of sessions) {
-                try {
-                    session.commitTransaction();
-                } catch (e) {
-                    if (!isTransientError(e)) {
-                        jsTestLog(debugInfo);
-                        throw e;
-                    }
-                }
-            }
         } while (hasTransientError);
 
         for (let mismatchInfo of result) {
diff --git a/jstests/replsets/dbhash_lock_acquisition.js b/jstests/replsets/dbhash_lock_acquisition.js
index f178e5ce4b5..cf26fb6caea 100644
--- a/jstests/replsets/dbhash_lock_acquisition.js
+++ b/jstests/replsets/dbhash_lock_acquisition.js
@@ -1,11 +1,14 @@
 /**
- * Tests that the dbHash command acquires IX mode locks on the global, database, and collection
- * resources when running inside a multi-statement transaction.
+ * Tests that the dbHash command acquires IS mode locks on the global, database, and collection
+ * resources when reading a timestamp using the $_internalReadAtClusterTime option.
+ *
  * @tags: [uses_transactions]
  */
 (function() {
     "use strict";
 
+    load("jstests/libs/parallelTester.js");  // for ScopedThread
+
     const rst = new ReplSetTest({nodes: 1});
     rst.startSet();
     rst.initiate();
@@ -16,31 +19,77 @@
     const session = primary.startSession({causalConsistency: false});
     const sessionDB = session.getDatabase(db.getName());
 
-    function assertTransactionAcquiresIXLocks(txnOptions) {
-        session.startTransaction(txnOptions);
+    // We insert a document so the dbHash command has a collection to process.
+    assert.commandWorked(sessionDB.mycoll.insert({}, {writeConcern: {w: "majority"}}));
+    const clusterTime = session.getOperationTime();
+
+    // We then start a transaction in order to be able have a catalog operation queue up behind it.
+    session.startTransaction();
+    assert.commandWorked(sessionDB.mycoll.insert({}));
 
-        assert.commandWorked(sessionDB.runCommand({dbHash: 1}));
-        const ops = db.currentOp({"lsid.id": session.getSessionId().id}).inprog;
-        assert.eq(1,
-                  ops.length,
-                  () => "Failed to find session in currentOp() output: " + tojson(db.currentOp()));
-        assert.eq(ops[0].locks, {Global: "w", Database: "w", Collection: "w"});
+    const ops = db.currentOp({"lsid.id": session.getSessionId().id}).inprog;
+    assert.eq(1,
+              ops.length,
+              () => "Failed to find session in currentOp() output: " + tojson(db.currentOp()));
+    assert.eq(ops[0].locks, {Global: "w", Database: "w", Collection: "w"});
 
-        session.abortTransaction_forTesting();
-    }
+    const threadCaptruncCmd = new ScopedThread(function(host) {
+        try {
+            const conn = new Mongo(host);
+            const db = conn.getDB("test");
 
-    // We insert a document so the dbHash command has a collection to process.
-    assert.commandWorked(db.mycoll.insert({}, {writeConcern: {w: "majority"}}));
-
-    assertTransactionAcquiresIXLocks({});
-    assertTransactionAcquiresIXLocks({readConcern: {level: "local"}});
-    assertTransactionAcquiresIXLocks({readConcern: {level: "snapshot"}});
-    assertTransactionAcquiresIXLocks({
-        readConcern: {
-            level: "snapshot",
-            atClusterTime: session.getOperationTime(),
+            // We use the captrunc command as a catalog operation that requires a MODE_X lock on the
+            // collection. This ensures we aren't having the dbHash command queue up behind it on a
+            // database-level lock. The collection isn't capped so it'll fail with an
+            // IllegalOperation error response.
+            assert.commandFailedWithCode(db.runCommand({captrunc: "mycoll", n: 1}),
+                                         ErrorCodes.IllegalOperation);
+            return {ok: 1};
+        } catch (e) {
+            return {ok: 0, error: e.toString(), stack: e.stack};
         }
-    });
+    }, db.getMongo().host);
+
+    threadCaptruncCmd.start();
+
+    assert.soon(() => {
+        const ops = db.currentOp({"command.captrunc": "mycoll", waitingForLock: true}).inprog;
+        return ops.length === 1;
+    }, () => "Failed to find create collection in currentOp() output: " + tojson(db.currentOp()));
+
+    const threadDBHash = new ScopedThread(function(host, clusterTime) {
+        try {
+            const conn = new Mongo(host);
+            const db = conn.getDB("test");
+            assert.commandWorked(db.runCommand({
+                dbHash: 1,
+                $_internalReadAtClusterTime: eval(clusterTime),
+            }));
+            return {ok: 1};
+        } catch (e) {
+            return {ok: 0, error: e.toString(), stack: e.stack};
+        }
+    }, db.getMongo().host, tojson(clusterTime));
+
+    threadDBHash.start();
+
+    assert.soon(() => {
+        const ops = db.currentOp({"command.dbHash": 1, waitingForLock: true}).inprog;
+        if (ops.length === 0) {
+            return false;
+        }
+        // The lock mode for the Global resource is reported as "w" rather than "r" because of the
+        // mode taken for the ReplicationStateTransitionLock when acquiring the global lock.
+        assert.eq(ops[0].locks, {Global: "w", Database: "r", Collection: "r"});
+        return true;
+    }, () => "Failed to find create collection in currentOp() output: " + tojson(db.currentOp()));
+
+    session.commitTransaction();
+    threadCaptruncCmd.join();
+    threadDBHash.join();
+
+    assert.commandWorked(threadCaptruncCmd.returnData());
+    assert.commandWorked(threadDBHash.returnData());
 
     session.endSession();
     rst.stopSet();
diff --git a/jstests/replsets/dbhash_read_at_cluster_time.js b/jstests/replsets/dbhash_read_at_cluster_time.js
new file mode 100644
index 00000000000..252b5004004
--- /dev/null
+++ b/jstests/replsets/dbhash_read_at_cluster_time.js
@@ -0,0 +1,120 @@
+/**
+ * Tests that "$_internalReadAtClusterTime" is supported by the "dbHash" command.
+ *
+ * @tags: [uses_transactions]
+ */
+(function() {
+    "use strict";
+
+    const rst = new ReplSetTest({nodes: 2});
+    rst.startSet();
+
+    const replSetConfig = rst.getReplSetConfig();
+    replSetConfig.members[1].priority = 0;
+    rst.initiate(replSetConfig);
+
+    const primary = rst.getPrimary();
+    const secondary = rst.getSecondary();
+
+    const session = primary.startSession({causalConsistency: false});
+    const db = session.getDatabase("test");
+    let txnNumber = 0;
+
+    // We prevent the replica set from advancing oldest_timestamp. This ensures that the snapshot
+    // associated with 'clusterTime' is retained for the duration of this test.
+    rst.nodes.forEach(conn => {
+        assert.commandWorked(conn.adminCommand({
+            configureFailPoint: "WTPreserveSnapshotHistoryIndefinitely",
+            mode: "alwaysOn",
+        }));
+    });
+
+    // We insert a document and save the md5sum associated with the opTime of that write.
+    assert.commandWorked(db.mycoll.insert({_id: 1}, {writeConcern: {w: "majority"}}));
+    const clusterTime = db.getSession().getOperationTime();
+
+    let res = assert.commandWorked(db.runCommand({
+        dbHash: 1,
+        $_internalReadAtClusterTime: clusterTime,
+    }));
+
+    const hash1 = {collections: res.collections, md5: res.md5};
+
+    // We insert another document to ensure the collection's contents have a different md5sum now.
+    // We use a w=majority write concern to ensure that the insert has also been applied on the
+    // secondary by the time we go to run the dbHash command later. This avoids a race where the
+    // replication subsystem could be applying the insert operation when the dbHash command is run
+    // on the secondary.
+    assert.commandWorked(db.mycoll.insert({_id: 2}, {writeConcern: {w: "majority"}}));
+
+    // However, using $_internalReadAtClusterTime to read at the opTime of the first insert should
+    // return the same md5sum as it did originally.
+    res = assert.commandWorked(db.runCommand({
+        dbHash: 1,
+        $_internalReadAtClusterTime: clusterTime,
+    }));
+
+    const hash2 = {collections: res.collections, md5: res.md5};
+    assert.eq(hash1, hash2, "primary returned different dbhash after second insert");
+
+    {
+        const secondarySession = secondary.startSession({causalConsistency: false});
+        const secondaryDB = secondarySession.getDatabase("test");
+
+        // Using $_internalReadAtClusterTime to read at the opTime of the first insert should return
+        // the same md5sum on the secondary as it did on the primary.
+        res = assert.commandWorked(secondaryDB.runCommand({
+            dbHash: 1,
+            $_internalReadAtClusterTime: clusterTime,
+        }));
+
+        const secondaryHash = {collections: res.collections, md5: res.md5};
+        assert.eq(hash1, secondaryHash, "primary and secondary have different dbhash");
+    }
+
+    {
+        const otherSession = primary.startSession({causalConsistency: false});
+        const otherDB = otherSession.getDatabase("test");
+
+        // We perform another insert inside a separate transaction to cause a MODE_IX lock to be
+        // held on the collection.
+        otherSession.startTransaction();
+        assert.commandWorked(otherDB.mycoll.insert({_id: 3}));
+
+        // It should be possible to run the "dbHash" command with "$_internalReadAtClusterTime"
+        // concurrently.
+        res = assert.commandWorked(db.runCommand({
+            dbHash: 1,
+            $_internalReadAtClusterTime: clusterTime,
+        }));
+
+        const hash3 = {collections: res.collections, md5: res.md5};
+        assert.eq(hash1, hash3, "primary returned different dbhash after third insert");
+
+        // However, the "dbHash" command should block behind the transaction if
+        // "$_internalReadAtClusterTime" wasn't specified.
+        res = assert.commandFailedWithCode(db.runCommand({dbHash: 1, maxTimeMS: 1000}),
+                                           ErrorCodes.MaxTimeMSExpired);
+
+        otherSession.abortTransaction_forTesting();
+        otherSession.endSession();
+    }
+
+    {
+        const otherSession = primary.startSession({causalConsistency: false});
+        const otherDB = otherSession.getDatabase("test");
+
+        // We create another collection inside a separate session to modify the collection catalog
+        // at an opTime later than 'clusterTime'. This prevents further usage of the snapshot
+        // associated with 'clusterTime' for snapshot reads.
+        assert.commandWorked(otherDB.runCommand({create: "mycoll2"}));
+        assert.commandFailedWithCode(
+            db.runCommand({dbHash: 1, $_internalReadAtClusterTime: clusterTime}),
+            ErrorCodes.SnapshotUnavailable);
+
+        otherSession.endSession();
+    }
+
+    session.endSession();
+    rst.stopSet();
+})();
diff --git a/jstests/replsets/read_at_cluster_time_outside_transactions.js b/jstests/replsets/read_at_cluster_time_outside_transactions.js
new file mode 100644
index 00000000000..3df9e09976d
--- /dev/null
+++ b/jstests/replsets/read_at_cluster_time_outside_transactions.js
@@ -0,0 +1,99 @@
+/**
+ * Tests that the "find" and "dbHash" commands support reading at a Timestamp by using the
+ * $_internalReadAtClusterTime option.
+ */
+(function() {
+    "use strict";
+
+    const rst = new ReplSetTest({nodes: 1});
+    rst.startSet();
+    rst.initiate();
+
+    const primary = rst.getPrimary();
+    const db = primary.getDB("test");
+
+    const collName = "read_at_cluster_time_outside_transactions";
+    const collection = db[collName];
+
+    // We prevent the replica set from advancing oldest_timestamp. This ensures that the snapshot
+    // associated with 'clusterTime' is retained for the duration of this test.
+    rst.nodes.forEach(conn => {
+        assert.commandWorked(conn.adminCommand({
+            configureFailPoint: "WTPreserveSnapshotHistoryIndefinitely",
+            mode: "alwaysOn",
+        }));
+    });
+
+    // We insert 3 documents in order to have data to return for both the find and getMore commands
+    // when using a batch size of 2. We then save the md5sum associated with the opTime of the last
+    // insert.
+    assert.commandWorked(collection.insert({_id: 1, comment: "should be seen by find command"}));
+    assert.commandWorked(collection.insert({_id: 3, comment: "should be seen by find command"}));
+    assert.commandWorked(collection.insert({_id: 5, comment: "should be seen by getMore command"}));
+
+    const clusterTime = db.getSession().getOperationTime();
+
+    let res = assert.commandWorked(db.runCommand({dbHash: 1}));
+    const hashAfterOriginalInserts = {collections: res.collections, md5: res.md5};
+
+    // The documents with _id=1 and _id=3 should be returned by the find command.
+    let cursor = collection.find().sort({_id: 1}).batchSize(2);
+    assert.eq({_id: 1, comment: "should be seen by find command"}, cursor.next());
+    assert.eq({_id: 3, comment: "should be seen by find command"}, cursor.next());
+
+    // We then insert documents with _id=2 and _id=4. The document with _id=2 is positioned behind
+    // the _id index cursor and won't be returned by the getMore command. However, the document with
+    // _id=4 is positioned ahead and should end up being returned.
+    assert.commandWorked(
+        collection.insert({_id: 2, comment: "should not be seen by getMore command"}));
+    assert.commandWorked(
+        collection.insert({_id: 4, comment: "should be seen by non-snapshot getMore command"}));
+    assert.eq({_id: 4, comment: "should be seen by non-snapshot getMore command"}, cursor.next());
+    assert.eq({_id: 5, comment: "should be seen by getMore command"}, cursor.next());
+    assert(!cursor.hasNext());
+
+    // When using the $_internalReadAtClusterTime option with a clusterTime from after the
+    // original 3 documents were inserted, the document with _id=2 shouldn't be visible to the find
+    // command because it was inserted afterwards. The same applies to the document with _id=4 and
+    // the getMore command.
+    res = collection.runCommand("find", {
+        batchSize: 2,
+        sort: {_id: 1},
+        $_internalReadAtClusterTime: clusterTime,
+    });
+
+    const batchSize = 2;
+    cursor = new DBCommandCursor(db, res, batchSize);
+    assert.eq({_id: 1, comment: "should be seen by find command"}, cursor.next());
+    assert.eq({_id: 3, comment: "should be seen by find command"}, cursor.next());
+    assert.eq({_id: 5, comment: "should be seen by getMore command"}, cursor.next());
+    assert(!cursor.hasNext());
+
+    // Using the $_internalReadAtClusterTime option to read at the opTime of the last of the 3
+    // original inserts should return the same md5sum as it did originally.
+    res = assert.commandWorked(db.runCommand({
+        dbHash: 1,
+        $_internalReadAtClusterTime: clusterTime,
+    }));
+
+    const hashAtClusterTime = {collections: res.collections, md5: res.md5};
+    assert.eq(hashAtClusterTime, hashAfterOriginalInserts);
+
+    // Attempting to read at a clusterTime in the future should return an error.
+    const futureClusterTime = new Timestamp(clusterTime.getTime() + 1000, 1);
+
+    assert.commandFailedWithCode(collection.runCommand("find", {
+        batchSize: 2,
+        sort: {_id: 1},
+        $_internalReadAtClusterTime: futureClusterTime,
+    }),
+                                 ErrorCodes.InvalidOptions);
+
+    assert.commandFailedWithCode(db.runCommand({
+        dbHash: 1,
+        $_internalReadAtClusterTime: futureClusterTime,
+    }),
+                                 ErrorCodes.InvalidOptions);
+
+    rst.stopSet();
+})();
author	Max Hirschhorn <max.hirschhorn@mongodb.com>	2019-02-11 15:43:27 -0500
committer	Max Hirschhorn <max.hirschhorn@mongodb.com>	2019-02-11 15:43:27 -0500
commit	9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch)
tree	26b5750c5088d745ab1fb93596d010501b0b4cbe /jstests
parent	691ab6da0c38f52f32c1028a8fa7447997ced255 (diff)
download	mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz