diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2018-05-17 20:59:18 -0400 |
---|---|---|
committer | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2018-05-17 20:59:18 -0400 |
commit | 25b0e6f7d22de88faaa7e223195992e995acdff4 (patch) | |
tree | 0e161ecf41af78dd5b4553549ea7a9df832ce242 | |
parent | 05651d31cad6fa886a436fda597234ceebf52dfd (diff) | |
download | mongo-25b0e6f7d22de88faaa7e223195992e995acdff4.tar.gz |
SERVER-34778 Add support for dbHash command inside multi-stmt txn.
The dbHash command is only allowed inside a multi-statement
transaction when test commands are enabled.
Also introduces a WTPreserveSnapshotHistoryIndefinitely failpoint to
skip setting the oldest timestamp.
-rw-r--r-- | jstests/replsets/dbhash_at_cluster_time.js | 124 | ||||
-rw-r--r-- | src/mongo/db/commands/dbhash.cpp | 45 | ||||
-rw-r--r-- | src/mongo/db/service_entry_point_common.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/session.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 10 |
5 files changed, 185 insertions, 3 deletions
diff --git a/jstests/replsets/dbhash_at_cluster_time.js b/jstests/replsets/dbhash_at_cluster_time.js new file mode 100644 index 00000000000..b3b57965258 --- /dev/null +++ b/jstests/replsets/dbhash_at_cluster_time.js @@ -0,0 +1,124 @@ +/** + * Tests that "atClusterTime" is supported by the "dbHash" command. + */ +(function() { + "use strict"; + + const rst = new ReplSetTest({nodes: 2}); + rst.startSet(); + + const replSetConfig = rst.getReplSetConfig(); + replSetConfig.members[1].priority = 0; + rst.initiate(replSetConfig); + + const primary = rst.getPrimary(); + const secondary = rst.getSecondary(); + + const session = primary.startSession({causalConsistency: false}); + const db = session.getDatabase("test"); + let txnNumber = 0; + + if (!db.serverStatus().storageEngine.supportsSnapshotReadConcern) { + rst.stopSet(); + return; + } + + // We force 'secondary' to sync from 'primary' using the "forceSyncSourceCandidate" failpoint to + // ensure that an intermittent connectivity issue doesn't lead to the secondary not advancing + // its belief of the majority commit point. This avoids any complications that would arise due + // to SERVER-33248. + assert.commandWorked(secondary.adminCommand({ + configureFailPoint: "forceSyncSourceCandidate", + mode: "alwaysOn", + data: {hostAndPort: primary.host} + })); + rst.awaitSyncSource(secondary, primary); + + // We also prevent all nodes in the replica set from advancing oldest_timestamp. This ensures + // that the snapshot associated with 'clusterTime' is retained for the duration of this test. + rst.nodes.forEach(conn => { + assert.commandWorked(conn.adminCommand({ + configureFailPoint: "WTPreserveSnapshotHistoryIndefinitely", + mode: "alwaysOn", + })); + }); + + // We insert a document and save the md5sum associated with the opTime of that write. 
+ assert.commandWorked(db.mycoll.insert({_id: 1}, {writeConcern: {w: "majority"}})); + const clusterTime = db.getSession().getOperationTime(); + + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + let res = assert.commandWorked(db.runCommand({dbHash: 1})); + session.commitTransaction(); + const hash1 = {collections: res.collections, md5: res.md5}; + + // We insert another document to ensure the collection's contents have a different md5sum now. + assert.commandWorked(db.mycoll.insert({_id: 2})); + + // However, using atClusterTime to read at the opTime of the first insert should return the same + // md5sum as it did originally. + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + res = assert.commandWorked(db.runCommand({dbHash: 1})); + session.commitTransaction(); + const hash2 = {collections: res.collections, md5: res.md5}; + assert.eq(hash1, hash2, "primary returned different dbhash after second insert"); + + { + const secondarySession = secondary.startSession({causalConsistency: false}); + const secondaryDB = secondarySession.getDatabase("test"); + + // Using atClusterTime to read at the opTime of the first insert should return the same + // md5sum on the secondary as it did on the primary. + secondarySession.startTransaction( + {readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + res = assert.commandWorked(secondaryDB.runCommand({dbHash: 1})); + secondarySession.commitTransaction(); + const secondaryHash = {collections: res.collections, md5: res.md5}; + assert.eq(hash1, secondaryHash, "primary and secondary have different dbhash"); + + secondarySession.endSession(); + } + + { + const otherSession = primary.startSession({causalConsistency: false}); + const otherDB = otherSession.getDatabase("test"); + + // We perform another insert inside a separate transaction to cause a MODE_IX lock to be + // held on the collection. 
+ otherSession.startTransaction(); + assert.commandWorked(otherDB.mycoll.insert({_id: 3})); + + // It should be possible to run the "dbHash" command with "atClusterTime" concurrently. + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + res = assert.commandWorked(db.runCommand({dbHash: 1})); + session.commitTransaction(); + const hash3 = {collections: res.collections, md5: res.md5}; + assert.eq(hash1, hash3, "primary returned different dbhash after third insert"); + + // However, the "dbHash" command should block behind the transaction if "atClusterTime" + // wasn't specified. + res = assert.commandFailedWithCode(db.runCommand({dbHash: 1, maxTimeMS: 1000}), + ErrorCodes.ExceededTimeLimit); + + otherSession.abortTransaction(); + otherSession.endSession(); + } + + { + const otherSession = primary.startSession({causalConsistency: false}); + const otherDB = otherSession.getDatabase("test"); + + // We create another collection inside a separate session to modify the collection catalog + // at an opTime later than 'clusterTime'. This prevents further usage of the snapshot + // associated with 'clusterTime' for snapshot reads. 
+ assert.commandWorked(otherDB.runCommand({create: "mycoll2"})); + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + assert.commandFailedWithCode(db.runCommand({dbHash: 1}), ErrorCodes.SnapshotUnavailable); + session.abortTransaction(); + + otherSession.endSession(); + } + + session.endSession(); + rst.stopSet(); +})(); diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp index c7aa1c64e14..29a7e1fb129 100644 --- a/src/mongo/db/commands/dbhash.cpp +++ b/src/mongo/db/commands/dbhash.cpp @@ -32,6 +32,7 @@ #include "mongo/platform/basic.h" +#include <boost/optional.hpp> #include <map> #include <string> @@ -62,6 +63,17 @@ public: return false; } + ReadWriteType getReadWriteType() const override { + return ReadWriteType::kRead; + } + + bool supportsReadConcern(const std::string& dbName, + const BSONObj& cmdObj, + repl::ReadConcernLevel level) const override { + return level == repl::ReadConcernLevel::kLocalReadConcern || + level == repl::ReadConcernLevel::kSnapshotReadConcern; + } + AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { return AllowedOnSecondary::kAlways; } @@ -101,7 +113,14 @@ public: // We lock the entire database in S-mode in order to ensure that the contents will not // change for the snapshot. - AutoGetDb autoDb(opCtx, ns, MODE_S); + auto lockMode = LockMode::MODE_S; + if (repl::ReadConcernArgs::get(opCtx).getArgsAtClusterTime()) { + // However, if we are using "atClusterTime" to read from a consistent snapshot, then we + // only need to lock the database in intent mode to ensure that none of the collections + // get dropped. 
+ lockMode = getLockModeForQuery(opCtx); + } + AutoGetDb autoDb(opCtx, ns, lockMode); Database* db = autoDb.getDb(); std::list<std::string> colls; if (db) { @@ -177,6 +196,30 @@ private: if (!collection) return ""; + boost::optional<Lock::CollectionLock> collLock; + if (repl::ReadConcernArgs::get(opCtx).getArgsAtClusterTime()) { + // When using "atClusterTime", we are only holding the database lock in intent mode. We + // need to also acquire the collection lock in intent mode to ensure reading from the + // consistent snapshot doesn't overlap with any catalog operations on the collection. + invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS)); + collLock.emplace(opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx)); + + auto minSnapshot = collection->getMinimumVisibleSnapshot(); + auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); + invariant(mySnapshot); + + uassert(ErrorCodes::SnapshotUnavailable, + str::stream() << "Unable to read from a snapshot due to pending collection" + " catalog changes; please retry the operation. Snapshot" + " timestamp is " + << mySnapshot->toString() + << ". 
Collection minimum timestamp is " + << minSnapshot->toString(), + !minSnapshot || *mySnapshot >= *minSnapshot); + } else { + invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_S)); + } + IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx); std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec; diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index 90e0b686c76..05d6f3ef09d 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -111,6 +111,7 @@ const StringMap<int> sessionCheckoutWhitelist = {{"abortTransaction", 1}, {"applyOps", 1}, {"commitTransaction", 1}, {"count", 1}, + {"dbHash", 1}, {"delete", 1}, {"distinct", 1}, {"doTxn", 1}, diff --git a/src/mongo/db/session.cpp b/src/mongo/db/session.cpp index 75afa18586d..22683a3ab59 100644 --- a/src/mongo/db/session.cpp +++ b/src/mongo/db/session.cpp @@ -110,6 +110,10 @@ const StringMap<int> txnCmdWhitelist = {{"abortTransaction", 1}, {"prepareTransaction", 1}, {"update", 1}}; +// The command names that are allowed in a multi-document transaction only when test commands are +// enabled. +const StringMap<int> txnCmdForTestingWhitelist = {{"dbHash", 1}}; + // The commands that can be run on the 'admin' database in multi-document transactions. 
const StringMap<int> txnAdminCommands = { {"abortTransaction", 1}, {"commitTransaction", 1}, {"doTxn", 1}, {"prepareTransaction", 1}}; @@ -347,7 +351,9 @@ void Session::beginOrContinueTxn(OperationContext* opCtx, uassert(50767, str::stream() << "Cannot run '" << cmdName << "' in a multi-document transaction.", - !autocommit || txnCmdWhitelist.find(cmdName) != txnCmdWhitelist.cend()); + !autocommit || txnCmdWhitelist.find(cmdName) != txnCmdWhitelist.cend() || + (getTestCommandsEnabled() && + txnCmdForTestingWhitelist.find(cmdName) != txnCmdForTestingWhitelist.cend())); uassert(50844, str::stream() << "Cannot run command against the '" << dbName diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 2596a4f5cb5..25685dc5024 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -1104,6 +1104,12 @@ void WiredTigerKVEngine::setOldestTimestamp(Timestamp oldestTimestamp) { _setOldestTimestamp(oldestTimestamp, doForce); } +namespace { + +MONGO_FP_DECLARE(WTPreserveSnapshotHistoryIndefinitely); + +} // namespace + void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp) { if (stableTimestamp.isNull()) { return; @@ -1144,7 +1150,9 @@ void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp) { // Communicate to WiredTiger that it can clean up timestamp data earlier than the timestamp // provided. No future queries will need point-in-time reads at a timestamp prior to the one // provided here. - _setOldestTimestamp(stableTimestamp); + if (!MONGO_FAIL_POINT(WTPreserveSnapshotHistoryIndefinitely)) { + _setOldestTimestamp(stableTimestamp); + } } void WiredTigerKVEngine::_setOldestTimestamp(Timestamp oldestTimestamp, bool force) { |