diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2018-05-17 20:59:18 -0400 |
---|---|---|
committer | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2018-05-17 20:59:18 -0400 |
commit | 25b0e6f7d22de88faaa7e223195992e995acdff4 (patch) | |
tree | 0e161ecf41af78dd5b4553549ea7a9df832ce242 | |
parent | 05651d31cad6fa886a436fda597234ceebf52dfd (diff) | |
download | mongo-25b0e6f7d22de88faaa7e223195992e995acdff4.tar.gz |
SERVER-34778 Add support for dbHash command inside multi-stmt txn.
The dbHash command is only allowed inside a multi-statement
transaction when test commands are enabled.
Also introduces a WTPreserveSnapshotHistoryIndefinitely failpoint to
skip setting the oldest timestamp.
-rw-r--r-- | jstests/replsets/dbhash_at_cluster_time.js | 124 | ||||
-rw-r--r-- | src/mongo/db/commands/dbhash.cpp | 45 | ||||
-rw-r--r-- | src/mongo/db/service_entry_point_common.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/session.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 10 |
5 files changed, 185 insertions, 3 deletions
diff --git a/jstests/replsets/dbhash_at_cluster_time.js b/jstests/replsets/dbhash_at_cluster_time.js new file mode 100644 index 00000000000..b3b57965258 --- /dev/null +++ b/jstests/replsets/dbhash_at_cluster_time.js @@ -0,0 +1,124 @@ +/** + * Tests that "atClusterTime" is supported by the "dbHash" command. + */ +(function() { + "use strict"; + + const rst = new ReplSetTest({nodes: 2}); + rst.startSet(); + + const replSetConfig = rst.getReplSetConfig(); + replSetConfig.members[1].priority = 0; + rst.initiate(replSetConfig); + + const primary = rst.getPrimary(); + const secondary = rst.getSecondary(); + + const session = primary.startSession({causalConsistency: false}); + const db = session.getDatabase("test"); + let txnNumber = 0; + + if (!db.serverStatus().storageEngine.supportsSnapshotReadConcern) { + rst.stopSet(); + return; + } + + // We force 'secondary' to sync from 'primary' using the "forceSyncSourceCandidate" failpoint to + // ensure that an intermittent connectivity issue doesn't lead to the secondary not advancing + // its belief of the majority commit point. This avoids any complications that would arise due + // to SERVER-33248. + assert.commandWorked(secondary.adminCommand({ + configureFailPoint: "forceSyncSourceCandidate", + mode: "alwaysOn", + data: {hostAndPort: primary.host} + })); + rst.awaitSyncSource(secondary, primary); + + // We also prevent all nodes in the replica set from advancing oldest_timestamp. This ensures + // that the snapshot associated with 'clusterTime' is retained for the duration of this test. + rst.nodes.forEach(conn => { + assert.commandWorked(conn.adminCommand({ + configureFailPoint: "WTPreserveSnapshotHistoryIndefinitely", + mode: "alwaysOn", + })); + }); + + // We insert a document and save the md5sum associated with the opTime of that write. 
+ assert.commandWorked(db.mycoll.insert({_id: 1}, {writeConcern: {w: "majority"}})); + const clusterTime = db.getSession().getOperationTime(); + + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + let res = assert.commandWorked(db.runCommand({dbHash: 1})); + session.commitTransaction(); + const hash1 = {collections: res.collections, md5: res.md5}; + + // We insert another document to ensure the collection's contents have a different md5sum now. + assert.commandWorked(db.mycoll.insert({_id: 2})); + + // However, using atClusterTime to read at the opTime of the first insert should return the same + // md5sum as it did originally. + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + res = assert.commandWorked(db.runCommand({dbHash: 1})); + session.commitTransaction(); + const hash2 = {collections: res.collections, md5: res.md5}; + assert.eq(hash1, hash2, "primary returned different dbhash after second insert"); + + { + const secondarySession = secondary.startSession({causalConsistency: false}); + const secondaryDB = secondarySession.getDatabase("test"); + + // Using atClusterTime to read at the opTime of the first insert should return the same + // md5sum on the secondary as it did on the primary. + secondarySession.startTransaction( + {readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + res = assert.commandWorked(secondaryDB.runCommand({dbHash: 1})); + secondarySession.commitTransaction(); + const secondaryHash = {collections: res.collections, md5: res.md5}; + assert.eq(hash1, secondaryHash, "primary and secondary have different dbhash"); + + secondarySession.endSession(); + } + + { + const otherSession = primary.startSession({causalConsistency: false}); + const otherDB = otherSession.getDatabase("test"); + + // We perform another insert inside a separate transaction to cause a MODE_IX lock to be + // held on the collection. 
+ otherSession.startTransaction(); + assert.commandWorked(otherDB.mycoll.insert({_id: 3})); + + // It should be possible to run the "dbHash" command with "atClusterTime" concurrently. + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + res = assert.commandWorked(db.runCommand({dbHash: 1})); + session.commitTransaction(); + const hash3 = {collections: res.collections, md5: res.md5}; + assert.eq(hash1, hash3, "primary returned different dbhash after third insert"); + + // However, the "dbHash" command should block behind the transaction if "atClusterTime" + // wasn't specified. + res = assert.commandFailedWithCode(db.runCommand({dbHash: 1, maxTimeMS: 1000}), + ErrorCodes.ExceededTimeLimit); + + otherSession.abortTransaction(); + otherSession.endSession(); + } + + { + const otherSession = primary.startSession({causalConsistency: false}); + const otherDB = otherSession.getDatabase("test"); + + // We create another collection inside a separate session to modify the collection catalog + // at an opTime later than 'clusterTime'. This prevents further usage of the snapshot + // associated with 'clusterTime' for snapshot reads. 
+ assert.commandWorked(otherDB.runCommand({create: "mycoll2"})); + session.startTransaction({readConcern: {level: "snapshot", atClusterTime: clusterTime}}); + assert.commandFailedWithCode(db.runCommand({dbHash: 1}), ErrorCodes.SnapshotUnavailable); + session.abortTransaction(); + + otherSession.endSession(); + } + + session.endSession(); + rst.stopSet(); +})(); diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp index c7aa1c64e14..29a7e1fb129 100644 --- a/src/mongo/db/commands/dbhash.cpp +++ b/src/mongo/db/commands/dbhash.cpp @@ -32,6 +32,7 @@ #include "mongo/platform/basic.h" +#include <boost/optional.hpp> #include <map> #include <string> @@ -62,6 +63,17 @@ public: return false; } + ReadWriteType getReadWriteType() const override { + return ReadWriteType::kRead; + } + + bool supportsReadConcern(const std::string& dbName, + const BSONObj& cmdObj, + repl::ReadConcernLevel level) const override { + return level == repl::ReadConcernLevel::kLocalReadConcern || + level == repl::ReadConcernLevel::kSnapshotReadConcern; + } + AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { return AllowedOnSecondary::kAlways; } @@ -101,7 +113,14 @@ public: // We lock the entire database in S-mode in order to ensure that the contents will not // change for the snapshot. - AutoGetDb autoDb(opCtx, ns, MODE_S); + auto lockMode = LockMode::MODE_S; + if (repl::ReadConcernArgs::get(opCtx).getArgsAtClusterTime()) { + // However, if we are using "atClusterTime" to read from a consistent snapshot, then we + // only need to lock the database in intent mode to ensure that none of the collections + // get dropped. 
+ lockMode = getLockModeForQuery(opCtx); + } + AutoGetDb autoDb(opCtx, ns, lockMode); Database* db = autoDb.getDb(); std::list<std::string> colls; if (db) { @@ -177,6 +196,30 @@ private: if (!collection) return ""; + boost::optional<Lock::CollectionLock> collLock; + if (repl::ReadConcernArgs::get(opCtx).getArgsAtClusterTime()) { + // When using "atClusterTime", we are only holding the database lock in intent mode. We + // need to also acquire the collection lock in intent mode to ensure reading from the + // consistent snapshot doesn't overlap with any catalog operations on the collection. + invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS)); + collLock.emplace(opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx)); + + auto minSnapshot = collection->getMinimumVisibleSnapshot(); + auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); + invariant(mySnapshot); + + uassert(ErrorCodes::SnapshotUnavailable, + str::stream() << "Unable to read from a snapshot due to pending collection" + " catalog changes; please retry the operation. Snapshot" + " timestamp is " + << mySnapshot->toString() + << ". 
Collection minimum timestamp is " + << minSnapshot->toString(), + !minSnapshot || *mySnapshot >= *minSnapshot); + } else { + invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_S)); + } + IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx); std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec; diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index 90e0b686c76..05d6f3ef09d 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -111,6 +111,7 @@ const StringMap<int> sessionCheckoutWhitelist = {{"abortTransaction", 1}, {"applyOps", 1}, {"commitTransaction", 1}, {"count", 1}, + {"dbHash", 1}, {"delete", 1}, {"distinct", 1}, {"doTxn", 1}, diff --git a/src/mongo/db/session.cpp b/src/mongo/db/session.cpp index 75afa18586d..22683a3ab59 100644 --- a/src/mongo/db/session.cpp +++ b/src/mongo/db/session.cpp @@ -110,6 +110,10 @@ const StringMap<int> txnCmdWhitelist = {{"abortTransaction", 1}, {"prepareTransaction", 1}, {"update", 1}}; +// The command names that are allowed in a multi-document transaction only when test commands are +// enabled. +const StringMap<int> txnCmdForTestingWhitelist = {{"dbHash", 1}}; + // The commands that can be run on the 'admin' database in multi-document transactions. 
const StringMap<int> txnAdminCommands = { {"abortTransaction", 1}, {"commitTransaction", 1}, {"doTxn", 1}, {"prepareTransaction", 1}}; @@ -347,7 +351,9 @@ void Session::beginOrContinueTxn(OperationContext* opCtx, uassert(50767, str::stream() << "Cannot run '" << cmdName << "' in a multi-document transaction.", - !autocommit || txnCmdWhitelist.find(cmdName) != txnCmdWhitelist.cend()); + !autocommit || txnCmdWhitelist.find(cmdName) != txnCmdWhitelist.cend() || + (getTestCommandsEnabled() && + txnCmdForTestingWhitelist.find(cmdName) != txnCmdForTestingWhitelist.cend())); uassert(50844, str::stream() << "Cannot run command against the '" << dbName diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 2596a4f5cb5..25685dc5024 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -1104,6 +1104,12 @@ void WiredTigerKVEngine::setOldestTimestamp(Timestamp oldestTimestamp) { _setOldestTimestamp(oldestTimestamp, doForce); } +namespace { + +MONGO_FP_DECLARE(WTPreserveSnapshotHistoryIndefinitely); + +} // namespace + void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp) { if (stableTimestamp.isNull()) { return; @@ -1144,7 +1150,9 @@ void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp) { // Communicate to WiredTiger that it can clean up timestamp data earlier than the timestamp // provided. No future queries will need point-in-time reads at a timestamp prior to the one // provided here. - _setOldestTimestamp(stableTimestamp); + if (!MONGO_FAIL_POINT(WTPreserveSnapshotHistoryIndefinitely)) { + _setOldestTimestamp(stableTimestamp); + } } void WiredTigerKVEngine::_setOldestTimestamp(Timestamp oldestTimestamp, bool force) { |