diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2019-02-11 15:43:27 -0500 |
---|---|---|
committer | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2019-02-11 15:43:27 -0500 |
commit | 9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch) | |
tree | 26b5750c5088d745ab1fb93596d010501b0b4cbe /src/mongo | |
parent | 691ab6da0c38f52f32c1028a8fa7447997ced255 (diff) | |
download | mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz |
SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.
The new $_internalReadAtClusterTime option replaces all usages of
running the dbHash command inside a multi-statement transaction. It
can be used to read from a consistent snapshot instead of specifying
an atClusterTime read concern.
Unlike multi-statement transactions, the new $_internalReadAtClusterTime
option doesn't cause locks to be left on the server after returning a
network response. It instead restores the snapshot to read from as part
of handling the request.
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/commands.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/commands/dbhash.cpp | 99 | ||||
-rw-r--r-- | src/mongo/db/commands/find_cmd.cpp | 66 | ||||
-rw-r--r-- | src/mongo/db/commands/getmore_cmd.cpp | 26 | ||||
-rw-r--r-- | src/mongo/db/query/query_request.cpp | 17 | ||||
-rw-r--r-- | src/mongo/db/query/query_request.h | 8 | ||||
-rw-r--r-- | src/mongo/db/transaction_validation.cpp | 1 | ||||
-rw-r--r-- | src/mongo/shell/replsettest.js | 24 |
8 files changed, 215 insertions, 34 deletions
diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp index c4323d6a36f..507dbe78f82 100644 --- a/src/mongo/db/commands.cpp +++ b/src/mongo/db/commands.cpp @@ -129,10 +129,6 @@ const StringMap<int> txnCmdWhitelist = {{"abortTransaction", 1}, {"voteAbortTransaction", 1}, {"voteCommitTransaction", 1}}; -// The command names that are allowed in a multi-document transaction only when test commands are -// enabled. -const StringMap<int> txnCmdForTestingWhitelist = {{"dbHash", 1}}; - // The commands that can be run on the 'admin' database in multi-document transactions. const StringMap<int> txnAdminCommands = {{"abortTransaction", 1}, @@ -451,9 +447,7 @@ Status CommandHelpers::canUseTransactions(StringData dbName, StringData cmdName) "http://dochub.mongodb.org/core/transaction-count for a recommended alternative."}; } - if (txnCmdWhitelist.find(cmdName) == txnCmdWhitelist.cend() && - !(getTestCommandsEnabled() && - txnCmdForTestingWhitelist.find(cmdName) != txnCmdForTestingWhitelist.cend())) { + if (txnCmdWhitelist.find(cmdName) == txnCmdWhitelist.cend()) { return {ErrorCodes::OperationNotSupportedInTransaction, str::stream() << "Cannot run '" << cmdName << "' in a multi-document transaction."}; } diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp index 669f707111a..0ed79280f17 100644 --- a/src/mongo/db/commands/dbhash.cpp +++ b/src/mongo/db/commands/dbhash.cpp @@ -43,10 +43,14 @@ #include "mongo/db/catalog/database_catalog_entry.h" #include "mongo/db/catalog/index_catalog.h" #include "mongo/db/commands.h" +#include "mongo/db/commands/test_commands_enabled.h" #include "mongo/db/db_raii.h" #include "mongo/db/exec/working_set_common.h" +#include "mongo/db/logical_clock.h" #include "mongo/db/namespace_string.h" #include "mongo/db/query/internal_plans.h" +#include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/storage/storage_engine.h" #include "mongo/db/transaction_participant.h" #include "mongo/stdx/mutex.h" 
#include "mongo/util/log.h" @@ -70,13 +74,6 @@ public: return ReadWriteType::kRead; } - bool supportsReadConcern(const std::string& dbName, - const BSONObj& cmdObj, - repl::ReadConcernLevel level) const override { - return level == repl::ReadConcernLevel::kLocalReadConcern || - level == repl::ReadConcernLevel::kSnapshotReadConcern; - } - AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { return AllowedOnSecondary::kAlways; } @@ -114,14 +111,80 @@ public: str::stream() << "Invalid db name: " << ns, NamespaceString::validDBName(ns, NamespaceString::DollarInDbNameBehavior::Allow)); + if (auto elem = cmdObj["$_internalReadAtClusterTime"]) { + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when testing" + " commands are enabled", + getTestCommandsEnabled()); + + auto* replCoord = repl::ReplicationCoordinator::get(opCtx); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when replication is" + " enabled", + replCoord->isReplEnabled()); + + auto* storageEngine = opCtx->getServiceContext()->getStorageEngine(); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported by storage engines" + " that support document-level concurrency", + storageEngine->supportsDocLocking()); + + uassert(ErrorCodes::TypeMismatch, + "The '$_internalReadAtClusterTime' option must be a Timestamp", + elem.type() == BSONType::bsonTimestamp); + + auto targetClusterTime = elem.timestamp(); + + // We aren't holding the global lock in intent mode, so it is possible after comparing + // 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime to go + // backwards or for the term to change due to replication rollback. This isn't an actual + // concern because the testing infrastructure won't use the $_internalReadAtClusterTime + // option in any test suite where rollback is expected to occur. 
+ auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime(); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the last applied opTime. Requested clusterTime: " + << targetClusterTime.toString() + << "; last applied opTime: " + << lastAppliedOpTime.toString(), + lastAppliedOpTime.getTimestamp() >= targetClusterTime); + + // We aren't holding the global lock in intent mode, so it is possible for the global + // storage engine to have been destructed already as a result of the server shutting + // down. This isn't an actual concern because the testing infrastructure won't use the + // $_internalReadAtClusterTime option in any test suite where clean shutdown is expected + // to occur concurrently with tests running. + auto allCommittedTime = storageEngine->getAllCommittedTimestamp(); + invariant(!allCommittedTime.isNull()); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the all-committed timestamp. Requested clusterTime: " + << targetClusterTime.toString() + << "; all-committed timestamp: " + << allCommittedTime.toString(), + allCommittedTime >= targetClusterTime); + + // The $_internalReadAtClusterTime option causes any storage-layer cursors created + // during plan execution to read from a consistent snapshot of data at the supplied + // clusterTime, even across yields. + opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided, + targetClusterTime); + + // The $_internalReadAtClusterTime option also causes any storage-layer cursors created + // during plan execution to block on prepared transactions. + opCtx->recoveryUnit()->setIgnorePrepared(false); + } + // We lock the entire database in S-mode in order to ensure that the contents will not // change for the snapshot. 
auto lockMode = LockMode::MODE_S; - auto txnParticipant = TransactionParticipant::get(opCtx); - if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) { - // However, if we are inside a multi-statement transaction, then we only need to lock - // the database in intent mode to ensure that none of the collections get dropped. - lockMode = getLockModeForQuery(opCtx, boost::none); + if (opCtx->recoveryUnit()->getTimestampReadSource() == + RecoveryUnit::ReadSource::kProvided) { + // However, if we are performing a read at a timestamp, then we only need to lock the + // database in intent mode to ensure that none of the collections get dropped. + lockMode = LockMode::MODE_IS; } AutoGetDb autoDb(opCtx, ns, lockMode); Database* db = autoDb.getDb(); @@ -220,16 +283,14 @@ private: return ""; boost::optional<Lock::CollectionLock> collLock; - auto txnParticipant = TransactionParticipant::get(opCtx); - if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) { - // When inside a multi-statement transaction, we are only holding the database lock in + if (opCtx->recoveryUnit()->getTimestampReadSource() == + RecoveryUnit::ReadSource::kProvided) { + // When performing a read at a timestamp, we are only holding the database lock in // intent mode. We need to also acquire the collection lock in intent mode to ensure // reading from the consistent snapshot doesn't overlap with any catalog operations on // the collection. 
- invariant( - opCtx->lockState()->isDbLockedForMode(db->name(), getLockModeForQuery(opCtx, ns))); - collLock.emplace( - opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx, ns)); + invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS)); + collLock.emplace(opCtx->lockState(), fullCollectionName, MODE_IS); auto minSnapshot = collection->getMinimumVisibleSnapshot(); auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); diff --git a/src/mongo/db/commands/find_cmd.cpp b/src/mongo/db/commands/find_cmd.cpp index 0fde452ab39..a13d09d2fb7 100644 --- a/src/mongo/db/commands/find_cmd.cpp +++ b/src/mongo/db/commands/find_cmd.cpp @@ -37,6 +37,7 @@ #include "mongo/db/clientcursor.h" #include "mongo/db/commands.h" #include "mongo/db/commands/run_aggregate.h" +#include "mongo/db/commands/test_commands_enabled.h" #include "mongo/db/curop_failpoint_helpers.h" #include "mongo/db/cursor_manager.h" #include "mongo/db/db_raii.h" @@ -53,6 +54,7 @@ #include "mongo/db/service_context.h" #include "mongo/db/stats/counters.h" #include "mongo/db/stats/server_read_concern_metrics.h" +#include "mongo/db/storage/storage_engine.h" #include "mongo/db/transaction_participant.h" #include "mongo/rpc/get_status_from_command_result.h" #include "mongo/util/log.h" @@ -262,12 +264,76 @@ public: !txnParticipant->inActiveOrKilledMultiDocumentTransaction() || !qr->isReadOnce()); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when testing" + " commands are enabled", + !qr->getReadAtClusterTime() || getTestCommandsEnabled()); + + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when replication is" + " enabled", + !qr->getReadAtClusterTime() || replCoord->isReplEnabled()); + + auto* storageEngine = opCtx->getServiceContext()->getStorageEngine(); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported by storage engines" 
+ " that support document-level concurrency", + !qr->getReadAtClusterTime() || storageEngine->supportsDocLocking()); + // Validate term before acquiring locks, if provided. if (auto term = qr->getReplicationTerm()) { // Note: updateTerm returns ok if term stayed the same. uassertStatusOK(replCoord->updateTerm(opCtx, *term)); } + // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the + // collection via AutoGetCollectionForRead in order to ensure the comparison to the + // collection's minimum visible snapshot is accurate. + if (auto targetClusterTime = qr->getReadAtClusterTime()) { + // We aren't holding the global lock in intent mode, so it is possible after + // comparing 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime + // to go backwards or for the term to change due to replication rollback. This isn't + // an actual concern because the testing infrastructure won't use the + // $_internalReadAtClusterTime option in any test suite where rollback is expected + // to occur. + auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime(); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the last applied opTime. Requested clusterTime: " + << targetClusterTime->toString() + << "; last applied opTime: " + << lastAppliedOpTime.toString(), + lastAppliedOpTime.getTimestamp() >= targetClusterTime); + + // We aren't holding the global lock in intent mode, so it is possible for the + // global storage engine to have been destructed already as a result of the server + // shutting down. This isn't an actual concern because the testing infrastructure + // won't use the $_internalReadAtClusterTime option in any test suite where clean + // shutdown is expected to occur concurrently with tests running. 
+ auto allCommittedTime = storageEngine->getAllCommittedTimestamp(); + invariant(!allCommittedTime.isNull()); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the all-committed timestamp. Requested" + " clusterTime: " + << targetClusterTime->toString() + << "; all-committed timestamp: " + << allCommittedTime.toString(), + allCommittedTime >= targetClusterTime); + + // The $_internalReadAtClusterTime option causes any storage-layer cursors created + // during plan execution to read from a consistent snapshot of data at the supplied + // clusterTime, even across yields. + opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided, + targetClusterTime); + + // The $_internalReadAtClusterTime option also causes any storage-layer cursors + // created during plan execution to block on prepared transactions. + opCtx->recoveryUnit()->setIgnorePrepared(false); + } + // Acquire locks. If the query is on a view, we release our locks and convert the query // request into an aggregation command. boost::optional<AutoGetCollectionForReadCommand> ctx; diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp index 350383d706f..3c6f5c5ad5b 100644 --- a/src/mongo/db/commands/getmore_cmd.cpp +++ b/src/mongo/db/commands/getmore_cmd.cpp @@ -307,6 +307,32 @@ public: auto cursorManager = CursorManager::get(opCtx); auto cursorPin = uassertStatusOK(cursorManager->pinCursor(opCtx, _request.cursorid)); + { + // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the + // collection via AutoGetCollectionForRead in order to ensure the comparison to the + // collection's minimum visible snapshot is accurate. + PlanExecutor* exec = cursorPin->getExecutor(); + const auto* cq = exec->getCanonicalQuery(); + + if (auto clusterTime = + (cq ? 
cq->getQueryRequest().getReadAtClusterTime() : boost::none)) { + // We don't compare 'clusterTime' to the last applied opTime or to the + // all-committed timestamp because the testing infrastructure won't use the + // $_internalReadAtClusterTime option in any test suite where rollback is + // expected to occur. + + // The $_internalReadAtClusterTime option causes any storage-layer cursors + // created during plan execution to read from a consistent snapshot of data at + // the supplied clusterTime, even across yields. + opCtx->recoveryUnit()->setTimestampReadSource( + RecoveryUnit::ReadSource::kProvided, clusterTime); + + // The $_internalReadAtClusterTime option also causes any storage-layer cursors + // created during plan execution to block on prepared transactions. + opCtx->recoveryUnit()->setIgnorePrepared(false); + } + } + if (cursorPin->lockPolicy() == ClientCursorParams::LockPolicy::kLocksInternally) { if (!_request.nss.isCollectionlessCursorNamespace()) { const boost::optional<int> dbProfilingLevel = boost::none; diff --git a/src/mongo/db/query/query_request.cpp b/src/mongo/db/query/query_request.cpp index 7f02487ff7f..42c338bb5ec 100644 --- a/src/mongo/db/query/query_request.cpp +++ b/src/mongo/db/query/query_request.cpp @@ -103,6 +103,7 @@ const char kTermField[] = "term"; const char kOptionsField[] = "options"; const char kReadOnceField[] = "readOnce"; const char kAllowSpeculativeMajorityReadField[] = "allowSpeculativeMajorityRead"; +const char kInternalReadAtClusterTimeField[] = "$_internalReadAtClusterTime"; // Field names for sorting options. 
const char kNaturalSortField[] = "$natural"; @@ -381,6 +382,12 @@ StatusWith<unique_ptr<QueryRequest>> QueryRequest::parseFromFindCommand(unique_p return status; } qr->_allowSpeculativeMajorityRead = el.boolean(); + } else if (fieldName == kInternalReadAtClusterTimeField) { + Status status = checkFieldType(el, BSONType::bsonTimestamp); + if (!status.isOK()) { + return status; + } + qr->_internalReadAtClusterTime = el.timestamp(); } else if (!isGenericArgument(fieldName)) { return Status(ErrorCodes::FailedToParse, str::stream() << "Failed to parse: " << cmdObj.toString() << ". " @@ -549,6 +556,10 @@ void QueryRequest::asFindCommandInternal(BSONObjBuilder* cmdBuilder) const { if (_allowSpeculativeMajorityRead) { cmdBuilder->append(kAllowSpeculativeMajorityReadField, true); } + + if (_internalReadAtClusterTime) { + cmdBuilder->append(kInternalReadAtClusterTimeField, *_internalReadAtClusterTime); + } } void QueryRequest::addReturnKeyMetaProj() { @@ -1044,6 +1055,12 @@ StatusWith<BSONObj> QueryRequest::asAggregationCommand() const { << " not supported in aggregation."}; } + if (_internalReadAtClusterTime) { + return {ErrorCodes::InvalidPipelineOperator, + str::stream() << "Option " << kInternalReadAtClusterTimeField + << " not supported in aggregation."}; + } + // Now that we've successfully validated this QR, begin building the aggregation command. aggregationBuilder.append("aggregate", _nss.coll()); diff --git a/src/mongo/db/query/query_request.h b/src/mongo/db/query/query_request.h index 1d23544f7cd..8028ba452c0 100644 --- a/src/mongo/db/query/query_request.h +++ b/src/mongo/db/query/query_request.h @@ -394,6 +394,10 @@ public: return _allowSpeculativeMajorityRead; } + boost::optional<Timestamp> getReadAtClusterTime() const { + return _internalReadAtClusterTime; + } + /** * Return options as a bit vector. 
*/ @@ -522,6 +526,10 @@ private: bool _allowSpeculativeMajorityRead = false; boost::optional<long long> _replicationTerm; + + // The Timestamp that RecoveryUnit::setTimestampReadSource() should be called with. The optional + // should only ever be engaged when testing commands are enabled. + boost::optional<Timestamp> _internalReadAtClusterTime; }; } // namespace mongo diff --git a/src/mongo/db/transaction_validation.cpp b/src/mongo/db/transaction_validation.cpp index b0e95e61b12..19ae87ccca6 100644 --- a/src/mongo/db/transaction_validation.cpp +++ b/src/mongo/db/transaction_validation.cpp @@ -50,7 +50,6 @@ const StringMap<int> sessionCheckOutList = {{"abortTransaction", 1}, {"applyOps", 1}, {"commitTransaction", 1}, {"count", 1}, - {"dbHash", 1}, {"delete", 1}, {"distinct", 1}, {"doTxn", 1}, diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index ea8766968e6..9efa8ac7f2e 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -1494,11 +1494,16 @@ var ReplSetTest = function(opts) { this.getHashesUsingSessions = function(sessions, dbName, { filterCapped: filterCapped = true, - filterMapReduce: filterMapReduce = true, + filterMapReduce: filterMapReduce = true, readAtClusterTime, } = {}) { return sessions.map(session => { + const commandObj = {dbHash: 1}; + if (readAtClusterTime !== undefined) { + commandObj.$_internalReadAtClusterTime = readAtClusterTime; + } + const db = session.getDatabase(dbName); - const res = assert.commandWorked(db.runCommand({dbHash: 1})); + const res = assert.commandWorked(db.runCommand(commandObj)); // The "capped" field in the dbHash command response is new as of MongoDB 4.0. const cappedCollections = new Set(filterCapped ? 
res.capped : []); @@ -1528,7 +1533,7 @@ var ReplSetTest = function(opts) { }; this.getCollectionDiffUsingSessions = function( - primarySession, secondarySession, dbName, collNameOrUUID) { + primarySession, secondarySession, dbName, collNameOrUUID, readAtClusterTime) { function PeekableCursor(cursor) { let _stashedDoc; @@ -1557,11 +1562,16 @@ var ReplSetTest = function(opts) { const primaryDB = primarySession.getDatabase(dbName); const secondaryDB = secondarySession.getDatabase(dbName); - const primaryCursor = new PeekableCursor(new DBCommandCursor( - primaryDB, primaryDB.runCommand({find: collNameOrUUID, sort: {_id: 1}}))); + const commandObj = {find: collNameOrUUID, sort: {_id: 1}}; + if (readAtClusterTime !== undefined) { + commandObj.$_internalReadAtClusterTime = readAtClusterTime; + } + + const primaryCursor = + new PeekableCursor(new DBCommandCursor(primaryDB, primaryDB.runCommand(commandObj))); - const secondaryCursor = new PeekableCursor(new DBCommandCursor( - secondaryDB, secondaryDB.runCommand({find: collNameOrUUID, sort: {_id: 1}}))); + const secondaryCursor = new PeekableCursor( + new DBCommandCursor(secondaryDB, secondaryDB.runCommand(commandObj))); while (primaryCursor.hasNext() && secondaryCursor.hasNext()) { const primaryDoc = primaryCursor.peekNext(); |