diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2019-02-11 15:43:27 -0500 |
---|---|---|
committer | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2019-02-11 15:43:27 -0500 |
commit | 9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch) | |
tree | 26b5750c5088d745ab1fb93596d010501b0b4cbe /src/mongo/db/commands | |
parent | 691ab6da0c38f52f32c1028a8fa7447997ced255 (diff) | |
download | mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz |
SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.
The new $_internalReadAtClusterTime option replaces all uses of the
dbHash command inside a multi-statement transaction. It can be used to
read from a consistent snapshot instead of specifying an atClusterTime
read concern.
Unlike multi-statement transactions, the new $_internalReadAtClusterTime
option doesn't leave locks held on the server after a network response
has been returned. Instead, the snapshot to read from is restored as part
of handling each request.
Diffstat (limited to 'src/mongo/db/commands')
-rw-r--r-- | src/mongo/db/commands/dbhash.cpp | 99 | ||||
-rw-r--r-- | src/mongo/db/commands/find_cmd.cpp | 66 | ||||
-rw-r--r-- | src/mongo/db/commands/getmore_cmd.cpp | 26 |
3 files changed, 172 insertions, 19 deletions
diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp index 669f707111a..0ed79280f17 100644 --- a/src/mongo/db/commands/dbhash.cpp +++ b/src/mongo/db/commands/dbhash.cpp @@ -43,10 +43,14 @@ #include "mongo/db/catalog/database_catalog_entry.h" #include "mongo/db/catalog/index_catalog.h" #include "mongo/db/commands.h" +#include "mongo/db/commands/test_commands_enabled.h" #include "mongo/db/db_raii.h" #include "mongo/db/exec/working_set_common.h" +#include "mongo/db/logical_clock.h" #include "mongo/db/namespace_string.h" #include "mongo/db/query/internal_plans.h" +#include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/storage/storage_engine.h" #include "mongo/db/transaction_participant.h" #include "mongo/stdx/mutex.h" #include "mongo/util/log.h" @@ -70,13 +74,6 @@ public: return ReadWriteType::kRead; } - bool supportsReadConcern(const std::string& dbName, - const BSONObj& cmdObj, - repl::ReadConcernLevel level) const override { - return level == repl::ReadConcernLevel::kLocalReadConcern || - level == repl::ReadConcernLevel::kSnapshotReadConcern; - } - AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { return AllowedOnSecondary::kAlways; } @@ -114,14 +111,80 @@ public: str::stream() << "Invalid db name: " << ns, NamespaceString::validDBName(ns, NamespaceString::DollarInDbNameBehavior::Allow)); + if (auto elem = cmdObj["$_internalReadAtClusterTime"]) { + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when testing" + " commands are enabled", + getTestCommandsEnabled()); + + auto* replCoord = repl::ReplicationCoordinator::get(opCtx); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when replication is" + " enabled", + replCoord->isReplEnabled()); + + auto* storageEngine = opCtx->getServiceContext()->getStorageEngine(); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only 
supported by storage engines" + " that support document-level concurrency", + storageEngine->supportsDocLocking()); + + uassert(ErrorCodes::TypeMismatch, + "The '$_internalReadAtClusterTime' option must be a Timestamp", + elem.type() == BSONType::bsonTimestamp); + + auto targetClusterTime = elem.timestamp(); + + // We aren't holding the global lock in intent mode, so it is possible after comparing + // 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime to go + // backwards or for the term to change due to replication rollback. This isn't an actual + // concern because the testing infrastructure won't use the $_internalReadAtClusterTime + // option in any test suite where rollback is expected to occur. + auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime(); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the last applied opTime. Requested clusterTime: " + << targetClusterTime.toString() + << "; last applied opTime: " + << lastAppliedOpTime.toString(), + lastAppliedOpTime.getTimestamp() >= targetClusterTime); + + // We aren't holding the global lock in intent mode, so it is possible for the global + // storage engine to have been destructed already as a result of the server shutting + // down. This isn't an actual concern because the testing infrastructure won't use the + // $_internalReadAtClusterTime option in any test suite where clean shutdown is expected + // to occur concurrently with tests running. + auto allCommittedTime = storageEngine->getAllCommittedTimestamp(); + invariant(!allCommittedTime.isNull()); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the all-committed timestamp. 
Requested clusterTime: " + << targetClusterTime.toString() + << "; all-committed timestamp: " + << allCommittedTime.toString(), + allCommittedTime >= targetClusterTime); + + // The $_internalReadAtClusterTime option causes any storage-layer cursors created + // during plan execution to read from a consistent snapshot of data at the supplied + // clusterTime, even across yields. + opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided, + targetClusterTime); + + // The $_internalReadAtClusterTime option also causes any storage-layer cursors created + // during plan execution to block on prepared transactions. + opCtx->recoveryUnit()->setIgnorePrepared(false); + } + // We lock the entire database in S-mode in order to ensure that the contents will not // change for the snapshot. auto lockMode = LockMode::MODE_S; - auto txnParticipant = TransactionParticipant::get(opCtx); - if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) { - // However, if we are inside a multi-statement transaction, then we only need to lock - // the database in intent mode to ensure that none of the collections get dropped. - lockMode = getLockModeForQuery(opCtx, boost::none); + if (opCtx->recoveryUnit()->getTimestampReadSource() == + RecoveryUnit::ReadSource::kProvided) { + // However, if we are performing a read at a timestamp, then we only need to lock the + // database in intent mode to ensure that none of the collections get dropped. 
+ lockMode = LockMode::MODE_IS; } AutoGetDb autoDb(opCtx, ns, lockMode); Database* db = autoDb.getDb(); @@ -220,16 +283,14 @@ private: return ""; boost::optional<Lock::CollectionLock> collLock; - auto txnParticipant = TransactionParticipant::get(opCtx); - if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) { - // When inside a multi-statement transaction, we are only holding the database lock in + if (opCtx->recoveryUnit()->getTimestampReadSource() == + RecoveryUnit::ReadSource::kProvided) { + // When performing a read at a timestamp, we are only holding the database lock in // intent mode. We need to also acquire the collection lock in intent mode to ensure // reading from the consistent snapshot doesn't overlap with any catalog operations on // the collection. - invariant( - opCtx->lockState()->isDbLockedForMode(db->name(), getLockModeForQuery(opCtx, ns))); - collLock.emplace( - opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx, ns)); + invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS)); + collLock.emplace(opCtx->lockState(), fullCollectionName, MODE_IS); auto minSnapshot = collection->getMinimumVisibleSnapshot(); auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(); diff --git a/src/mongo/db/commands/find_cmd.cpp b/src/mongo/db/commands/find_cmd.cpp index 0fde452ab39..a13d09d2fb7 100644 --- a/src/mongo/db/commands/find_cmd.cpp +++ b/src/mongo/db/commands/find_cmd.cpp @@ -37,6 +37,7 @@ #include "mongo/db/clientcursor.h" #include "mongo/db/commands.h" #include "mongo/db/commands/run_aggregate.h" +#include "mongo/db/commands/test_commands_enabled.h" #include "mongo/db/curop_failpoint_helpers.h" #include "mongo/db/cursor_manager.h" #include "mongo/db/db_raii.h" @@ -53,6 +54,7 @@ #include "mongo/db/service_context.h" #include "mongo/db/stats/counters.h" #include "mongo/db/stats/server_read_concern_metrics.h" +#include "mongo/db/storage/storage_engine.h" #include "mongo/db/transaction_participant.h" 
#include "mongo/rpc/get_status_from_command_result.h" #include "mongo/util/log.h" @@ -262,12 +264,76 @@ public: !txnParticipant->inActiveOrKilledMultiDocumentTransaction() || !qr->isReadOnce()); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when testing" + " commands are enabled", + !qr->getReadAtClusterTime() || getTestCommandsEnabled()); + + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported when replication is" + " enabled", + !qr->getReadAtClusterTime() || replCoord->isReplEnabled()); + + auto* storageEngine = opCtx->getServiceContext()->getStorageEngine(); + uassert(ErrorCodes::InvalidOptions, + "The '$_internalReadAtClusterTime' option is only supported by storage engines" + " that support document-level concurrency", + !qr->getReadAtClusterTime() || storageEngine->supportsDocLocking()); + // Validate term before acquiring locks, if provided. if (auto term = qr->getReplicationTerm()) { // Note: updateTerm returns ok if term stayed the same. uassertStatusOK(replCoord->updateTerm(opCtx, *term)); } + // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the + // collection via AutoGetCollectionForRead in order to ensure the comparison to the + // collection's minimum visible snapshot is accurate. + if (auto targetClusterTime = qr->getReadAtClusterTime()) { + // We aren't holding the global lock in intent mode, so it is possible after + // comparing 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime + // to go backwards or for the term to change due to replication rollback. This isn't + // an actual concern because the testing infrastructure won't use the + // $_internalReadAtClusterTime option in any test suite where rollback is expected + // to occur. 
+ auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime(); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the last applied opTime. Requested clusterTime: " + << targetClusterTime->toString() + << "; last applied opTime: " + << lastAppliedOpTime.toString(), + lastAppliedOpTime.getTimestamp() >= targetClusterTime); + + // We aren't holding the global lock in intent mode, so it is possible for the + // global storage engine to have been destructed already as a result of the server + // shutting down. This isn't an actual concern because the testing infrastructure + // won't use the $_internalReadAtClusterTime option in any test suite where clean + // shutdown is expected to occur concurrently with tests running. + auto allCommittedTime = storageEngine->getAllCommittedTimestamp(); + invariant(!allCommittedTime.isNull()); + + uassert(ErrorCodes::InvalidOptions, + str::stream() << "$_internalReadAtClusterTime value must not be greater" + " than the all-committed timestamp. Requested" + " clusterTime: " + << targetClusterTime->toString() + << "; all-committed timestamp: " + << allCommittedTime.toString(), + allCommittedTime >= targetClusterTime); + + // The $_internalReadAtClusterTime option causes any storage-layer cursors created + // during plan execution to read from a consistent snapshot of data at the supplied + // clusterTime, even across yields. + opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided, + targetClusterTime); + + // The $_internalReadAtClusterTime option also causes any storage-layer cursors + // created during plan execution to block on prepared transactions. + opCtx->recoveryUnit()->setIgnorePrepared(false); + } + // Acquire locks. If the query is on a view, we release our locks and convert the query // request into an aggregation command. 
boost::optional<AutoGetCollectionForReadCommand> ctx; diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp index 350383d706f..3c6f5c5ad5b 100644 --- a/src/mongo/db/commands/getmore_cmd.cpp +++ b/src/mongo/db/commands/getmore_cmd.cpp @@ -307,6 +307,32 @@ public: auto cursorManager = CursorManager::get(opCtx); auto cursorPin = uassertStatusOK(cursorManager->pinCursor(opCtx, _request.cursorid)); + { + // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the + // collection via AutoGetCollectionForRead in order to ensure the comparison to the + // collection's minimum visible snapshot is accurate. + PlanExecutor* exec = cursorPin->getExecutor(); + const auto* cq = exec->getCanonicalQuery(); + + if (auto clusterTime = + (cq ? cq->getQueryRequest().getReadAtClusterTime() : boost::none)) { + // We don't compare 'clusterTime' to the last applied opTime or to the + // all-committed timestamp because the testing infrastructure won't use the + // $_internalReadAtClusterTime option in any test suite where rollback is + // expected to occur. + + // The $_internalReadAtClusterTime option causes any storage-layer cursors + // created during plan execution to read from a consistent snapshot of data at + // the supplied clusterTime, even across yields. + opCtx->recoveryUnit()->setTimestampReadSource( + RecoveryUnit::ReadSource::kProvided, clusterTime); + + // The $_internalReadAtClusterTime option also causes any storage-layer cursors + // created during plan execution to block on prepared transactions. + opCtx->recoveryUnit()->setIgnorePrepared(false); + } + } + if (cursorPin->lockPolicy() == ClientCursorParams::LockPolicy::kLocksInternally) { if (!_request.nss.isCollectionlessCursorNamespace()) { const boost::optional<int> dbProfilingLevel = boost::none; |