SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.

The new $_internalReadAtClusterTime option replaces all usages of running the dbHash command inside a multi-statement transaction. It can be used to read from a consistent snapshot in place of specifying an atClusterTime read concern. Unlike multi-statement transactions, the new $_internalReadAtClusterTime option doesn't cause locks to be left on the server after returning a network response. Instead, the snapshot to read from is restored as part of handling each request.
author: Max Hirschhorn <max.hirschhorn@mongodb.com> 2019-02-11 15:43:27 -0500
committer: Max Hirschhorn <max.hirschhorn@mongodb.com> 2019-02-11 15:43:27 -0500
commit: 9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch)
tree: 26b5750c5088d745ab1fb93596d010501b0b4cbe /src/mongo/db/commands
parent: 691ab6da0c38f52f32c1028a8fa7447997ced255 (diff)
download: mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz
3 files changed, 172 insertions, 19 deletions
diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp
index 669f707111a..0ed79280f17 100644
--- a/src/mongo/db/commands/dbhash.cpp
+++ b/src/mongo/db/commands/dbhash.cpp
@@ -43,10 +43,14 @@
 #include "mongo/db/catalog/database_catalog_entry.h"
 #include "mongo/db/catalog/index_catalog.h"
 #include "mongo/db/commands.h"
+#include "mongo/db/commands/test_commands_enabled.h"
 #include "mongo/db/db_raii.h"
 #include "mongo/db/exec/working_set_common.h"
+#include "mongo/db/logical_clock.h"
 #include "mongo/db/namespace_string.h"
 #include "mongo/db/query/internal_plans.h"
+#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/db/storage/storage_engine.h"
 #include "mongo/db/transaction_participant.h"
 #include "mongo/stdx/mutex.h"
 #include "mongo/util/log.h"
@@ -70,13 +74,6 @@ public:
         return ReadWriteType::kRead;
     }
 
-    bool supportsReadConcern(const std::string& dbName,
-                             const BSONObj& cmdObj,
-                             repl::ReadConcernLevel level) const override {
-        return level == repl::ReadConcernLevel::kLocalReadConcern ||
-            level == repl::ReadConcernLevel::kSnapshotReadConcern;
-    }
-
     AllowedOnSecondary secondaryAllowed(ServiceContext*) const override {
         return AllowedOnSecondary::kAlways;
     }
@@ -114,14 +111,80 @@ public:
                 str::stream() << "Invalid db name: " << ns,
                 NamespaceString::validDBName(ns, NamespaceString::DollarInDbNameBehavior::Allow));
 
+        if (auto elem = cmdObj["$_internalReadAtClusterTime"]) {
+            uassert(ErrorCodes::InvalidOptions,
+                    "The '$_internalReadAtClusterTime' option is only supported when testing"
+                    " commands are enabled",
+                    getTestCommandsEnabled());
+
+            auto* replCoord = repl::ReplicationCoordinator::get(opCtx);
+            uassert(ErrorCodes::InvalidOptions,
+                    "The '$_internalReadAtClusterTime' option is only supported when replication is"
+                    " enabled",
+                    replCoord->isReplEnabled());
+
+            auto* storageEngine = opCtx->getServiceContext()->getStorageEngine();
+            uassert(ErrorCodes::InvalidOptions,
+                    "The '$_internalReadAtClusterTime' option is only supported by storage engines"
+                    " that support document-level concurrency",
+                    storageEngine->supportsDocLocking());
+
+            uassert(ErrorCodes::TypeMismatch,
+                    "The '$_internalReadAtClusterTime' option must be a Timestamp",
+                    elem.type() == BSONType::bsonTimestamp);
+
+            auto targetClusterTime = elem.timestamp();
+
+            // We aren't holding the global lock in intent mode, so it is possible after comparing
+            // 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime to go
+            // backwards or for the term to change due to replication rollback. This isn't an actual
+            // concern because the testing infrastructure won't use the $_internalReadAtClusterTime
+            // option in any test suite where rollback is expected to occur.
+            auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();
+
+            uassert(ErrorCodes::InvalidOptions,
+                    str::stream() << "$_internalReadAtClusterTime value must not be greater"
+                                     " than the last applied opTime. Requested clusterTime: "
+                                  << targetClusterTime.toString()
+                                  << "; last applied opTime: "
+                                  << lastAppliedOpTime.toString(),
+                    lastAppliedOpTime.getTimestamp() >= targetClusterTime);
+
+            // We aren't holding the global lock in intent mode, so it is possible for the global
+            // storage engine to have been destructed already as a result of the server shutting
+            // down. This isn't an actual concern because the testing infrastructure won't use the
+            // $_internalReadAtClusterTime option in any test suite where clean shutdown is expected
+            // to occur concurrently with tests running.
+            auto allCommittedTime = storageEngine->getAllCommittedTimestamp();
+            invariant(!allCommittedTime.isNull());
+
+            uassert(ErrorCodes::InvalidOptions,
+                    str::stream() << "$_internalReadAtClusterTime value must not be greater"
+                                     " than the all-committed timestamp. Requested clusterTime: "
+                                  << targetClusterTime.toString()
+                                  << "; all-committed timestamp: "
+                                  << allCommittedTime.toString(),
+                    allCommittedTime >= targetClusterTime);
+
+            // The $_internalReadAtClusterTime option causes any storage-layer cursors created
+            // during plan execution to read from a consistent snapshot of data at the supplied
+            // clusterTime, even across yields.
+            opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
+                                                          targetClusterTime);
+
+            // The $_internalReadAtClusterTime option also causes any storage-layer cursors created
+            // during plan execution to block on prepared transactions.
+            opCtx->recoveryUnit()->setIgnorePrepared(false);
+        }
+
         // We lock the entire database in S-mode in order to ensure that the contents will not
         // change for the snapshot.
         auto lockMode = LockMode::MODE_S;
-        auto txnParticipant = TransactionParticipant::get(opCtx);
-        if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) {
-            // However, if we are inside a multi-statement transaction, then we only need to lock
-            // the database in intent mode to ensure that none of the collections get dropped.
-            lockMode = getLockModeForQuery(opCtx, boost::none);
+        if (opCtx->recoveryUnit()->getTimestampReadSource() ==
+            RecoveryUnit::ReadSource::kProvided) {
+            // However, if we are performing a read at a timestamp, then we only need to lock the
+            // database in intent mode to ensure that none of the collections get dropped.
+            lockMode = LockMode::MODE_IS;
         }
         AutoGetDb autoDb(opCtx, ns, lockMode);
         Database* db = autoDb.getDb();
@@ -220,16 +283,14 @@ private:
             return "";
 
         boost::optional<Lock::CollectionLock> collLock;
-        auto txnParticipant = TransactionParticipant::get(opCtx);
-        if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) {
-            // When inside a multi-statement transaction, we are only holding the database lock in
+        if (opCtx->recoveryUnit()->getTimestampReadSource() ==
+            RecoveryUnit::ReadSource::kProvided) {
+            // When performing a read at a timestamp, we are only holding the database lock in
             // intent mode. We need to also acquire the collection lock in intent mode to ensure
             // reading from the consistent snapshot doesn't overlap with any catalog operations on
             // the collection.
-            invariant(
-                opCtx->lockState()->isDbLockedForMode(db->name(), getLockModeForQuery(opCtx, ns)));
-            collLock.emplace(
-                opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx, ns));
+            invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS));
+            collLock.emplace(opCtx->lockState(), fullCollectionName, MODE_IS);
 
             auto minSnapshot = collection->getMinimumVisibleSnapshot();
             auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
diff --git a/src/mongo/db/commands/find_cmd.cpp b/src/mongo/db/commands/find_cmd.cpp
index 0fde452ab39..a13d09d2fb7 100644
--- a/src/mongo/db/commands/find_cmd.cpp
+++ b/src/mongo/db/commands/find_cmd.cpp
@@ -37,6 +37,7 @@
 #include "mongo/db/clientcursor.h"
 #include "mongo/db/commands.h"
 #include "mongo/db/commands/run_aggregate.h"
+#include "mongo/db/commands/test_commands_enabled.h"
 #include "mongo/db/curop_failpoint_helpers.h"
 #include "mongo/db/cursor_manager.h"
 #include "mongo/db/db_raii.h"
@@ -53,6 +54,7 @@
 #include "mongo/db/service_context.h"
 #include "mongo/db/stats/counters.h"
 #include "mongo/db/stats/server_read_concern_metrics.h"
+#include "mongo/db/storage/storage_engine.h"
 #include "mongo/db/transaction_participant.h"
 #include "mongo/rpc/get_status_from_command_result.h"
 #include "mongo/util/log.h"
@@ -262,12 +264,76 @@ public:
                         !txnParticipant->inActiveOrKilledMultiDocumentTransaction() ||
                         !qr->isReadOnce());
 
+            uassert(ErrorCodes::InvalidOptions,
+                    "The '$_internalReadAtClusterTime' option is only supported when testing"
+                    " commands are enabled",
+                    !qr->getReadAtClusterTime() || getTestCommandsEnabled());
+
+            uassert(ErrorCodes::InvalidOptions,
+                    "The '$_internalReadAtClusterTime' option is only supported when replication is"
+                    " enabled",
+                    !qr->getReadAtClusterTime() || replCoord->isReplEnabled());
+
+            auto* storageEngine = opCtx->getServiceContext()->getStorageEngine();
+            uassert(ErrorCodes::InvalidOptions,
+                    "The '$_internalReadAtClusterTime' option is only supported by storage engines"
+                    " that support document-level concurrency",
+                    !qr->getReadAtClusterTime() || storageEngine->supportsDocLocking());
+
             // Validate term before acquiring locks, if provided.
             if (auto term = qr->getReplicationTerm()) {
                 // Note: updateTerm returns ok if term stayed the same.
                 uassertStatusOK(replCoord->updateTerm(opCtx, *term));
             }
 
+            // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the
+            // collection via AutoGetCollectionForRead in order to ensure the comparison to the
+            // collection's minimum visible snapshot is accurate.
+            if (auto targetClusterTime = qr->getReadAtClusterTime()) {
+                // We aren't holding the global lock in intent mode, so it is possible after
+                // comparing 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime
+                // to go backwards or for the term to change due to replication rollback. This isn't
+                // an actual concern because the testing infrastructure won't use the
+                // $_internalReadAtClusterTime option in any test suite where rollback is expected
+                // to occur.
+                auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();
+
+                uassert(ErrorCodes::InvalidOptions,
+                        str::stream() << "$_internalReadAtClusterTime value must not be greater"
+                                         " than the last applied opTime. Requested clusterTime: "
+                                      << targetClusterTime->toString()
+                                      << "; last applied opTime: "
+                                      << lastAppliedOpTime.toString(),
+                        lastAppliedOpTime.getTimestamp() >= targetClusterTime);
+
+                // We aren't holding the global lock in intent mode, so it is possible for the
+                // global storage engine to have been destructed already as a result of the server
+                // shutting down. This isn't an actual concern because the testing infrastructure
+                // won't use the $_internalReadAtClusterTime option in any test suite where clean
+                // shutdown is expected to occur concurrently with tests running.
+                auto allCommittedTime = storageEngine->getAllCommittedTimestamp();
+                invariant(!allCommittedTime.isNull());
+
+                uassert(ErrorCodes::InvalidOptions,
+                        str::stream() << "$_internalReadAtClusterTime value must not be greater"
+                                         " than the all-committed timestamp. Requested"
+                                         " clusterTime: "
+                                      << targetClusterTime->toString()
+                                      << "; all-committed timestamp: "
+                                      << allCommittedTime.toString(),
+                        allCommittedTime >= targetClusterTime);
+
+                // The $_internalReadAtClusterTime option causes any storage-layer cursors created
+                // during plan execution to read from a consistent snapshot of data at the supplied
+                // clusterTime, even across yields.
+                opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
+                                                              targetClusterTime);
+
+                // The $_internalReadAtClusterTime option also causes any storage-layer cursors
+                // created during plan execution to block on prepared transactions.
+                opCtx->recoveryUnit()->setIgnorePrepared(false);
+            }
+
             // Acquire locks. If the query is on a view, we release our locks and convert the query
             // request into an aggregation command.
             boost::optional<AutoGetCollectionForReadCommand> ctx;
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 350383d706f..3c6f5c5ad5b 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -307,6 +307,32 @@ public:
             auto cursorManager = CursorManager::get(opCtx);
             auto cursorPin = uassertStatusOK(cursorManager->pinCursor(opCtx, _request.cursorid));
 
+            {
+                // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the
+                // collection via AutoGetCollectionForRead in order to ensure the comparison to the
+                // collection's minimum visible snapshot is accurate.
+                PlanExecutor* exec = cursorPin->getExecutor();
+                const auto* cq = exec->getCanonicalQuery();
+
+                if (auto clusterTime =
+                        (cq ? cq->getQueryRequest().getReadAtClusterTime() : boost::none)) {
+                    // We don't compare 'clusterTime' to the last applied opTime or to the
+                    // all-committed timestamp because the testing infrastructure won't use the
+                    // $_internalReadAtClusterTime option in any test suite where rollback is
+                    // expected to occur.
+
+                    // The $_internalReadAtClusterTime option causes any storage-layer cursors
+                    // created during plan execution to read from a consistent snapshot of data at
+                    // the supplied clusterTime, even across yields.
+                    opCtx->recoveryUnit()->setTimestampReadSource(
+                        RecoveryUnit::ReadSource::kProvided, clusterTime);
+
+                    // The $_internalReadAtClusterTime option also causes any storage-layer cursors
+                    // created during plan execution to block on prepared transactions.
+                    opCtx->recoveryUnit()->setIgnorePrepared(false);
+                }
+            }
+
             if (cursorPin->lockPolicy() == ClientCursorParams::LockPolicy::kLocksInternally) {
                 if (!_request.nss.isCollectionlessCursorNamespace()) {
                     const boost::optional<int> dbProfilingLevel = boost::none;
author	Max Hirschhorn <max.hirschhorn@mongodb.com>	2019-02-11 15:43:27 -0500
committer	Max Hirschhorn <max.hirschhorn@mongodb.com>	2019-02-11 15:43:27 -0500
commit	9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch)
tree	26b5750c5088d745ab1fb93596d010501b0b4cbe /src/mongo/db/commands
parent	691ab6da0c38f52f32c1028a8fa7447997ced255 (diff)
download	mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz