summaryrefslogtreecommitdiff
path: root/src/mongo/db/commands
diff options
context:
space:
mode:
authorMax Hirschhorn <max.hirschhorn@mongodb.com>2019-02-11 15:43:27 -0500
committerMax Hirschhorn <max.hirschhorn@mongodb.com>2019-02-11 15:43:27 -0500
commit9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch)
tree26b5750c5088d745ab1fb93596d010501b0b4cbe /src/mongo/db/commands
parent691ab6da0c38f52f32c1028a8fa7447997ced255 (diff)
downloadmongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz
SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.
The new $_internalReadAtClusterTime option replaces all usages of running the dbHash command inside of a multi-statement transaction. It can be used to read from a consistent snapshot in place of specifying an atClusterTime read concern. Unlike multi-statement transactions, the new $_internalReadAtClusterTime option doesn't cause locks to be left on the server after returning a network response. It instead restores the snapshot to read from as part of handling the request.
Diffstat (limited to 'src/mongo/db/commands')
-rw-r--r--src/mongo/db/commands/dbhash.cpp99
-rw-r--r--src/mongo/db/commands/find_cmd.cpp66
-rw-r--r--src/mongo/db/commands/getmore_cmd.cpp26
3 files changed, 172 insertions, 19 deletions
diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp
index 669f707111a..0ed79280f17 100644
--- a/src/mongo/db/commands/dbhash.cpp
+++ b/src/mongo/db/commands/dbhash.cpp
@@ -43,10 +43,14 @@
#include "mongo/db/catalog/database_catalog_entry.h"
#include "mongo/db/catalog/index_catalog.h"
#include "mongo/db/commands.h"
+#include "mongo/db/commands/test_commands_enabled.h"
#include "mongo/db/db_raii.h"
#include "mongo/db/exec/working_set_common.h"
+#include "mongo/db/logical_clock.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/query/internal_plans.h"
+#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/db/storage/storage_engine.h"
#include "mongo/db/transaction_participant.h"
#include "mongo/stdx/mutex.h"
#include "mongo/util/log.h"
@@ -70,13 +74,6 @@ public:
return ReadWriteType::kRead;
}
- bool supportsReadConcern(const std::string& dbName,
- const BSONObj& cmdObj,
- repl::ReadConcernLevel level) const override {
- return level == repl::ReadConcernLevel::kLocalReadConcern ||
- level == repl::ReadConcernLevel::kSnapshotReadConcern;
- }
-
AllowedOnSecondary secondaryAllowed(ServiceContext*) const override {
return AllowedOnSecondary::kAlways;
}
@@ -114,14 +111,80 @@ public:
str::stream() << "Invalid db name: " << ns,
NamespaceString::validDBName(ns, NamespaceString::DollarInDbNameBehavior::Allow));
+ if (auto elem = cmdObj["$_internalReadAtClusterTime"]) {
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when testing"
+ " commands are enabled",
+ getTestCommandsEnabled());
+
+ auto* replCoord = repl::ReplicationCoordinator::get(opCtx);
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when replication is"
+ " enabled",
+ replCoord->isReplEnabled());
+
+ auto* storageEngine = opCtx->getServiceContext()->getStorageEngine();
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported by storage engines"
+ " that support document-level concurrency",
+ storageEngine->supportsDocLocking());
+
+ uassert(ErrorCodes::TypeMismatch,
+ "The '$_internalReadAtClusterTime' option must be a Timestamp",
+ elem.type() == BSONType::bsonTimestamp);
+
+ auto targetClusterTime = elem.timestamp();
+
+ // We aren't holding the global lock in intent mode, so it is possible after comparing
+ // 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime to go
+ // backwards or for the term to change due to replication rollback. This isn't an actual
+ // concern because the testing infrastructure won't use the $_internalReadAtClusterTime
+ // option in any test suite where rollback is expected to occur.
+ auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the last applied opTime. Requested clusterTime: "
+ << targetClusterTime.toString()
+ << "; last applied opTime: "
+ << lastAppliedOpTime.toString(),
+ lastAppliedOpTime.getTimestamp() >= targetClusterTime);
+
+ // We aren't holding the global lock in intent mode, so it is possible for the global
+ // storage engine to have been destructed already as a result of the server shutting
+ // down. This isn't an actual concern because the testing infrastructure won't use the
+ // $_internalReadAtClusterTime option in any test suite where clean shutdown is expected
+ // to occur concurrently with tests running.
+ auto allCommittedTime = storageEngine->getAllCommittedTimestamp();
+ invariant(!allCommittedTime.isNull());
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the all-committed timestamp. Requested clusterTime: "
+ << targetClusterTime.toString()
+ << "; all-committed timestamp: "
+ << allCommittedTime.toString(),
+ allCommittedTime >= targetClusterTime);
+
+ // The $_internalReadAtClusterTime option causes any storage-layer cursors created
+ // during plan execution to read from a consistent snapshot of data at the supplied
+ // clusterTime, even across yields.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
+ targetClusterTime);
+
+ // The $_internalReadAtClusterTime option also causes any storage-layer cursors created
+ // during plan execution to block on prepared transactions.
+ opCtx->recoveryUnit()->setIgnorePrepared(false);
+ }
+
// We lock the entire database in S-mode in order to ensure that the contents will not
// change for the snapshot.
auto lockMode = LockMode::MODE_S;
- auto txnParticipant = TransactionParticipant::get(opCtx);
- if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) {
- // However, if we are inside a multi-statement transaction, then we only need to lock
- // the database in intent mode to ensure that none of the collections get dropped.
- lockMode = getLockModeForQuery(opCtx, boost::none);
+ if (opCtx->recoveryUnit()->getTimestampReadSource() ==
+ RecoveryUnit::ReadSource::kProvided) {
+ // However, if we are performing a read at a timestamp, then we only need to lock the
+ // database in intent mode to ensure that none of the collections get dropped.
+ lockMode = LockMode::MODE_IS;
}
AutoGetDb autoDb(opCtx, ns, lockMode);
Database* db = autoDb.getDb();
@@ -220,16 +283,14 @@ private:
return "";
boost::optional<Lock::CollectionLock> collLock;
- auto txnParticipant = TransactionParticipant::get(opCtx);
- if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) {
- // When inside a multi-statement transaction, we are only holding the database lock in
+ if (opCtx->recoveryUnit()->getTimestampReadSource() ==
+ RecoveryUnit::ReadSource::kProvided) {
+ // When performing a read at a timestamp, we are only holding the database lock in
// intent mode. We need to also acquire the collection lock in intent mode to ensure
// reading from the consistent snapshot doesn't overlap with any catalog operations on
// the collection.
- invariant(
- opCtx->lockState()->isDbLockedForMode(db->name(), getLockModeForQuery(opCtx, ns)));
- collLock.emplace(
- opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx, ns));
+ invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS));
+ collLock.emplace(opCtx->lockState(), fullCollectionName, MODE_IS);
auto minSnapshot = collection->getMinimumVisibleSnapshot();
auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
diff --git a/src/mongo/db/commands/find_cmd.cpp b/src/mongo/db/commands/find_cmd.cpp
index 0fde452ab39..a13d09d2fb7 100644
--- a/src/mongo/db/commands/find_cmd.cpp
+++ b/src/mongo/db/commands/find_cmd.cpp
@@ -37,6 +37,7 @@
#include "mongo/db/clientcursor.h"
#include "mongo/db/commands.h"
#include "mongo/db/commands/run_aggregate.h"
+#include "mongo/db/commands/test_commands_enabled.h"
#include "mongo/db/curop_failpoint_helpers.h"
#include "mongo/db/cursor_manager.h"
#include "mongo/db/db_raii.h"
@@ -53,6 +54,7 @@
#include "mongo/db/service_context.h"
#include "mongo/db/stats/counters.h"
#include "mongo/db/stats/server_read_concern_metrics.h"
+#include "mongo/db/storage/storage_engine.h"
#include "mongo/db/transaction_participant.h"
#include "mongo/rpc/get_status_from_command_result.h"
#include "mongo/util/log.h"
@@ -262,12 +264,76 @@ public:
!txnParticipant->inActiveOrKilledMultiDocumentTransaction() ||
!qr->isReadOnce());
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when testing"
+ " commands are enabled",
+ !qr->getReadAtClusterTime() || getTestCommandsEnabled());
+
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when replication is"
+ " enabled",
+ !qr->getReadAtClusterTime() || replCoord->isReplEnabled());
+
+ auto* storageEngine = opCtx->getServiceContext()->getStorageEngine();
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported by storage engines"
+ " that support document-level concurrency",
+ !qr->getReadAtClusterTime() || storageEngine->supportsDocLocking());
+
// Validate term before acquiring locks, if provided.
if (auto term = qr->getReplicationTerm()) {
// Note: updateTerm returns ok if term stayed the same.
uassertStatusOK(replCoord->updateTerm(opCtx, *term));
}
+ // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the
+ // collection via AutoGetCollectionForRead in order to ensure the comparison to the
+ // collection's minimum visible snapshot is accurate.
+ if (auto targetClusterTime = qr->getReadAtClusterTime()) {
+ // We aren't holding the global lock in intent mode, so it is possible after
+ // comparing 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime
+ // to go backwards or for the term to change due to replication rollback. This isn't
+ // an actual concern because the testing infrastructure won't use the
+ // $_internalReadAtClusterTime option in any test suite where rollback is expected
+ // to occur.
+ auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the last applied opTime. Requested clusterTime: "
+ << targetClusterTime->toString()
+ << "; last applied opTime: "
+ << lastAppliedOpTime.toString(),
+ lastAppliedOpTime.getTimestamp() >= targetClusterTime);
+
+ // We aren't holding the global lock in intent mode, so it is possible for the
+ // global storage engine to have been destructed already as a result of the server
+ // shutting down. This isn't an actual concern because the testing infrastructure
+ // won't use the $_internalReadAtClusterTime option in any test suite where clean
+ // shutdown is expected to occur concurrently with tests running.
+ auto allCommittedTime = storageEngine->getAllCommittedTimestamp();
+ invariant(!allCommittedTime.isNull());
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the all-committed timestamp. Requested"
+ " clusterTime: "
+ << targetClusterTime->toString()
+ << "; all-committed timestamp: "
+ << allCommittedTime.toString(),
+ allCommittedTime >= targetClusterTime);
+
+ // The $_internalReadAtClusterTime option causes any storage-layer cursors created
+ // during plan execution to read from a consistent snapshot of data at the supplied
+ // clusterTime, even across yields.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
+ targetClusterTime);
+
+ // The $_internalReadAtClusterTime option also causes any storage-layer cursors
+ // created during plan execution to block on prepared transactions.
+ opCtx->recoveryUnit()->setIgnorePrepared(false);
+ }
+
// Acquire locks. If the query is on a view, we release our locks and convert the query
// request into an aggregation command.
boost::optional<AutoGetCollectionForReadCommand> ctx;
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 350383d706f..3c6f5c5ad5b 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -307,6 +307,32 @@ public:
auto cursorManager = CursorManager::get(opCtx);
auto cursorPin = uassertStatusOK(cursorManager->pinCursor(opCtx, _request.cursorid));
+ {
+ // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the
+ // collection via AutoGetCollectionForRead in order to ensure the comparison to the
+ // collection's minimum visible snapshot is accurate.
+ PlanExecutor* exec = cursorPin->getExecutor();
+ const auto* cq = exec->getCanonicalQuery();
+
+ if (auto clusterTime =
+ (cq ? cq->getQueryRequest().getReadAtClusterTime() : boost::none)) {
+ // We don't compare 'clusterTime' to the last applied opTime or to the
+ // all-committed timestamp because the testing infrastructure won't use the
+ // $_internalReadAtClusterTime option in any test suite where rollback is
+ // expected to occur.
+
+ // The $_internalReadAtClusterTime option causes any storage-layer cursors
+ // created during plan execution to read from a consistent snapshot of data at
+ // the supplied clusterTime, even across yields.
+ opCtx->recoveryUnit()->setTimestampReadSource(
+ RecoveryUnit::ReadSource::kProvided, clusterTime);
+
+ // The $_internalReadAtClusterTime option also causes any storage-layer cursors
+ // created during plan execution to block on prepared transactions.
+ opCtx->recoveryUnit()->setIgnorePrepared(false);
+ }
+ }
+
if (cursorPin->lockPolicy() == ClientCursorParams::LockPolicy::kLocksInternally) {
if (!_request.nss.isCollectionlessCursorNamespace()) {
const boost::optional<int> dbProfilingLevel = boost::none;