summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Max Hirschhorn <max.hirschhorn@mongodb.com> 2019-02-11 15:43:27 -0500
committer: Max Hirschhorn <max.hirschhorn@mongodb.com> 2019-02-11 15:43:27 -0500
commit: 9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch)
tree: 26b5750c5088d745ab1fb93596d010501b0b4cbe /src
parent: 691ab6da0c38f52f32c1028a8fa7447997ced255 (diff)
download: mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz
SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.
The new $_internalReadAtClusterTime option replaces all usages of running the dbHash command inside of a multi-statement transaction. It can be used to read from a consistent snapshot in place of specifying an atClusterTime read concern. Unlike multi-statement transactions, the new $_internalReadAtClusterTime option doesn't cause locks to be left on the server after returning a network response. It instead restores the snapshot to read from as part of handling the request.
Diffstat (limited to 'src')
-rw-r--r--src/mongo/db/commands.cpp8
-rw-r--r--src/mongo/db/commands/dbhash.cpp99
-rw-r--r--src/mongo/db/commands/find_cmd.cpp66
-rw-r--r--src/mongo/db/commands/getmore_cmd.cpp26
-rw-r--r--src/mongo/db/query/query_request.cpp17
-rw-r--r--src/mongo/db/query/query_request.h8
-rw-r--r--src/mongo/db/transaction_validation.cpp1
-rw-r--r--src/mongo/shell/replsettest.js24
8 files changed, 215 insertions, 34 deletions
diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp
index c4323d6a36f..507dbe78f82 100644
--- a/src/mongo/db/commands.cpp
+++ b/src/mongo/db/commands.cpp
@@ -129,10 +129,6 @@ const StringMap<int> txnCmdWhitelist = {{"abortTransaction", 1},
{"voteAbortTransaction", 1},
{"voteCommitTransaction", 1}};
-// The command names that are allowed in a multi-document transaction only when test commands are
-// enabled.
-const StringMap<int> txnCmdForTestingWhitelist = {{"dbHash", 1}};
-
// The commands that can be run on the 'admin' database in multi-document transactions.
const StringMap<int> txnAdminCommands = {{"abortTransaction", 1},
@@ -451,9 +447,7 @@ Status CommandHelpers::canUseTransactions(StringData dbName, StringData cmdName)
"http://dochub.mongodb.org/core/transaction-count for a recommended alternative."};
}
- if (txnCmdWhitelist.find(cmdName) == txnCmdWhitelist.cend() &&
- !(getTestCommandsEnabled() &&
- txnCmdForTestingWhitelist.find(cmdName) != txnCmdForTestingWhitelist.cend())) {
+ if (txnCmdWhitelist.find(cmdName) == txnCmdWhitelist.cend()) {
return {ErrorCodes::OperationNotSupportedInTransaction,
str::stream() << "Cannot run '" << cmdName << "' in a multi-document transaction."};
}
diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp
index 669f707111a..0ed79280f17 100644
--- a/src/mongo/db/commands/dbhash.cpp
+++ b/src/mongo/db/commands/dbhash.cpp
@@ -43,10 +43,14 @@
#include "mongo/db/catalog/database_catalog_entry.h"
#include "mongo/db/catalog/index_catalog.h"
#include "mongo/db/commands.h"
+#include "mongo/db/commands/test_commands_enabled.h"
#include "mongo/db/db_raii.h"
#include "mongo/db/exec/working_set_common.h"
+#include "mongo/db/logical_clock.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/query/internal_plans.h"
+#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/db/storage/storage_engine.h"
#include "mongo/db/transaction_participant.h"
#include "mongo/stdx/mutex.h"
#include "mongo/util/log.h"
@@ -70,13 +74,6 @@ public:
return ReadWriteType::kRead;
}
- bool supportsReadConcern(const std::string& dbName,
- const BSONObj& cmdObj,
- repl::ReadConcernLevel level) const override {
- return level == repl::ReadConcernLevel::kLocalReadConcern ||
- level == repl::ReadConcernLevel::kSnapshotReadConcern;
- }
-
AllowedOnSecondary secondaryAllowed(ServiceContext*) const override {
return AllowedOnSecondary::kAlways;
}
@@ -114,14 +111,80 @@ public:
str::stream() << "Invalid db name: " << ns,
NamespaceString::validDBName(ns, NamespaceString::DollarInDbNameBehavior::Allow));
+ if (auto elem = cmdObj["$_internalReadAtClusterTime"]) {
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when testing"
+ " commands are enabled",
+ getTestCommandsEnabled());
+
+ auto* replCoord = repl::ReplicationCoordinator::get(opCtx);
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when replication is"
+ " enabled",
+ replCoord->isReplEnabled());
+
+ auto* storageEngine = opCtx->getServiceContext()->getStorageEngine();
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported by storage engines"
+ " that support document-level concurrency",
+ storageEngine->supportsDocLocking());
+
+ uassert(ErrorCodes::TypeMismatch,
+ "The '$_internalReadAtClusterTime' option must be a Timestamp",
+ elem.type() == BSONType::bsonTimestamp);
+
+ auto targetClusterTime = elem.timestamp();
+
+ // We aren't holding the global lock in intent mode, so it is possible after comparing
+ // 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime to go
+ // backwards or for the term to change due to replication rollback. This isn't an actual
+ // concern because the testing infrastructure won't use the $_internalReadAtClusterTime
+ // option in any test suite where rollback is expected to occur.
+ auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the last applied opTime. Requested clusterTime: "
+ << targetClusterTime.toString()
+ << "; last applied opTime: "
+ << lastAppliedOpTime.toString(),
+ lastAppliedOpTime.getTimestamp() >= targetClusterTime);
+
+ // We aren't holding the global lock in intent mode, so it is possible for the global
+ // storage engine to have been destructed already as a result of the server shutting
+ // down. This isn't an actual concern because the testing infrastructure won't use the
+ // $_internalReadAtClusterTime option in any test suite where clean shutdown is expected
+ // to occur concurrently with tests running.
+ auto allCommittedTime = storageEngine->getAllCommittedTimestamp();
+ invariant(!allCommittedTime.isNull());
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the all-committed timestamp. Requested clusterTime: "
+ << targetClusterTime.toString()
+ << "; all-committed timestamp: "
+ << allCommittedTime.toString(),
+ allCommittedTime >= targetClusterTime);
+
+ // The $_internalReadAtClusterTime option causes any storage-layer cursors created
+ // during plan execution to read from a consistent snapshot of data at the supplied
+ // clusterTime, even across yields.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
+ targetClusterTime);
+
+ // The $_internalReadAtClusterTime option also causes any storage-layer cursors created
+ // during plan execution to block on prepared transactions.
+ opCtx->recoveryUnit()->setIgnorePrepared(false);
+ }
+
// We lock the entire database in S-mode in order to ensure that the contents will not
// change for the snapshot.
auto lockMode = LockMode::MODE_S;
- auto txnParticipant = TransactionParticipant::get(opCtx);
- if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) {
- // However, if we are inside a multi-statement transaction, then we only need to lock
- // the database in intent mode to ensure that none of the collections get dropped.
- lockMode = getLockModeForQuery(opCtx, boost::none);
+ if (opCtx->recoveryUnit()->getTimestampReadSource() ==
+ RecoveryUnit::ReadSource::kProvided) {
+ // However, if we are performing a read at a timestamp, then we only need to lock the
+ // database in intent mode to ensure that none of the collections get dropped.
+ lockMode = LockMode::MODE_IS;
}
AutoGetDb autoDb(opCtx, ns, lockMode);
Database* db = autoDb.getDb();
@@ -220,16 +283,14 @@ private:
return "";
boost::optional<Lock::CollectionLock> collLock;
- auto txnParticipant = TransactionParticipant::get(opCtx);
- if (txnParticipant && txnParticipant->inMultiDocumentTransaction()) {
- // When inside a multi-statement transaction, we are only holding the database lock in
+ if (opCtx->recoveryUnit()->getTimestampReadSource() ==
+ RecoveryUnit::ReadSource::kProvided) {
+ // When performing a read at a timestamp, we are only holding the database lock in
// intent mode. We need to also acquire the collection lock in intent mode to ensure
// reading from the consistent snapshot doesn't overlap with any catalog operations on
// the collection.
- invariant(
- opCtx->lockState()->isDbLockedForMode(db->name(), getLockModeForQuery(opCtx, ns)));
- collLock.emplace(
- opCtx->lockState(), fullCollectionName, getLockModeForQuery(opCtx, ns));
+ invariant(opCtx->lockState()->isDbLockedForMode(db->name(), MODE_IS));
+ collLock.emplace(opCtx->lockState(), fullCollectionName, MODE_IS);
auto minSnapshot = collection->getMinimumVisibleSnapshot();
auto mySnapshot = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
diff --git a/src/mongo/db/commands/find_cmd.cpp b/src/mongo/db/commands/find_cmd.cpp
index 0fde452ab39..a13d09d2fb7 100644
--- a/src/mongo/db/commands/find_cmd.cpp
+++ b/src/mongo/db/commands/find_cmd.cpp
@@ -37,6 +37,7 @@
#include "mongo/db/clientcursor.h"
#include "mongo/db/commands.h"
#include "mongo/db/commands/run_aggregate.h"
+#include "mongo/db/commands/test_commands_enabled.h"
#include "mongo/db/curop_failpoint_helpers.h"
#include "mongo/db/cursor_manager.h"
#include "mongo/db/db_raii.h"
@@ -53,6 +54,7 @@
#include "mongo/db/service_context.h"
#include "mongo/db/stats/counters.h"
#include "mongo/db/stats/server_read_concern_metrics.h"
+#include "mongo/db/storage/storage_engine.h"
#include "mongo/db/transaction_participant.h"
#include "mongo/rpc/get_status_from_command_result.h"
#include "mongo/util/log.h"
@@ -262,12 +264,76 @@ public:
!txnParticipant->inActiveOrKilledMultiDocumentTransaction() ||
!qr->isReadOnce());
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when testing"
+ " commands are enabled",
+ !qr->getReadAtClusterTime() || getTestCommandsEnabled());
+
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported when replication is"
+ " enabled",
+ !qr->getReadAtClusterTime() || replCoord->isReplEnabled());
+
+ auto* storageEngine = opCtx->getServiceContext()->getStorageEngine();
+ uassert(ErrorCodes::InvalidOptions,
+ "The '$_internalReadAtClusterTime' option is only supported by storage engines"
+ " that support document-level concurrency",
+ !qr->getReadAtClusterTime() || storageEngine->supportsDocLocking());
+
// Validate term before acquiring locks, if provided.
if (auto term = qr->getReplicationTerm()) {
// Note: updateTerm returns ok if term stayed the same.
uassertStatusOK(replCoord->updateTerm(opCtx, *term));
}
+ // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the
+ // collection via AutoGetCollectionForRead in order to ensure the comparison to the
+ // collection's minimum visible snapshot is accurate.
+ if (auto targetClusterTime = qr->getReadAtClusterTime()) {
+ // We aren't holding the global lock in intent mode, so it is possible after
+ // comparing 'targetClusterTime' to 'lastAppliedOpTime' for the last applied opTime
+ // to go backwards or for the term to change due to replication rollback. This isn't
+ // an actual concern because the testing infrastructure won't use the
+ // $_internalReadAtClusterTime option in any test suite where rollback is expected
+ // to occur.
+ auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the last applied opTime. Requested clusterTime: "
+ << targetClusterTime->toString()
+ << "; last applied opTime: "
+ << lastAppliedOpTime.toString(),
+ lastAppliedOpTime.getTimestamp() >= targetClusterTime);
+
+ // We aren't holding the global lock in intent mode, so it is possible for the
+ // global storage engine to have been destructed already as a result of the server
+ // shutting down. This isn't an actual concern because the testing infrastructure
+ // won't use the $_internalReadAtClusterTime option in any test suite where clean
+ // shutdown is expected to occur concurrently with tests running.
+ auto allCommittedTime = storageEngine->getAllCommittedTimestamp();
+ invariant(!allCommittedTime.isNull());
+
+ uassert(ErrorCodes::InvalidOptions,
+ str::stream() << "$_internalReadAtClusterTime value must not be greater"
+ " than the all-committed timestamp. Requested"
+ " clusterTime: "
+ << targetClusterTime->toString()
+ << "; all-committed timestamp: "
+ << allCommittedTime.toString(),
+ allCommittedTime >= targetClusterTime);
+
+ // The $_internalReadAtClusterTime option causes any storage-layer cursors created
+ // during plan execution to read from a consistent snapshot of data at the supplied
+ // clusterTime, even across yields.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided,
+ targetClusterTime);
+
+ // The $_internalReadAtClusterTime option also causes any storage-layer cursors
+ // created during plan execution to block on prepared transactions.
+ opCtx->recoveryUnit()->setIgnorePrepared(false);
+ }
+
// Acquire locks. If the query is on a view, we release our locks and convert the query
// request into an aggregation command.
boost::optional<AutoGetCollectionForReadCommand> ctx;
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 350383d706f..3c6f5c5ad5b 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -307,6 +307,32 @@ public:
auto cursorManager = CursorManager::get(opCtx);
auto cursorPin = uassertStatusOK(cursorManager->pinCursor(opCtx, _request.cursorid));
+ {
+ // We call RecoveryUnit::setTimestampReadSource() before acquiring a lock on the
+ // collection via AutoGetCollectionForRead in order to ensure the comparison to the
+ // collection's minimum visible snapshot is accurate.
+ PlanExecutor* exec = cursorPin->getExecutor();
+ const auto* cq = exec->getCanonicalQuery();
+
+ if (auto clusterTime =
+ (cq ? cq->getQueryRequest().getReadAtClusterTime() : boost::none)) {
+ // We don't compare 'clusterTime' to the last applied opTime or to the
+ // all-committed timestamp because the testing infrastructure won't use the
+ // $_internalReadAtClusterTime option in any test suite where rollback is
+ // expected to occur.
+
+ // The $_internalReadAtClusterTime option causes any storage-layer cursors
+ // created during plan execution to read from a consistent snapshot of data at
+ // the supplied clusterTime, even across yields.
+ opCtx->recoveryUnit()->setTimestampReadSource(
+ RecoveryUnit::ReadSource::kProvided, clusterTime);
+
+ // The $_internalReadAtClusterTime option also causes any storage-layer cursors
+ // created during plan execution to block on prepared transactions.
+ opCtx->recoveryUnit()->setIgnorePrepared(false);
+ }
+ }
+
if (cursorPin->lockPolicy() == ClientCursorParams::LockPolicy::kLocksInternally) {
if (!_request.nss.isCollectionlessCursorNamespace()) {
const boost::optional<int> dbProfilingLevel = boost::none;
diff --git a/src/mongo/db/query/query_request.cpp b/src/mongo/db/query/query_request.cpp
index 7f02487ff7f..42c338bb5ec 100644
--- a/src/mongo/db/query/query_request.cpp
+++ b/src/mongo/db/query/query_request.cpp
@@ -103,6 +103,7 @@ const char kTermField[] = "term";
const char kOptionsField[] = "options";
const char kReadOnceField[] = "readOnce";
const char kAllowSpeculativeMajorityReadField[] = "allowSpeculativeMajorityRead";
+const char kInternalReadAtClusterTimeField[] = "$_internalReadAtClusterTime";
// Field names for sorting options.
const char kNaturalSortField[] = "$natural";
@@ -381,6 +382,12 @@ StatusWith<unique_ptr<QueryRequest>> QueryRequest::parseFromFindCommand(unique_p
return status;
}
qr->_allowSpeculativeMajorityRead = el.boolean();
+ } else if (fieldName == kInternalReadAtClusterTimeField) {
+ Status status = checkFieldType(el, BSONType::bsonTimestamp);
+ if (!status.isOK()) {
+ return status;
+ }
+ qr->_internalReadAtClusterTime = el.timestamp();
} else if (!isGenericArgument(fieldName)) {
return Status(ErrorCodes::FailedToParse,
str::stream() << "Failed to parse: " << cmdObj.toString() << ". "
@@ -549,6 +556,10 @@ void QueryRequest::asFindCommandInternal(BSONObjBuilder* cmdBuilder) const {
if (_allowSpeculativeMajorityRead) {
cmdBuilder->append(kAllowSpeculativeMajorityReadField, true);
}
+
+ if (_internalReadAtClusterTime) {
+ cmdBuilder->append(kInternalReadAtClusterTimeField, *_internalReadAtClusterTime);
+ }
}
void QueryRequest::addReturnKeyMetaProj() {
@@ -1044,6 +1055,12 @@ StatusWith<BSONObj> QueryRequest::asAggregationCommand() const {
<< " not supported in aggregation."};
}
+ if (_internalReadAtClusterTime) {
+ return {ErrorCodes::InvalidPipelineOperator,
+ str::stream() << "Option " << kInternalReadAtClusterTimeField
+ << " not supported in aggregation."};
+ }
+
// Now that we've successfully validated this QR, begin building the aggregation command.
aggregationBuilder.append("aggregate", _nss.coll());
diff --git a/src/mongo/db/query/query_request.h b/src/mongo/db/query/query_request.h
index 1d23544f7cd..8028ba452c0 100644
--- a/src/mongo/db/query/query_request.h
+++ b/src/mongo/db/query/query_request.h
@@ -394,6 +394,10 @@ public:
return _allowSpeculativeMajorityRead;
}
+ boost::optional<Timestamp> getReadAtClusterTime() const {
+ return _internalReadAtClusterTime;
+ }
+
/**
* Return options as a bit vector.
*/
@@ -522,6 +526,10 @@ private:
bool _allowSpeculativeMajorityRead = false;
boost::optional<long long> _replicationTerm;
+
+ // The Timestamp that RecoveryUnit::setTimestampReadSource() should be called with. The optional
+ // should only ever be engaged when testing commands are enabled.
+ boost::optional<Timestamp> _internalReadAtClusterTime;
};
} // namespace mongo
diff --git a/src/mongo/db/transaction_validation.cpp b/src/mongo/db/transaction_validation.cpp
index b0e95e61b12..19ae87ccca6 100644
--- a/src/mongo/db/transaction_validation.cpp
+++ b/src/mongo/db/transaction_validation.cpp
@@ -50,7 +50,6 @@ const StringMap<int> sessionCheckOutList = {{"abortTransaction", 1},
{"applyOps", 1},
{"commitTransaction", 1},
{"count", 1},
- {"dbHash", 1},
{"delete", 1},
{"distinct", 1},
{"doTxn", 1},
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index ea8766968e6..9efa8ac7f2e 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -1494,11 +1494,16 @@ var ReplSetTest = function(opts) {
this.getHashesUsingSessions = function(sessions, dbName, {
filterCapped: filterCapped = true,
- filterMapReduce: filterMapReduce = true,
+ filterMapReduce: filterMapReduce = true, readAtClusterTime,
} = {}) {
return sessions.map(session => {
+ const commandObj = {dbHash: 1};
+ if (readAtClusterTime !== undefined) {
+ commandObj.$_internalReadAtClusterTime = readAtClusterTime;
+ }
+
const db = session.getDatabase(dbName);
- const res = assert.commandWorked(db.runCommand({dbHash: 1}));
+ const res = assert.commandWorked(db.runCommand(commandObj));
// The "capped" field in the dbHash command response is new as of MongoDB 4.0.
const cappedCollections = new Set(filterCapped ? res.capped : []);
@@ -1528,7 +1533,7 @@ var ReplSetTest = function(opts) {
};
this.getCollectionDiffUsingSessions = function(
- primarySession, secondarySession, dbName, collNameOrUUID) {
+ primarySession, secondarySession, dbName, collNameOrUUID, readAtClusterTime) {
function PeekableCursor(cursor) {
let _stashedDoc;
@@ -1557,11 +1562,16 @@ var ReplSetTest = function(opts) {
const primaryDB = primarySession.getDatabase(dbName);
const secondaryDB = secondarySession.getDatabase(dbName);
- const primaryCursor = new PeekableCursor(new DBCommandCursor(
- primaryDB, primaryDB.runCommand({find: collNameOrUUID, sort: {_id: 1}})));
+ const commandObj = {find: collNameOrUUID, sort: {_id: 1}};
+ if (readAtClusterTime !== undefined) {
+ commandObj.$_internalReadAtClusterTime = readAtClusterTime;
+ }
+
+ const primaryCursor =
+ new PeekableCursor(new DBCommandCursor(primaryDB, primaryDB.runCommand(commandObj)));
- const secondaryCursor = new PeekableCursor(new DBCommandCursor(
- secondaryDB, secondaryDB.runCommand({find: collNameOrUUID, sort: {_id: 1}})));
+ const secondaryCursor = new PeekableCursor(
+ new DBCommandCursor(secondaryDB, secondaryDB.runCommand(commandObj)));
while (primaryCursor.hasNext() && secondaryCursor.hasNext()) {
const primaryDoc = primaryCursor.peekNext();