Diffstat (limited to 'src/mongo/db/commands/dbcommands.cpp')
 src/mongo/db/commands/dbcommands.cpp | 183 ++++++++++++++++++++++++++++++++---
 1 file changed, 173 insertions(+), 10 deletions(-)
diff --git a/src/mongo/db/commands/dbcommands.cpp b/src/mongo/db/commands/dbcommands.cpp
index ab85d4c9470..648c1fb2447 100644
--- a/src/mongo/db/commands/dbcommands.cpp
+++ b/src/mongo/db/commands/dbcommands.cpp
@@ -84,7 +84,6 @@
#include "mongo/db/query/get_executor.h"
#include "mongo/db/query/internal_plans.h"
#include "mongo/db/query/query_planner.h"
-#include "mongo/db/read_concern.h"
#include "mongo/db/repair_database.h"
#include "mongo/db/repl/optime.h"
#include "mongo/db/repl/read_concern_args.h"
@@ -93,6 +92,7 @@
#include "mongo/db/repl/replication_coordinator_global.h"
#include "mongo/db/s/operation_sharding_state.h"
#include "mongo/db/s/sharding_state.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/db/stats/storage_stats.h"
#include "mongo/db/write_concern.h"
#include "mongo/rpc/metadata.h"
@@ -123,6 +123,15 @@ using std::stringstream;
using std::unique_ptr;
namespace {
+
+// This is a special flag that allows for testing of snapshot behavior by skipping
+// replication-related checks and isolating the storage/query side of snapshotting.
+bool testingSnapshotBehaviorInIsolation = false;
+ExportedServerParameter<bool, ServerParameterType::kStartupOnly> TestingSnapshotBehaviorInIsolation(
+ ServerParameterSet::getGlobal(),
+ "testingSnapshotBehaviorInIsolation",
+ &testingSnapshotBehaviorInIsolation);
+
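Because this parameter is registered as kStartupOnly, it cannot be changed at runtime; a test run would enable it at process startup, for example (an illustrative invocation, not part of this diff):

    mongod --setParameter testingSnapshotBehaviorInIsolation=true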
void registerErrorImpl(OperationContext* opCtx, const DBException& exception) {
CurOp::get(opCtx)->debug().exceptionInfo = exception.getInfo();
}
@@ -135,7 +144,7 @@ MONGO_INITIALIZER(InitializeRegisterErrorHandler)(InitializerContext* const) {
 * For replica set members it returns the last known op time from opCtx. Otherwise it
 * returns an uninitialized logical time.
*/
-LogicalTime _getClientOperationTime(OperationContext* opCtx) {
+LogicalTime getClientOperationTime(OperationContext* opCtx) {
repl::ReplicationCoordinator* replCoord =
repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
const bool isReplSet =
@@ -156,9 +165,9 @@ LogicalTime _getClientOperationTime(OperationContext* opCtx) {
*
* TODO: SERVER-28419 Do not compute operationTime if replica set does not propagate clusterTime.
*/
-LogicalTime _computeOperationTime(OperationContext* opCtx,
- LogicalTime startOperationTime,
- repl::ReadConcernLevel level) {
+LogicalTime computeOperationTime(OperationContext* opCtx,
+ LogicalTime startOperationTime,
+ repl::ReadConcernLevel level) {
repl::ReplicationCoordinator* replCoord =
repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
const bool isReplSet =
@@ -168,7 +177,7 @@ LogicalTime _computeOperationTime(OperationContext* opCtx,
return LogicalTime();
}
- auto operationTime = _getClientOperationTime(opCtx);
+ auto operationTime = getClientOperationTime(opCtx);
invariant(operationTime >= startOperationTime);
// If the last operationTime has not changed, consider this command a read, and, for replica set
@@ -183,6 +192,160 @@ LogicalTime _computeOperationTime(OperationContext* opCtx,
return operationTime;
}
+
+Status makeNoopWriteIfNeeded(OperationContext* opCtx, LogicalTime clusterTime) {
+ repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
+ auto lastAppliedTime = LogicalTime(replCoord->getMyLastAppliedOpTime().getTimestamp());
+ if (clusterTime > lastAppliedTime) {
+ auto shardingState = ShardingState::get(opCtx);
+ // This is a standalone (non-sharded) replica set, so there is no need to advance
+ // the oplog on the primary.
+ if (!shardingState->enabled()) {
+ return Status::OK();
+ }
+
+ auto myShard =
+ Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardingState->getShardName());
+ if (!myShard.isOK()) {
+ return myShard.getStatus();
+ }
+
+ auto swRes = myShard.getValue()->runCommand(
+ opCtx,
+ ReadPreferenceSetting(ReadPreference::PrimaryOnly),
+ "admin",
+ BSON("applyOpLogNote" << 1 << "clusterTime" << clusterTime.asTimestamp() << "data"
+ << BSON("append noop write" << 1)),
+ Shard::RetryPolicy::kIdempotent);
+ return swRes.getStatus();
+ }
+ return Status::OK();
+}
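For reference, the no-op write built above arrives at the shard's primary as a plain admin command. Reconstructed from the BSON builder calls, its shape is roughly (a sketch, with the timestamp standing in for the requested afterClusterTime):

    // { appendOplogNote: 1,
    //   clusterTime: <Timestamp from the afterClusterTime argument>,
    //   data: { "append noop write": 1 } }

Appending the oplog note advances the primary's lastApplied optime past the requested clusterTime, which is what allows the waiting reader to make progress.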
+
+Status waitForReadConcern(OperationContext* opCtx, const repl::ReadConcernArgs& readConcernArgs) {
+ repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
+
+ if (readConcernArgs.getLevel() == repl::ReadConcernLevel::kLinearizableReadConcern) {
+ if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet) {
+ // For master/slave and standalone nodes, Linearizable Read is not supported.
+ return {ErrorCodes::NotAReplicaSet,
+ "node needs to be a replica set member to use read concern"};
+ }
+
+ // Replica sets running pv0 do not support linearizable read concern until further testing
+ // is completed (SERVER-27025).
+ if (!replCoord->isV1ElectionProtocol()) {
+ return {
+ ErrorCodes::IncompatibleElectionProtocol,
+ "Replica sets running protocol version 0 do not support readConcern: linearizable"};
+ }
+
+ if (readConcernArgs.getArgsOpTime()) {
+ return {ErrorCodes::FailedToParse,
+ "afterOpTime not compatible with linearizable read concern"};
+ }
+
+ if (!replCoord->getMemberState().primary()) {
+ return {ErrorCodes::NotMaster,
+ "cannot satisfy linearizable read concern on non-primary node"};
+ }
+ }
+
+ auto afterClusterTime = readConcernArgs.getArgsClusterTime();
+ if (afterClusterTime) {
+ auto currentTime = LogicalClock::get(opCtx)->getClusterTime().getTime();
+ if (currentTime < *afterClusterTime) {
+ return {ErrorCodes::InvalidOptions,
+ "readConcern afterClusterTime must not be greater than clusterTime value"};
+ }
+ }
+
+ // Skip waiting for the OpTime when testing snapshot behavior
+ if (!testingSnapshotBehaviorInIsolation && !readConcernArgs.isEmpty()) {
+ if (afterClusterTime) {
+ auto status = makeNoopWriteIfNeeded(opCtx, *afterClusterTime);
+ if (!status.isOK()) {
+ LOG(1) << "failed noop write due to " << status.toString();
+ }
+ }
+
+ auto status = replCoord->waitUntilOpTimeForRead(opCtx, readConcernArgs);
+ if (!status.isOK()) {
+ return status;
+ }
+ }
+
+ if ((replCoord->getReplicationMode() == repl::ReplicationCoordinator::Mode::modeReplSet ||
+ testingSnapshotBehaviorInIsolation) &&
+ readConcernArgs.getLevel() == repl::ReadConcernLevel::kMajorityReadConcern) {
+ // ReadConcern Majority is not supported in ProtocolVersion 0.
+ if (!testingSnapshotBehaviorInIsolation && !replCoord->isV1ElectionProtocol()) {
+ return {ErrorCodes::ReadConcernMajorityNotEnabled,
+ str::stream() << "Replica sets running protocol version 0 do not support "
+ "readConcern: majority"};
+ }
+
+ const int debugLevel = serverGlobalParams.clusterRole == ClusterRole::ConfigServer ? 1 : 2;
+
+ LOG(debugLevel) << "Waiting for 'committed' snapshot to be available for reading: "
+ << readConcernArgs;
+
+ Status status = opCtx->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
+
+ // Wait until a snapshot is available.
+ while (status == ErrorCodes::ReadConcernMajorityNotAvailableYet) {
+ LOG(debugLevel) << "Snapshot not available yet.";
+ replCoord->waitUntilSnapshotCommitted(opCtx, SnapshotName::min());
+ status = opCtx->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
+ }
+
+ if (!status.isOK()) {
+ return status;
+ }
+
+ LOG(debugLevel) << "Using 'committed' snapshot: " << CurOp::get(opCtx)->query();
+ }
+
+ return Status::OK();
+}
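A request that exercises this function end to end might look like the following client-side command (a minimal sketch; the collection name and timestamp literal are illustrative, not from this commit):

    // Causally consistent majority read. The server parses the readConcern
    // subdocument into repl::ReadConcernArgs, then waitForReadConcern() may
    // trigger the no-op write, wait for the requested optime, and finally
    // block until a majority-committed snapshot is available.
    const BSONObj findCmd =
        BSON("find" << "coll"
                    << "readConcern"
                    << BSON("level" << "majority"
                                    << "afterClusterTime" << Timestamp(1500000000, 1)));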
+
+Status waitForLinearizableReadConcern(OperationContext* opCtx) {
+
+ repl::ReplicationCoordinator* replCoord =
+ repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
+
+ {
+ Lock::DBLock lk(opCtx, "local", MODE_IX);
+ Lock::CollectionLock lock(opCtx->lockState(), "local.oplog.rs", MODE_IX);
+
+ if (!replCoord->canAcceptWritesForDatabase(opCtx, "admin")) {
+ return {ErrorCodes::NotMaster,
+ "No longer primary when waiting for linearizable read concern"};
+ }
+
+ MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
+
+ WriteUnitOfWork uow(opCtx);
+ opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
+ opCtx,
+ BSON("msg"
+ << "linearizable read"));
+ uow.commit();
+ }
+ MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
+ opCtx, "waitForLinearizableReadConcern", "local.rs.oplog");
+ }
+ WriteConcernOptions wc = WriteConcernOptions(
+ WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);
+
+ repl::OpTime lastOpApplied = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
+ auto awaitReplResult = replCoord->awaitReplication(opCtx, lastOpApplied, wc);
+ if (awaitReplResult.status == ErrorCodes::WriteConcernFailed) {
+ return Status(ErrorCodes::LinearizableReadConcernError,
+ "Failed to confirm that read was linearizable.");
+ }
+ return awaitReplResult.status;
+}
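The function above uses the standard technique for linearizable reads: write a no-op into the oplog while holding the collection lock, then wait for that write to replicate to a majority; success proves this node was still the primary at the time of the read. A request taking this path would look roughly like (an illustrative shape; the namespace is hypothetical):

    // Linearizable reads must run on the primary and cannot be combined with
    // afterOpTime; both constraints are enforced in waitForReadConcern() above.
    const BSONObj findCmd =
        BSON("find" << "coll" << "readConcern" << BSON("level" << "linearizable"));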
+
} // namespace
@@ -1435,7 +1598,7 @@ bool Command::run(OperationContext* opCtx,
std::string errmsg;
bool result;
- auto startOperationTime = _getClientOperationTime(opCtx);
+ auto startOperationTime = getClientOperationTime(opCtx);
if (!supportsWriteConcern(cmd)) {
if (commandSpecifiesWriteConcern(cmd)) {
auto result = appendCommandStatus(
@@ -1505,7 +1668,7 @@ bool Command::run(OperationContext* opCtx,
appendCommandStatus(inPlaceReplyBob, result, errmsg);
- auto operationTime = _computeOperationTime(
+ auto operationTime = computeOperationTime(
opCtx, startOperationTime, readConcernArgsStatus.getValue().getLevel());
// An uninitialized operation time means the cluster time is not propagated, so the operation
@@ -1710,13 +1873,13 @@ void mongo::execCommandDatabase(OperationContext* opCtx,
BSONObjBuilder metadataBob;
appendReplyMetadata(opCtx, request, &metadataBob);
- // Ideally this should be using _computeOperationTime, but with the code
+ // Ideally this should be using computeOperationTime, but with the code
// structured as it currently is we don't know the startOperationTime or
// readConcern at this point. Using the cluster time instead of the actual
// operation time is correct, but can result in extra waiting on subsequent
// afterClusterTime reads.
//
- // TODO: SERVER-28445 change this to use _computeOperationTime once the exception handling
+ // TODO: SERVER-28445 change this to use computeOperationTime once the exception handling
// path is moved into Command::run()
auto operationTime = LogicalClock::get(opCtx)->getClusterTime().getTime();
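For context, the operationTime chosen here (and by computeOperationTime on the normal path) is appended to the command reply, so a driver ultimately sees a response whose top level resembles the following (an illustrative shape; the timestamp value is invented):

    // { ok: 1,
    //   operationTime: Timestamp(1500000000, 5) }

Drivers can then send that value back as afterClusterTime on a subsequent read, which is exactly the case makeNoopWriteIfNeeded() and waitForReadConcern() handle.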