Diffstat (limited to 'src/mongo/db/commands/dbcommands.cpp')
-rw-r--r--  src/mongo/db/commands/dbcommands.cpp | 183
1 file changed, 173 insertions(+), 10 deletions(-)
diff --git a/src/mongo/db/commands/dbcommands.cpp b/src/mongo/db/commands/dbcommands.cpp
index ab85d4c9470..648c1fb2447 100644
--- a/src/mongo/db/commands/dbcommands.cpp
+++ b/src/mongo/db/commands/dbcommands.cpp
@@ -84,7 +84,6 @@
 #include "mongo/db/query/get_executor.h"
 #include "mongo/db/query/internal_plans.h"
 #include "mongo/db/query/query_planner.h"
-#include "mongo/db/read_concern.h"
 #include "mongo/db/repair_database.h"
 #include "mongo/db/repl/optime.h"
 #include "mongo/db/repl/read_concern_args.h"
@@ -93,6 +92,7 @@
 #include "mongo/db/repl/replication_coordinator_global.h"
 #include "mongo/db/s/operation_sharding_state.h"
 #include "mongo/db/s/sharding_state.h"
+#include "mongo/db/server_parameters.h"
 #include "mongo/db/stats/storage_stats.h"
 #include "mongo/db/write_concern.h"
 #include "mongo/rpc/metadata.h"
@@ -123,6 +123,15 @@ using std::stringstream;
 using std::unique_ptr;
 
 namespace {
+
+// This is a special flag that allows for testing of snapshot behavior by skipping the replication
+// related checks and isolating the storage/query side of snapshotting.
+bool testingSnapshotBehaviorInIsolation = false;
+ExportedServerParameter<bool, ServerParameterType::kStartupOnly> TestingSnapshotBehaviorInIsolation(
+    ServerParameterSet::getGlobal(),
+    "testingSnapshotBehaviorInIsolation",
+    &testingSnapshotBehaviorInIsolation);
+
 void registerErrorImpl(OperationContext* opCtx, const DBException& exception) {
     CurOp::get(opCtx)->debug().exceptionInfo = exception.getInfo();
 }
@@ -135,7 +144,7 @@ MONGO_INITIALIZER(InitializeRegisterErrorHandler)(InitializerContext* const) {
  * For replica set members it returns the last known op time from opCtx. Otherwise will return
  * uninitialized logical time.
  */
-LogicalTime _getClientOperationTime(OperationContext* opCtx) {
+LogicalTime getClientOperationTime(OperationContext* opCtx) {
     repl::ReplicationCoordinator* replCoord =
         repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
     const bool isReplSet =
@@ -156,9 +165,9 @@ LogicalTime _getClientOperationTime(OperationContext* opCtx) {
  *
  * TODO: SERVER-28419 Do not compute operationTime if replica set does not propagate clusterTime.
  */
-LogicalTime _computeOperationTime(OperationContext* opCtx,
-                                  LogicalTime startOperationTime,
-                                  repl::ReadConcernLevel level) {
+LogicalTime computeOperationTime(OperationContext* opCtx,
+                                 LogicalTime startOperationTime,
+                                 repl::ReadConcernLevel level) {
     repl::ReplicationCoordinator* replCoord =
         repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
     const bool isReplSet =
@@ -168,7 +177,7 @@ LogicalTime _computeOperationTime(OperationContext* opCtx,
         return LogicalTime();
     }
 
-    auto operationTime = _getClientOperationTime(opCtx);
+    auto operationTime = getClientOperationTime(opCtx);
     invariant(operationTime >= startOperationTime);
 
     // If the last operationTime has not changed, consider this command a read, and, for replica set
@@ -183,6 +192,160 @@ LogicalTime _computeOperationTime(OperationContext* opCtx,
     return operationTime;
 }
+
+Status makeNoopWriteIfNeeded(OperationContext* opCtx, LogicalTime clusterTime) {
+    repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
+    auto lastAppliedTime = LogicalTime(replCoord->getMyLastAppliedOpTime().getTimestamp());
+    if (clusterTime > lastAppliedTime) {
+        auto shardingState = ShardingState::get(opCtx);
+        // standalone replica set, so there is no need to advance the OpLog on the primary.
+        if (!shardingState->enabled()) {
+            return Status::OK();
+        }
+
+        auto myShard =
+            Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardingState->getShardName());
+        if (!myShard.isOK()) {
+            return myShard.getStatus();
+        }
+
+        auto swRes = myShard.getValue()->runCommand(
+            opCtx,
+            ReadPreferenceSetting(ReadPreference::PrimaryOnly),
+            "admin",
+            BSON("applyOpLogNote" << 1 << "clusterTime" << clusterTime.asTimestamp() << "data"
+                                  << BSON("append noop write" << 1)),
+            Shard::RetryPolicy::kIdempotent);
+        return swRes.getStatus();
+    }
+    return Status::OK();
+}
+
+Status waitForReadConcern(OperationContext* opCtx, const repl::ReadConcernArgs& readConcernArgs) {
+    repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
+
+    if (readConcernArgs.getLevel() == repl::ReadConcernLevel::kLinearizableReadConcern) {
+        if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet) {
+            // For master/slave and standalone nodes, Linearizable Read is not supported.
+            return {ErrorCodes::NotAReplicaSet,
+                    "node needs to be a replica set member to use read concern"};
+        }
+
+        // Replica sets running pv0 do not support linearizable read concern until further testing
+        // is completed (SERVER-27025).
+        if (!replCoord->isV1ElectionProtocol()) {
+            return {
+                ErrorCodes::IncompatibleElectionProtocol,
+                "Replica sets running protocol version 0 do not support readConcern: linearizable"};
+        }
+
+        if (readConcernArgs.getArgsOpTime()) {
+            return {ErrorCodes::FailedToParse,
+                    "afterOpTime not compatible with linearizable read concern"};
+        }
+
+        if (!replCoord->getMemberState().primary()) {
+            return {ErrorCodes::NotMaster,
+                    "cannot satisfy linearizable read concern on non-primary node"};
+        }
+    }
+
+    auto afterClusterTime = readConcernArgs.getArgsClusterTime();
+    if (afterClusterTime) {
+        auto currentTime = LogicalClock::get(opCtx)->getClusterTime().getTime();
+        if (currentTime < *afterClusterTime) {
+            return {ErrorCodes::InvalidOptions,
+                    "readConcern afterClusterTime must not be greater than clusterTime value"};
+        }
+    }
+
+    // Skip waiting for the OpTime when testing snapshot behavior
+    if (!testingSnapshotBehaviorInIsolation && !readConcernArgs.isEmpty()) {
+        if (afterClusterTime) {
+            auto status = makeNoopWriteIfNeeded(opCtx, *afterClusterTime);
+            if (!status.isOK()) {
+                LOG(1) << "failed noop write due to " << status.toString();
+            }
+        }
+
+        auto status = replCoord->waitUntilOpTimeForRead(opCtx, readConcernArgs);
+        if (!status.isOK()) {
+            return status;
+        }
+    }
+
+    if ((replCoord->getReplicationMode() == repl::ReplicationCoordinator::Mode::modeReplSet ||
+         testingSnapshotBehaviorInIsolation) &&
+        readConcernArgs.getLevel() == repl::ReadConcernLevel::kMajorityReadConcern) {
+        // ReadConcern Majority is not supported in ProtocolVersion 0.
+        if (!testingSnapshotBehaviorInIsolation && !replCoord->isV1ElectionProtocol()) {
+            return {ErrorCodes::ReadConcernMajorityNotEnabled,
+                    str::stream() << "Replica sets running protocol version 0 do not support "
                                     "readConcern: majority"};
+        }
+
+        const int debugLevel = serverGlobalParams.clusterRole == ClusterRole::ConfigServer ? 1 : 2;
+
+        LOG(debugLevel) << "Waiting for 'committed' snapshot to be available for reading: "
+                        << readConcernArgs;
+
+        Status status = opCtx->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
+
+        // Wait until a snapshot is available.
+        while (status == ErrorCodes::ReadConcernMajorityNotAvailableYet) {
+            LOG(debugLevel) << "Snapshot not available yet.";
+            replCoord->waitUntilSnapshotCommitted(opCtx, SnapshotName::min());
+            status = opCtx->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
+        }
+
+        if (!status.isOK()) {
+            return status;
+        }
+
+        LOG(debugLevel) << "Using 'committed' snapshot: " << CurOp::get(opCtx)->query();
+    }
+
+    return Status::OK();
+}
+
+Status waitForLinearizableReadConcern(OperationContext* opCtx) {
+
+    repl::ReplicationCoordinator* replCoord =
+        repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
+
+    {
+        Lock::DBLock lk(opCtx, "local", MODE_IX);
+        Lock::CollectionLock lock(opCtx->lockState(), "local.oplog.rs", MODE_IX);
+
+        if (!replCoord->canAcceptWritesForDatabase(opCtx, "admin")) {
+            return {ErrorCodes::NotMaster,
+                    "No longer primary when waiting for linearizable read concern"};
+        }
+
+        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
+
+            WriteUnitOfWork uow(opCtx);
+            opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
+                opCtx,
+                BSON("msg"
+                     << "linearizable read"));
+            uow.commit();
+        }
+        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
+            opCtx, "waitForLinearizableReadConcern", "local.rs.oplog");
+    }
+    WriteConcernOptions wc = WriteConcernOptions(
+        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);
+
+    repl::OpTime lastOpApplied = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
+    auto awaitReplResult = replCoord->awaitReplication(opCtx, lastOpApplied, wc);
+    if (awaitReplResult.status == ErrorCodes::WriteConcernFailed) {
+        return Status(ErrorCodes::LinearizableReadConcernError,
+                      "Failed to confirm that read was linearizable.");
+    }
+    return awaitReplResult.status;
+}
+
 } // namespace
@@ -1435,7 +1598,7 @@ bool Command::run(OperationContext* opCtx,
     std::string errmsg;
     bool result;
 
-    auto startOperationTime = _getClientOperationTime(opCtx);
+    auto startOperationTime = getClientOperationTime(opCtx);
     if (!supportsWriteConcern(cmd)) {
         if (commandSpecifiesWriteConcern(cmd)) {
             auto result = appendCommandStatus(
@@ -1505,7 +1668,7 @@ bool Command::run(OperationContext* opCtx,
 
     appendCommandStatus(inPlaceReplyBob, result, errmsg);
 
-    auto operationTime = _computeOperationTime(
+    auto operationTime = computeOperationTime(
         opCtx, startOperationTime, readConcernArgsStatus.getValue().getLevel());
 
     // An uninitialized operation time means the cluster time is not propagated, so the operation
@@ -1710,13 +1873,13 @@ void mongo::execCommandDatabase(OperationContext* opCtx,
         BSONObjBuilder metadataBob;
         appendReplyMetadata(opCtx, request, &metadataBob);
 
-        // Ideally this should be using _computeOperationTime, but with the code
+        // Ideally this should be using computeOperationTime, but with the code
         // structured as it currently is we don't know the startOperationTime or
         // readConcern at this point. Using the cluster time instead of the actual
        // operation time is correct, but can result in extra waiting on subsequent
         // afterClusterTime reads.
         //
-        // TODO: SERVER-28445 change this to use _computeOperationTime once the exception handling
+        // TODO: SERVER-28445 change this to use computeOperationTime once the exception handling
         // path is moved into Command::run()
         auto operationTime = LogicalClock::get(opCtx)->getClusterTime().getTime();
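A note on the new testingSnapshotBehaviorInIsolation flag: it is registered with ServerParameterType::kStartupOnly, so it can only be set when the process starts (for example via mongod's --setParameter startup option) and cannot be flipped at runtime, which keeps the skipped replication checks fixed for the lifetime of a test run.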
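The heart of the new makeNoopWriteIfNeeded is a small decision: a read at afterClusterTime T can only be served once the local oplog has reached T, and a sharding-enabled node can close the gap by asking the primary to append a no-op entry. The following self-contained sketch models just that decision; LogicalTime and needsNoopWrite here are illustrative stand-ins, not the mongo:: types used in the diff.

#include <cstdint>
#include <iostream>

// Stand-in for mongo::LogicalTime: an opaque, totally ordered cluster timestamp.
struct LogicalTime {
    uint64_t t;
};

bool operator>(const LogicalTime& a, const LogicalTime& b) {
    return a.t > b.t;
}

// Models the branch structure of makeNoopWriteIfNeeded: a noop write is only
// needed when the requested time is ahead of the last applied optime, and only
// a sharding-enabled node has a shard registry through which to request it.
bool needsNoopWrite(LogicalTime requested, LogicalTime lastApplied, bool shardingEnabled) {
    if (!(requested > lastApplied)) {
        return false;  // Oplog has already caught up; the read can proceed.
    }
    // On a standalone replica set there is no need (and no way) to advance the
    // oplog here; the caller falls through to waitUntilOpTimeForRead instead.
    return shardingEnabled;
}

int main() {
    std::cout << needsNoopWrite({5}, {3}, true) << '\n';   // 1: append a noop
    std::cout << needsNoopWrite({5}, {7}, true) << '\n';   // 0: already applied
    std::cout << needsNoopWrite({5}, {3}, false) << '\n';  // 0: not sharded
}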
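For readConcern: majority, waitForReadConcern retries setReadFromMajorityCommittedSnapshot, blocking in waitUntilSnapshotCommitted between attempts until the storage engine can serve a committed snapshot. Below is a minimal, self-contained model of that retry-until-available pattern built on a plain condition variable; SnapshotTracker and its methods are invented for illustration and are not MongoDB APIs.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Invented stand-in for the coordination between the replication subsystem
// (which commits snapshots) and a reader waiting for one.
class SnapshotTracker {
public:
    // Models setReadFromMajorityCommittedSnapshot(): fails (returns false)
    // until a majority-committed snapshot exists.
    bool trySetReadFromCommittedSnapshot() {
        std::lock_guard<std::mutex> lk(_mutex);
        return _hasCommittedSnapshot;
    }

    // Models waitUntilSnapshotCommitted(): blocks until a snapshot is
    // marked majority-committed.
    void waitUntilSnapshotCommitted() {
        std::unique_lock<std::mutex> lk(_mutex);
        _cv.wait(lk, [&] { return _hasCommittedSnapshot; });
    }

    void markCommitted() {
        {
            std::lock_guard<std::mutex> lk(_mutex);
            _hasCommittedSnapshot = true;
        }
        _cv.notify_all();
    }

private:
    std::mutex _mutex;
    std::condition_variable _cv;
    bool _hasCommittedSnapshot = false;
};

int main() {
    SnapshotTracker tracker;
    std::thread replication([&] { tracker.markCommitted(); });

    // Same shape as the loop in waitForReadConcern: retry the snapshot
    // acquisition, sleeping on the condition variable between attempts.
    while (!tracker.trySetReadFromCommittedSnapshot()) {
        tracker.waitUntilSnapshotCommitted();
    }
    std::cout << "reading from committed snapshot\n";
    replication.join();
}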
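Finally, waitForLinearizableReadConcern confirms linearizability with a two-step protocol: append a no-op ({msg: "linearizable read"}) to the oplog under a write unit of work, then wait for that write to be majority-acknowledged. If the acknowledgement arrives, the node was still an accepted primary when the read was served. A self-contained model of that shape, with stub Status/write/await functions standing in for the real replication calls:

#include <iostream>
#include <string>

// Invented Status-like result; not the mongo::Status class.
struct Status {
    bool ok;
    std::string reason;
};

// Models the WriteUnitOfWork + onOpMessage call: the primary appends a no-op
// oplog entry that only a current primary could produce.
Status writeNoopToOplog() {
    return {true, ""};
}

// Models awaitReplication(lastOpApplied, w:majority): succeeds only if a
// majority of the replica set acknowledges the no-op.
Status awaitMajorityReplication() {
    return {true, ""};
}

// Same shape as waitForLinearizableReadConcern: a majority ack of the no-op
// proves the node's primaryship spanned the read.
Status waitForLinearizableRead() {
    Status write = writeNoopToOplog();
    if (!write.ok) {
        return write;
    }
    Status repl = awaitMajorityReplication();
    if (!repl.ok) {
        // Mirrors the diff's mapping of WriteConcernFailed onto
        // LinearizableReadConcernError.
        return {false, "Failed to confirm that read was linearizable."};
    }
    return {true, ""};
}

int main() {
    std::cout << (waitForLinearizableRead().ok ? "linearizable\n" : "failed\n");
}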