/**
 *    Copyright (C) 2018-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kCommand

#include "mongo/platform/basic.h"

#include "mongo/db/read_concern.h"

#include "mongo/base/status.h"
#include "mongo/base/status_with.h"
#include "mongo/db/commands.h"
#include "mongo/db/concurrency/d_concurrency.h"
#include "mongo/db/concurrency/write_conflict_exception.h"
#include "mongo/db/curop.h"
#include "mongo/db/logical_clock.h"
#include "mongo/db/op_observer.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/read_concern_args.h"
#include "mongo/db/repl/repl_client_info.h"
#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/s/sharding_state.h"
#include "mongo/db/server_options.h"
#include "mongo/db/server_parameters.h"
#include "mongo/s/client/shard_registry.h"
#include "mongo/s/grid.h"
#include "mongo/util/concurrency/notification.h"
#include "mongo/util/log.h"

namespace mongo {
namespace {

/**
 * Synchronizes concurrent noop-write requests for the same clusterTime.
 */
class WriteRequestSynchronizer;
const auto getWriteRequestsSynchronizer =
    ServiceContext::declareDecoration<WriteRequestSynchronizer>();

class WriteRequestSynchronizer {
public:
    WriteRequestSynchronizer() = default;

    /**
     * Returns a tuple <false, existingWriteRequest> if it can find one that happened at or
     * after clusterTime.
     * Returns a tuple <true, newWriteRequest> otherwise.
     */
    std::tuple<bool, std::shared_ptr<Notification<Status>>> getOrCreateWriteRequest(
        LogicalTime clusterTime) {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        auto lastEl = _writeRequests.rbegin();
        if (lastEl != _writeRequests.rend() && lastEl->first >= clusterTime.asTimestamp()) {
            return std::make_tuple(false, lastEl->second);
        } else {
            auto newWriteRequest = std::make_shared<Notification<Status>>();
            _writeRequests[clusterTime.asTimestamp()] = newWriteRequest;
            return std::make_tuple(true, newWriteRequest);
        }
    }

    /**
     * Erases the writeRequest that happened at clusterTime.
     */
    void deleteWriteRequest(LogicalTime clusterTime) {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        auto el = _writeRequests.find(clusterTime.asTimestamp());
        invariant(el != _writeRequests.end());
        invariant(el->second);
        el->second.reset();
        _writeRequests.erase(el);
    }

private:
    stdx::mutex _mutex;
    std::map<Timestamp, std::shared_ptr<Notification<Status>>> _writeRequests;
};

MONGO_EXPORT_SERVER_PARAMETER(waitForSecondaryBeforeNoopWriteMS, int, 10);

/**
 * Schedules a write via the appendOplogNote command to the primary of this replica set.
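 *
 * An afterClusterTime read can only be satisfied once this node's oplog has reached the
 * requested clusterTime. If no client is writing, the oplog does not advance on its own, so
 * the first thread that needs a given clusterTime asks the primary to append a noop entry.
 * Concurrent threads discover the in-flight request through the WriteRequestSynchronizer and
 * block on its shared Notification<Status> instead of issuing duplicate writes.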
 */
Status makeNoopWriteIfNeeded(OperationContext* opCtx, LogicalTime clusterTime) {
    repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
    invariant(replCoord->isReplEnabled());

    auto& writeRequests = getWriteRequestsSynchronizer(opCtx->getClient()->getServiceContext());

    auto lastAppliedOpTime = LogicalTime(replCoord->getMyLastAppliedOpTime().getTimestamp());

    // Secondaries may lag the primary, so wait first to avoid unnecessary noop writes.
    if (clusterTime > lastAppliedOpTime && replCoord->getMemberState().secondary()) {
        auto deadline = Date_t::now() + Milliseconds(waitForSecondaryBeforeNoopWriteMS.load());
        auto readConcernArgs =
            repl::ReadConcernArgs(clusterTime, repl::ReadConcernLevel::kLocalReadConcern);
        auto waitStatus = replCoord->waitUntilOpTimeForReadUntil(opCtx, readConcernArgs, deadline);
        lastAppliedOpTime = LogicalTime(replCoord->getMyLastAppliedOpTime().getTimestamp());
        if (!waitStatus.isOK()) {
            LOG(1) << "Wait for clusterTime: " << clusterTime.toString()
                   << " until deadline: " << deadline << " failed with " << waitStatus.toString();
        }
    }

    auto status = Status::OK();
    int remainingAttempts = 3;
    // This loop addresses the case where two or more threads need to advance the oplog time, but
    // the one that waits for the notification gets the later clusterTime; when that request
    // finishes, it needs to be repeated with the later time.
    while (clusterTime > lastAppliedOpTime) {
        auto shardingState = ShardingState::get(opCtx);
        // Standalone replica set, so there is no need to advance the oplog on the primary.
        if (!shardingState->enabled()) {
            return Status::OK();
        }

        auto myShard =
            Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardingState->getShardName());
        if (!myShard.isOK()) {
            return myShard.getStatus();
        }

        if (!remainingAttempts--) {
            std::stringstream ss;
            ss << "Requested clusterTime " << clusterTime.toString()
               << " is greater than the last primary OpTime: " << lastAppliedOpTime.toString()
               << ", no retries left";
            return Status(ErrorCodes::InternalError, ss.str());
        }

        auto myWriteRequest = writeRequests.getOrCreateWriteRequest(clusterTime);
        if (std::get<0>(myWriteRequest)) {  // It's a new request.
            try {
                LOG(2) << "New appendOplogNote request on clusterTime: " << clusterTime.toString()
                       << " remaining attempts: " << remainingAttempts;
                auto swRes = myShard.getValue()->runCommand(
                    opCtx,
                    ReadPreferenceSetting(ReadPreference::PrimaryOnly),
                    "admin",
                    BSON("appendOplogNote" << 1 << "maxClusterTime" << clusterTime.asTimestamp()
                                           << "data"
                                           << BSON("noop write for afterClusterTime read concern"
                                                   << 1)),
                    Shard::RetryPolicy::kIdempotent);
                status = swRes.getStatus();
                std::get<1>(myWriteRequest)->set(status);
                writeRequests.deleteWriteRequest(clusterTime);
            } catch (const DBException& ex) {
                status = ex.toStatus();
                // Signal the writeRequest to unblock waiters.
                std::get<1>(myWriteRequest)->set(status);
                writeRequests.deleteWriteRequest(clusterTime);
            }
        } else {
            LOG(2) << "Join appendOplogNote request on clusterTime: " << clusterTime.toString()
                   << " remaining attempts: " << remainingAttempts;
            try {
                status = std::get<1>(myWriteRequest)->get(opCtx);
            } catch (const DBException& ex) {
                return ex.toStatus();
            }
        }
        // If the write status is OK, the caller still needs to wait for the oplog to replicate.
        if (status.isOK()) {
            return status;
        }
        lastAppliedOpTime = LogicalTime(replCoord->getMyLastAppliedOpTime().getTimestamp());
    }
    // This is the case where the noop write failed, but the oplog caught up to clusterTime
    // through replication.
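    // Exiting the loop guarantees lastAppliedOpTime >= clusterTime, which is all the caller
    // needs; the failure of this particular noop write is therefore only worth a log line.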
    if (!status.isOK()) {
        LOG(1) << "Reached clusterTime " << lastAppliedOpTime.toString()
               << " but failed noop write due to " << status.toString();
    }
    return Status::OK();
}
}  // namespace

Status waitForReadConcern(OperationContext* opCtx,
                          const repl::ReadConcernArgs& readConcernArgs,
                          bool allowAfterClusterTime) {
    repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx);
    invariant(replCoord);

    if (readConcernArgs.getLevel() == repl::ReadConcernLevel::kLinearizableReadConcern) {
        if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet) {
            // For master/slave and standalone nodes, linearizable read concern is not supported.
            return {ErrorCodes::NotAReplicaSet,
                    "node needs to be a replica set member to use read concern"};
        }

        // Replica sets running pv0 do not support linearizable read concern until further testing
        // is completed (SERVER-27025).
        if (!replCoord->isV1ElectionProtocol()) {
            return {ErrorCodes::IncompatibleElectionProtocol,
                    "Replica sets running protocol version 0 do not support readConcern: "
                    "linearizable"};
        }

        if (readConcernArgs.getArgsOpTime()) {
            return {ErrorCodes::FailedToParse,
                    "afterOpTime not compatible with linearizable read concern"};
        }

        if (!replCoord->getMemberState().primary()) {
            return {ErrorCodes::NotMaster,
                    "cannot satisfy linearizable read concern on non-primary node"};
        }
    }

    auto afterClusterTime = readConcernArgs.getArgsClusterTime();
    if (afterClusterTime) {
        if (!allowAfterClusterTime) {
            return {ErrorCodes::InvalidOptions,
                    "afterClusterTime is not allowed for this command"};
        }

        if ((serverGlobalParams.featureCompatibility.getVersion() !=
             ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo36) &&
            ShardingState::get(opCtx)->enabled()) {
            return {ErrorCodes::InvalidOptions,
                    "readConcern afterClusterTime is not available in featureCompatibilityVersion "
                    "3.4 in a sharded cluster"};
        }

        auto currentTime = LogicalClock::get(opCtx)->getClusterTime();
        if (currentTime < *afterClusterTime) {
            return {ErrorCodes::InvalidOptions,
                    "readConcern afterClusterTime must not be greater than clusterTime value"};
        }
    }

    auto pointInTime = readConcernArgs.getArgsPointInTime();
    if (pointInTime) {
        fassertStatusOK(39345, opCtx->recoveryUnit()->selectSnapshot(pointInTime->asTimestamp()));
    }

    if (!readConcernArgs.isEmpty()) {
        if (replCoord->isReplEnabled() && afterClusterTime) {
            auto status = makeNoopWriteIfNeeded(opCtx, *afterClusterTime);
            if (!status.isOK()) {
                LOG(0) << "Failed noop write at clusterTime: " << afterClusterTime->toString()
                       << " due to " << status.toString();
            }
        }

        if (replCoord->isReplEnabled() || !afterClusterTime) {
            auto status = replCoord->waitUntilOpTimeForRead(opCtx, readConcernArgs);
            if (!status.isOK()) {
                return status;
            }
        }
    }

    if (readConcernArgs.getLevel() == repl::ReadConcernLevel::kMajorityReadConcern &&
        replCoord->getReplicationMode() == repl::ReplicationCoordinator::Mode::modeReplSet) {
        // ReadConcern majority is not supported in protocol version 0.
        if (!replCoord->isV1ElectionProtocol()) {
            return {ErrorCodes::ReadConcernMajorityNotEnabled,
                    str::stream() << "Replica sets running protocol version 0 do not support "
                                     "readConcern: majority"};
        }

        const int debugLevel = serverGlobalParams.clusterRole == ClusterRole::ConfigServer ? 1 : 2;

        LOG(debugLevel) << "Waiting for 'committed' snapshot to be available for reading: "
                        << readConcernArgs;

        Status status = opCtx->recoveryUnit()->setReadFromMajorityCommittedSnapshot();

        // Wait until a snapshot is available.
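        // ReadConcernMajorityNotAvailableYet is transient: block until the replication system
        // commits a snapshot, then retry. Any other error is passed back to the caller.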
        while (status == ErrorCodes::ReadConcernMajorityNotAvailableYet) {
            LOG(debugLevel) << "Snapshot not available yet.";
            replCoord->waitUntilSnapshotCommitted(opCtx, Timestamp());
            status = opCtx->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
        }

        if (!status.isOK()) {
            return status;
        }

        LOG(debugLevel) << "Using 'committed' snapshot: " << CurOp::get(opCtx)->opDescription();
    }

    return Status::OK();
}

Status waitForLinearizableReadConcern(OperationContext* opCtx) {
    repl::ReplicationCoordinator* replCoord =
        repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());

    {
        // Writing a noop onto the oplog, and later waiting for that write to be majority
        // replicated, confirms that this node was still the primary at the time of the read.
        Lock::DBLock lk(opCtx, "local", MODE_IX);
        Lock::CollectionLock lock(opCtx->lockState(), "local.oplog.rs", MODE_IX);

        if (!replCoord->canAcceptWritesForDatabase(opCtx, "admin")) {
            return {ErrorCodes::NotMaster,
                    "No longer primary when waiting for linearizable read concern"};
        }

        writeConflictRetry(opCtx, "waitForLinearizableReadConcern", "local.oplog.rs", [&opCtx] {
            WriteUnitOfWork uow(opCtx);
            opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                opCtx,
                BSON("msg"
                     << "linearizable read"));
            uow.commit();
        });
    }

    WriteConcernOptions wc = WriteConcernOptions(
        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);

    // Wait for the noop write above to be majority replicated; if that cannot be confirmed,
    // neither can the linearizability of the read.
    repl::OpTime lastOpApplied = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
    auto awaitReplResult = replCoord->awaitReplication(opCtx, lastOpApplied, wc);
    if (awaitReplResult.status == ErrorCodes::WriteConcernFailed) {
        return Status(ErrorCodes::LinearizableReadConcernError,
                      "Failed to confirm that read was linearizable.");
    }
    return awaitReplResult.status;
}

}  // namespace mongo