From 756a87f2e86d8f67259b5995d8f1cf7dcc27f7a6 Mon Sep 17 00:00:00 2001 From: Lingzhi Deng Date: Thu, 26 Dec 2019 10:56:06 -0500 Subject: SERVER-39364: Get the last oplog entry from the storage engine for setLastOpToSystemLastOpTime --- src/mongo/base/error_codes.yml | 3 +- src/mongo/db/ops/write_ops_exec.cpp | 2 +- src/mongo/db/repl/repl_client_info.cpp | 41 +++++++++++++++++++++- src/mongo/db/repl/repl_client_info.h | 15 ++++++-- src/mongo/db/repl/replication_coordinator.h | 12 +++++++ src/mongo/db/repl/replication_coordinator_impl.cpp | 15 ++++++++ src/mongo/db/repl/replication_coordinator_impl.h | 2 ++ src/mongo/db/repl/replication_coordinator_mock.cpp | 4 +++ src/mongo/db/repl/replication_coordinator_mock.h | 2 ++ src/mongo/db/repl/replication_coordinator_noop.cpp | 4 +++ src/mongo/db/repl/replication_coordinator_noop.h | 2 ++ src/mongo/db/s/collection_range_deleter.cpp | 8 +++-- src/mongo/db/s/txn_two_phase_commit_cmds.cpp | 2 +- .../storage/wiredtiger/wiredtiger_record_store.cpp | 6 +++- .../embedded/replication_coordinator_embedded.cpp | 4 +++ .../embedded/replication_coordinator_embedded.h | 2 ++ 16 files changed, 113 insertions(+), 11 deletions(-) (limited to 'src/mongo') diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml index fc806464ca7..9f1a4c5f8e5 100644 --- a/src/mongo/base/error_codes.yml +++ b/src/mongo/base/error_codes.yml @@ -307,7 +307,8 @@ error_codes: - {code: 280,name: ChangeStreamFatalError,categories: [NonResumableChangeStreamError]} # The two codes below are for internal use only and must never be returned in a network response - - {code: 281,name: TransactionCoordinatorSteppingDown} # Gets converted to InterruptedDueToReplStateChange + # TransactionCoordinatorSteppingDown gets converted to InterruptedDueToReplStateChange + - {code: 281,name: TransactionCoordinatorSteppingDown,categories: [Interruption]} - {code: 282,name: TransactionCoordinatorReachedAbortDecision} - {code: 283,name: WouldChangeOwningShard,extra: WouldChangeOwningShardInfo} diff --git a/src/mongo/db/ops/write_ops_exec.cpp b/src/mongo/db/ops/write_ops_exec.cpp index 7c93bc3303a..196fb827e86 100644 --- a/src/mongo/db/ops/write_ops_exec.cpp +++ b/src/mongo/db/ops/write_ops_exec.cpp @@ -167,7 +167,7 @@ public: // here. No-op updates will not generate a new lastOp, so we still need the // guard to fire in that case. Operations on the local DB aren't replicated, so they // don't need to bump the lastOp. - replClientInfo().setLastOpToSystemLastOpTime(_opCtx); + replClientInfo().setLastOpToSystemLastOpTimeIgnoringInterrupt(_opCtx); LOG(5) << "Set last op to system time: " << replClientInfo().getLastOp().getTimestamp(); } } diff --git a/src/mongo/db/repl/repl_client_info.cpp b/src/mongo/db/repl/repl_client_info.cpp index dc4c04342c7..0690e7b483e 100644 --- a/src/mongo/db/repl/repl_client_info.cpp +++ b/src/mongo/db/repl/repl_client_info.cpp @@ -69,7 +69,33 @@ void ReplClientInfo::setLastOp(OperationContext* opCtx, const OpTime& ot) { void ReplClientInfo::setLastOpToSystemLastOpTime(OperationContext* opCtx) { auto replCoord = repl::ReplicationCoordinator::get(opCtx->getServiceContext()); if (replCoord->isReplEnabled() && opCtx->writesAreReplicated()) { - auto systemOpTime = replCoord->getMyLastAppliedOpTime(); + OpTime systemOpTime; + auto status = [&] { + try { + // Get the latest OpTime from oplog. + systemOpTime = replCoord->getLatestWriteOpTime(opCtx); + return Status::OK(); + } catch (const DBException& e) { + // Fall back to use my lastAppliedOpTime if we failed to get the latest OpTime from + // storage. In most cases, it is safe to ignore errors because if + // getLatestWriteOpTime throws, we cannot use the same opCtx to wait for + // writeConcern anyways. But getLastError from the same client could use a different + // opCtx to wait for the lastOp. So this is a best effort attempt to set the lastOp + // to the in-memory lastAppliedOpTime (which could be lagged). And this is a known + // bug in getLastError. + systemOpTime = replCoord->getMyLastAppliedOpTime(); + if (e.toStatus() == ErrorCodes::OplogOperationUnsupported || + e.toStatus() == ErrorCodes::NamespaceNotFound || + e.toStatus() == ErrorCodes::CollectionIsEmpty || + ErrorCodes::isNotMasterError(e.toStatus())) { + // It is ok if the storage engine does not support getLatestOplogTimestamp() or + // if the oplog is empty. If the node stepped down in between, it is correct to + // use lastAppliedOpTime as last OpTime. + return Status::OK(); + } + return e.toStatus(); + } + }(); // If the system optime has gone backwards, that must mean that there was a rollback. // This is safe, but the last op for a Client should never go backwards, so just leave @@ -84,6 +110,19 @@ void ReplClientInfo::setLastOpToSystemLastOpTime(OperationContext* opCtx) { } lastOpInfo(opCtx).lastOpSetExplicitly = true; + + // Throw if getLatestWriteOpTime failed. + uassertStatusOK(status); + } +} + +void ReplClientInfo::setLastOpToSystemLastOpTimeIgnoringInterrupt(OperationContext* opCtx) { + try { + repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx); + } catch (const ExceptionForCat& e) { + // In most cases, it is safe to ignore interruption errors because we cannot use the same + // OperationContext to wait for writeConcern anyways. + LOG(2) << "Ignoring set last op interruption error: " << e.toStatus(); } } diff --git a/src/mongo/db/repl/repl_client_info.h b/src/mongo/db/repl/repl_client_info.h index 9f45ec49a24..232c95460fe 100644 --- a/src/mongo/db/repl/repl_client_info.h +++ b/src/mongo/db/repl/repl_client_info.h @@ -69,12 +69,21 @@ public: } /** - * Use this to set the LastOp to the latest known OpTime in the oplog. - * This is necessary when doing no-op writes, as we need to set the client's lastOp to a proper - * value for write concern wait to work. + * Use this to set the LastOp to the latest known OpTime in the oplog. On primary, The OpTime + * used consists of the timestamp of the latest oplog entry on disk and the current term. On + * secondaries, lastAppliedOpTime is used. Using lastAppliedOpTime on secondaries is the desired + * behavior, since secondaries do timestamped reads at the lastApplied. + * + * Setting the lastOp to the latest OpTime is necessary when doing no-op writes, as we need to + * set the client's lastOp to a proper value for write concern wait to work. */ void setLastOpToSystemLastOpTime(OperationContext* opCtx); + /** + * Same as setLastOpToSystemLastOpTime but ignores interruption errors. + */ + void setLastOpToSystemLastOpTimeIgnoringInterrupt(OperationContext* opCtx); + private: static const long long kUninitializedTerm = -1; diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h index d51920f0c95..b46b7ed6804 100644 --- a/src/mongo/db/repl/replication_coordinator.h +++ b/src/mongo/db/repl/replication_coordinator.h @@ -953,6 +953,18 @@ public: RaftMongoSpecActionEnum action, boost::optional oplogReadTimestamp = boost::none) const {} + /** + * Returns the OpTime that consists of the timestamp of the latest oplog entry and the current + * term. + * This function throws if: + * 1. It is called on secondaries. + * 2. OperationContext times out or is interrupted. + * 3. Oplog collection does not exist. + * 4. Oplog collection is empty. + * 5. Getting latest oplog timestamp is not supported by the storage engine. + */ + virtual OpTime getLatestWriteOpTime(OperationContext* opCtx) const = 0; + protected: ReplicationCoordinator(); }; diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 28578e2f2d0..9eb3a3848af 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -64,6 +64,7 @@ #include "mongo/db/repl/data_replicator_external_state_initial_sync.h" #include "mongo/db/repl/is_master_response.h" #include "mongo/db/repl/last_vote.h" +#include "mongo/db/repl/local_oplog_info.h" #include "mongo/db/repl/read_concern_args.h" #include "mongo/db/repl/repl_client_info.h" #include "mongo/db/repl/repl_set_config_checks.h" @@ -1974,6 +1975,20 @@ std::shared_ptr ReplicationCoordinatorImpl::awaitIsMaste return statusWithIsMaster.getValue(); } +OpTime ReplicationCoordinatorImpl::getLatestWriteOpTime(OperationContext* opCtx) const { + ShouldNotConflictWithSecondaryBatchApplicationBlock noPBWMBlock(opCtx->lockState()); + Lock::GlobalLock globalLock(opCtx, MODE_IS); + // Check if the node is primary after acquiring global IS lock. + uassert(ErrorCodes::NotMaster, + "Not primary so can't get latest write optime", + canAcceptNonLocalWrites()); + auto oplog = LocalOplogInfo::get(opCtx)->getCollection(); + uassert(ErrorCodes::NamespaceNotFound, "oplog collection does not exist.", oplog); + auto latestOplogTimestamp = + uassertStatusOK(oplog->getRecordStore()->getLatestOplogTimestamp(opCtx)); + return OpTime(latestOplogTimestamp, getTerm()); +} + void ReplicationCoordinatorImpl::_killConflictingOpsOnStepUpAndStepDown( AutoGetRstlForStepUpStepDown* arsc, ErrorCodes::Error reason) { const OperationContext* rstlOpCtx = arsc->getOpCtx(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index dfc1a414ddc..5fd6f46dd94 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -348,6 +348,8 @@ public: RaftMongoSpecActionEnum action, boost::optional oplogReadTimestamp = boost::none) const override; + virtual OpTime getLatestWriteOpTime(OperationContext* opCtx) const override; + // ================== Test support API =================== /** diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp index 57d9b02d7e5..835029ac9a9 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_mock.cpp @@ -566,5 +566,9 @@ std::shared_ptr ReplicationCoordinatorMock::awaitIsMaste return response; } +OpTime ReplicationCoordinatorMock::getLatestWriteOpTime(OperationContext* opCtx) const { + return getMyLastAppliedOpTime(); +} + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h index be9e97dc932..2893920d1e5 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.h +++ b/src/mongo/db/repl/replication_coordinator_mock.h @@ -323,6 +323,8 @@ public: boost::optional clientTopologyVersion, boost::optional deadline) const override; + virtual OpTime getLatestWriteOpTime(OperationContext* opCtx) const override; + private: ServiceContext* const _service; ReplSettings _settings; diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp index a3a6b93775b..6f502574645 100644 --- a/src/mongo/db/repl/replication_coordinator_noop.cpp +++ b/src/mongo/db/repl/replication_coordinator_noop.cpp @@ -470,5 +470,9 @@ std::shared_ptr ReplicationCoordinatorNoOp::awaitIsMaste MONGO_UNREACHABLE; } +OpTime ReplicationCoordinatorNoOp::getLatestWriteOpTime(OperationContext* opCtx) const { + return getMyLastAppliedOpTime(); +} + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h index 5828943fa97..ea825567454 100644 --- a/src/mongo/db/repl/replication_coordinator_noop.h +++ b/src/mongo/db/repl/replication_coordinator_noop.h @@ -260,6 +260,8 @@ public: boost::optional clientTopologyVersion, boost::optional deadline) const final; + OpTime getLatestWriteOpTime(OperationContext* opCtx) const override; + private: ServiceContext* const _service; }; diff --git a/src/mongo/db/s/collection_range_deleter.cpp b/src/mongo/db/s/collection_range_deleter.cpp index 15a37946f98..ef93a23c06f 100644 --- a/src/mongo/db/s/collection_range_deleter.cpp +++ b/src/mongo/db/s/collection_range_deleter.cpp @@ -340,12 +340,14 @@ boost::optional CollectionRangeDeleter::cleanUpNextRange( LOG(0) << "Waiting for majority replication of local deletions in " << nss.ns() << " range " << redact(range->toString()); - repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx); - const auto clientOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp(); - // Wait for replication outside the lock const auto replicationStatus = [&] { try { + repl::ReplClientInfo::forClient(opCtx->getClient()) + .setLastOpToSystemLastOpTime(opCtx); + const auto clientOpTime = + repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp(); + WriteConcernResult unusedWCResult; return waitForWriteConcern( opCtx, clientOpTime, kMajorityWriteConcern, &unusedWCResult); diff --git a/src/mongo/db/s/txn_two_phase_commit_cmds.cpp b/src/mongo/db/s/txn_two_phase_commit_cmds.cpp index 21293128b58..30e1d90ae9e 100644 --- a/src/mongo/db/s/txn_two_phase_commit_cmds.cpp +++ b/src/mongo/db/s/txn_two_phase_commit_cmds.cpp @@ -244,7 +244,7 @@ public: // (in all cases except the one where this command aborts the local participant), so // ensure waiting for the client's writeConcern of the decision. repl::ReplClientInfo::forClient(opCtx->getClient()) - .setLastOpToSystemLastOpTime(opCtx); + .setLastOpToSystemLastOpTimeIgnoringInterrupt(opCtx); }); if (coordinatorDecisionFuture) { diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 6388ef65ead..ab58970826a 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -1406,7 +1406,11 @@ StatusWith WiredTigerRecordStore::getLatestOplogTimestamp( WT_SESSION* sess = sessRaii->getSession(); WT_CURSOR* cursor; invariantWTOK(sess->open_cursor(sess, _uri.c_str(), nullptr, nullptr, &cursor)); - invariantWTOK(cursor->prev(cursor)); + int ret = cursor->prev(cursor); + if (ret == WT_NOTFOUND) { + return Status(ErrorCodes::CollectionIsEmpty, "oplog is empty"); + } + invariantWTOK(ret); RecordId recordId = getKey(cursor); invariantWTOK(sess->reset(sess)); diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp index 9d3ce294c02..f8010210e61 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.cpp +++ b/src/mongo/embedded/replication_coordinator_embedded.cpp @@ -496,5 +496,9 @@ std::shared_ptr ReplicationCoordinatorEmbedded::aw UASSERT_NOT_IMPLEMENTED; }; +OpTime ReplicationCoordinatorEmbedded::getLatestWriteOpTime(OperationContext* opCtx) const { + return getMyLastAppliedOpTime(); +} + } // namespace embedded } // namespace mongo diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h index 732379e8423..eac2c5e97ae 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.h +++ b/src/mongo/embedded/replication_coordinator_embedded.h @@ -267,6 +267,8 @@ public: boost::optional previous, boost::optional deadline) const override; + repl::OpTime getLatestWriteOpTime(OperationContext* opCtx) const override; + private: // Back pointer to the ServiceContext that has started the instance. ServiceContext* const _service; -- cgit v1.2.1