diff options
author | Samyukta Lanka <samy.lanka@mongodb.com> | 2019-09-26 02:04:06 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-09-26 02:04:06 +0000 |
commit | 39ccddf56533f16c855a78d0fdb1a721672b77d9 (patch) | |
tree | d3e04847f1d1a9cbadc3b669d12f13af5f613cd3 | |
parent | b67f95d4d58a40a084c2efadaa074b03cd87d2f2 (diff) | |
download | mongo-39ccddf56533f16c855a78d0fdb1a721672b77d9.tar.gz |
SERVER-41508 Track the time the new term oplog entry was majority committed in replSetStatus on primaries
(cherry picked from commit 6d3a44af67e206e1d61cfd904ef62c0ba015805a)
SERVER-43398 Fix race in replSetGetStatus_new_term_oplog_entry_fields.js
(cherry picked from commit 5f0fa89444bf7bdffee8f9107f803d6a00970dc8)
14 files changed, 155 insertions, 11 deletions
diff --git a/jstests/replsets/replSetGetStatus_new_term_oplog_entry_fields.js b/jstests/replsets/replSetGetStatus_new_term_oplog_entry_fields.js new file mode 100644 index 00000000000..ec286d323bb --- /dev/null +++ b/jstests/replsets/replSetGetStatus_new_term_oplog_entry_fields.js @@ -0,0 +1,64 @@ +/** + * Tests that the 'newTermStartDate' and 'wMajorityWriteAvailabilityDate' fields of the + * replSetGetStatus 'electionCandidateMetrics' section are present only when they should be. + */ + +(function() { +"use strict"; +load("jstests/libs/write_concern_util.js"); +load("jstests/replsets/rslib.js"); + +const name = jsTestName(); +const rst = new ReplSetTest({name: name, nodes: 3}); + +rst.startSet(); +rst.initiateWithHighElectionTimeout(); +rst.awaitReplication(); + +stopServerReplication(rst.nodes); + +// Step up one of the secondaries. +const newPrimary = rst.getSecondary(); +assert.soonNoExcept(function() { + assert.commandWorked(newPrimary.adminCommand({replSetStepUp: 1})); + rst.awaitNodesAgreeOnPrimary(rst.kDefaultTimeoutMS, rst.nodes, rst.getNodeId(newPrimary)); + return newPrimary.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY; +}, 'failed to step up node ' + newPrimary.host, rst.kDefaultTimeoutMS); + +// Wait until the new primary completes the transition to primary and writes a no-op. +assert.eq(rst.getPrimary(), newPrimary); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has the +// 'newTermStartDate' field once the transition to primary is complete. +let res = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1})); +assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); +assert(res.electionCandidateMetrics.newTermStartDate, + () => "Response should have an 'electionCandidateMetrics.newTermStartDate' field: " + + tojson(res.electionCandidateMetrics)); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response does not have +// the 'wMajorityWriteAvailabilityDate' field before the new term oplog entry has been replicated. +assert( + !res.electionCandidateMetrics.wMajorityWriteAvailabilityDate, + () => + "Response should not have an 'electionCandidateMetrics.wMajorityWriteAvailabilityDate' field: " + + tojson(res.electionCandidateMetrics)); + +restartReplSetReplication(rst); +rst.awaitLastOpCommitted(); + +// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has the +// 'wMajorityWriteAvailabilityDate' field once the new term oplog entry is in the committed +// snapshot. +res = assert.commandWorked(newPrimary.adminCommand({replSetGetStatus: 1})); +assert(res.electionCandidateMetrics, + () => "Response should have an 'electionCandidateMetrics' field: " + tojson(res)); +assert( + res.electionCandidateMetrics.wMajorityWriteAvailabilityDate, + () => + "Response should have an 'electionCandidateMetrics.wMajorityWriteAvailabilityDate' field: " + + tojson(res.electionCandidateMetrics)); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h index 2f8bd118552..5c56aac38c5 100644 --- a/src/mongo/db/repl/replication_coordinator.h +++ b/src/mongo/db/repl/replication_coordinator.h @@ -853,6 +853,12 @@ public: virtual size_t getNumUncommittedSnapshots() = 0; /** + * Creates a CallbackWaiter that waits for w:majority write concern to be satisfied up to opTime + * before setting the 'wMajorityWriteAvailabilityDate' election candidate metric. + */ + virtual void createWMajorityWriteAvailabilityDateWaiter(OpTime opTime) = 0; + + /** * Returns a new WriteConcernOptions based on "wc" but with UNSET syncMode reset to JOURNAL or * NONE based on our rsConfig. */ diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index ba98e84b2d0..bcaf5d94b72 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -492,6 +492,9 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC auto newTermStartDate = loadLastOpTimeAndWallTimeResult.getValue().wallTime; ReplicationMetrics::get(opCtx).setNewTermStartDate(newTermStartDate); + auto replCoord = ReplicationCoordinator::get(opCtx); + replCoord->createWMajorityWriteAvailabilityDateWaiter(opTimeToReturn); + _shardingOnTransitionToPrimaryHook(opCtx); _dropAllTempCollections(opCtx); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 7f433b3e837..aafca514fd2 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -879,6 +879,7 @@ void ReplicationCoordinatorImpl::shutdown(OperationContext* opCtx) { } _replicationWaiterList.signalAll_inlock(); _opTimeWaiterList.signalAll_inlock(); + _wMajorityWriteAvailabilityWaiter.reset(); _currentCommittedSnapshotCond.notify_all(); _initialSyncer.swap(initialSyncerCopy); } @@ -2118,6 +2119,7 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx, // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics(); + _wMajorityWriteAvailabilityWaiter.reset(); _topCoord->finishUnconditionalStepDown(); @@ -2702,6 +2704,7 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx).clearElectionCandidateMetrics(); + _wMajorityWriteAvailabilityWaiter.reset(); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & @@ -3181,7 +3184,7 @@ void ReplicationCoordinatorImpl::incrementNumCatchUpOpsIfCatchingUp(int numOps) void ReplicationCoordinatorImpl::signalDropPendingCollectionsRemovedFromStorage() { stdx::lock_guard<stdx::mutex> lock(_mutex); - _wakeReadyWaiters_inlock(); + _wakeReadyWaiters(lock); } boost::optional<Timestamp> ReplicationCoordinatorImpl::getRecoveryTimestamp() { @@ -3273,10 +3276,26 @@ ReplicationCoordinatorImpl::_setCurrentRSConfig(WithLock lk, return action; } -void ReplicationCoordinatorImpl::_wakeReadyWaiters_inlock() { +void ReplicationCoordinatorImpl::_wakeReadyWaiters(WithLock lk) { _replicationWaiterList.signalIf_inlock([this](Waiter* waiter) { return _doneWaitingForReplication_inlock(waiter->opTime, *waiter->writeConcern); }); + + if (_wMajorityWriteAvailabilityWaiter) { + WriteConcernOptions kMajorityWriteConcern( + WriteConcernOptions::kMajority, + WriteConcernOptions::SyncMode::UNSET, + // The timeout isn't used by _doneWaitingForReplication_inlock. + WriteConcernOptions::kNoTimeout); + kMajorityWriteConcern = + _populateUnsetWriteConcernOptionsSyncMode(lk, kMajorityWriteConcern); + + if (_doneWaitingForReplication_inlock(_wMajorityWriteAvailabilityWaiter->opTime, + kMajorityWriteConcern)) { + _wMajorityWriteAvailabilityWaiter->notify_inlock(); + _wMajorityWriteAvailabilityWaiter.reset(); + } + } } Status ReplicationCoordinatorImpl::processReplSetUpdatePosition(const UpdatePositionArgs& updates, @@ -3504,7 +3523,7 @@ void ReplicationCoordinatorImpl::_updateLastCommittedOpTimeAndWallTime(WithLock // check satisfied. We must do this regardless of whether we updated the lastCommittedOpTime, // as lastCommittedOpTime may be based on durable optimes whereas some waiters may be // waiting on applied (but not necessarily durable) optimes. - _wakeReadyWaiters_inlock(); + _wakeReadyWaiters(lk); } boost::optional<OpTimeAndWallTime> ReplicationCoordinatorImpl::_chooseStableOpTimeFromCandidates( @@ -3650,7 +3669,7 @@ void ReplicationCoordinatorImpl::_setStableTimestampForStorage(WithLock lk) { if (serverGlobalParams.enableMajorityReadConcern) { // When majority read concern is enabled, the committed snapshot is set to the new // stable optime. - if (_updateCommittedSnapshot_inlock(stableOpTime.value())) { + if (_updateCommittedSnapshot(lk, stableOpTime.value())) { // Update the stable timestamp for the storage engine. _storage->setStableTimestamp(getServiceContext(), stableOpTime->opTime.getTimestamp()); @@ -3666,7 +3685,7 @@ void ReplicationCoordinatorImpl::_setStableTimestampForStorage(WithLock lk) { // lastCommittedOpTime is set to be the lastApplied which can be ahead of the // allCommitted. auto newCommittedSnapshot = std::min(lastCommittedOpTime, *stableOpTime); - _updateCommittedSnapshot_inlock(newCommittedSnapshot); + _updateCommittedSnapshot(lk, newCommittedSnapshot); } // Set the stable timestamp regardless of whether the majority commit point moved // forward. @@ -3923,8 +3942,17 @@ size_t ReplicationCoordinatorImpl::getNumUncommittedSnapshots() { return _uncommittedSnapshotsSize.load(); } -bool ReplicationCoordinatorImpl::_updateCommittedSnapshot_inlock( - const OpTimeAndWallTime& newCommittedSnapshot) { +void ReplicationCoordinatorImpl::createWMajorityWriteAvailabilityDateWaiter(OpTime opTime) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + auto opTimeCB = [this, opTime]() { + ReplicationMetrics::get(getServiceContext()) + .setWMajorityWriteAvailabilityDate(_replExecutor->now()); + }; + _wMajorityWriteAvailabilityWaiter = std::make_unique<CallbackWaiter>(opTime, opTimeCB); +} + +bool ReplicationCoordinatorImpl::_updateCommittedSnapshot( + WithLock lk, const OpTimeAndWallTime& newCommittedSnapshot) { if (gTestingSnapshotBehaviorInIsolation) { return false; } @@ -3956,7 +3984,7 @@ bool ReplicationCoordinatorImpl::_updateCommittedSnapshot_inlock( _externalState->updateCommittedSnapshot(newCommittedSnapshot.opTime); // Wake up any threads waiting for read concern or write concern. - _wakeReadyWaiters_inlock(); + _wakeReadyWaiters(lk); return true; } @@ -4007,11 +4035,16 @@ EventHandle ReplicationCoordinatorImpl::_makeEvent() { WriteConcernOptions ReplicationCoordinatorImpl::populateUnsetWriteConcernOptionsSyncMode( WriteConcernOptions wc) { + stdx::lock_guard<stdx::mutex> lock(_mutex); + return _populateUnsetWriteConcernOptionsSyncMode(lock, wc); +} +WriteConcernOptions ReplicationCoordinatorImpl::_populateUnsetWriteConcernOptionsSyncMode( + WithLock lk, WriteConcernOptions wc) { WriteConcernOptions writeConcern(wc); if (writeConcern.syncMode == WriteConcernOptions::SyncMode::UNSET) { if (writeConcern.wMode == WriteConcernOptions::kMajority && - getWriteConcernMajorityShouldJournal()) { + getWriteConcernMajorityShouldJournal_inlock()) { writeConcern.syncMode = WriteConcernOptions::SyncMode::JOURNAL; } else { writeConcern.syncMode = WriteConcernOptions::SyncMode::NONE; diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 57c31f707aa..8d6753f6f58 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -316,6 +316,8 @@ public: virtual size_t getNumUncommittedSnapshots() override; + virtual void createWMajorityWriteAvailabilityDateWaiter(OpTime opTime) override; + virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode( WriteConcernOptions wc) override; @@ -733,6 +735,13 @@ private: void _resetMyLastOpTimes(WithLock lk); /** + * Returns a new WriteConcernOptions based on "wc" but with UNSET syncMode reset to JOURNAL or + * NONE based on our rsConfig. + */ + WriteConcernOptions _populateUnsetWriteConcernOptionsSyncMode(WithLock lk, + WriteConcernOptions wc); + + /** * Returns the _writeConcernMajorityJournalDefault of our current _rsConfig. */ bool getWriteConcernMajorityShouldJournal_inlock() const; @@ -773,7 +782,7 @@ private: /** * Helper to wake waiters in _replicationWaiterList that are doneWaitingForReplication. */ - void _wakeReadyWaiters_inlock(); + void _wakeReadyWaiters(WithLock lk); /** * Scheduled to cause the ReplicationCoordinator to reconsider any state that might @@ -1195,7 +1204,7 @@ private: * * Returns true if the value was updated to `newCommittedSnapshot`. */ - bool _updateCommittedSnapshot_inlock(const OpTimeAndWallTime& newCommittedSnapshot); + bool _updateCommittedSnapshot(WithLock lk, const OpTimeAndWallTime& newCommittedSnapshot); /** * A helper method that returns the current stable optime based on the current commit point and @@ -1391,6 +1400,9 @@ private: // Does *not* own the WaiterInfos. WaiterList _opTimeWaiterList; // (M) + // Waiter waiting on w:majority write availability. + std::unique_ptr<CallbackWaiter> _wMajorityWriteAvailabilityWaiter; // (M) + // Set to true when we are in the process of shutting down replication. bool _inShutdown; // (M) diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index ea12a516ba1..b1464c0de9b 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -68,6 +68,7 @@ public: // Clear the node's election candidate metrics if it loses either the dry-run or actual // election, since it will not become primary. ReplicationMetrics::get(getGlobalServiceContext()).clearElectionCandidateMetrics(); + _replCoord->_wMajorityWriteAvailabilityWaiter.reset(); } void dismiss() { diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 309c5e2e708..6fa02d27a39 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -428,6 +428,7 @@ void ReplicationCoordinatorImpl::_stepDownFinish( // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); + _wMajorityWriteAvailabilityWaiter.reset(); _topCoord->finishUnconditionalStepDown(); @@ -654,6 +655,7 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( // Clear the node's election candidate metrics since it is no longer primary. ReplicationMetrics::get(opCtx.get()).clearElectionCandidateMetrics(); + _wMajorityWriteAvailabilityWaiter.reset(); } else { // Release the rstl lock as the node might have stepped down due to // other unconditional step down code paths like learning new term via heartbeat & diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp index cd8bfb9dc3a..6f95f6a5fd5 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_mock.cpp @@ -509,6 +509,10 @@ size_t ReplicationCoordinatorMock::getNumUncommittedSnapshots() { return 0; } +void ReplicationCoordinatorMock::createWMajorityWriteAvailabilityDateWaiter(OpTime opTime) { + return; +} + WriteConcernOptions ReplicationCoordinatorMock::populateUnsetWriteConcernOptionsSyncMode( WriteConcernOptions wc) { if (wc.syncMode == WriteConcernOptions::SyncMode::UNSET) { diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h index cb3bab9e157..f3cc223981e 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.h +++ b/src/mongo/db/repl/replication_coordinator_mock.h @@ -274,6 +274,8 @@ public: virtual size_t getNumUncommittedSnapshots() override; + virtual void createWMajorityWriteAvailabilityDateWaiter(OpTime opTime) override; + virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode( WriteConcernOptions wc) override; diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp index b6fb3289f35..eae67e976fd 100644 --- a/src/mongo/db/repl/replication_metrics.cpp +++ b/src/mongo/db/repl/replication_metrics.cpp @@ -283,6 +283,11 @@ void ReplicationMetrics::setNewTermStartDate(Date_t newTermStartDate) { _electionCandidateMetrics.setNewTermStartDate(newTermStartDate); } +void ReplicationMetrics::setWMajorityWriteAvailabilityDate(Date_t wMajorityWriteAvailabilityDate) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + _electionCandidateMetrics.setWMajorityWriteAvailabilityDate(wMajorityWriteAvailabilityDate); +} + boost::optional<OpTime> ReplicationMetrics::getTargetCatchupOpTime_forTesting() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _electionCandidateMetrics.getTargetCatchupOpTime(); @@ -306,6 +311,7 @@ void ReplicationMetrics::clearElectionCandidateMetrics() { _electionCandidateMetrics.setTargetCatchupOpTime(boost::none); _electionCandidateMetrics.setNumCatchUpOps(boost::none); _electionCandidateMetrics.setNewTermStartDate(boost::none); + _electionCandidateMetrics.setWMajorityWriteAvailabilityDate(boost::none); _nodeIsCandidateOrPrimary = false; } diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h index a169e8a0da5..9d868533cfc 100644 --- a/src/mongo/db/repl/replication_metrics.h +++ b/src/mongo/db/repl/replication_metrics.h @@ -85,6 +85,7 @@ public: void setTargetCatchupOpTime(OpTime opTime); void setNumCatchUpOps(int numCatchUpOps); void setNewTermStartDate(Date_t newTermStartDate); + void setWMajorityWriteAvailabilityDate(Date_t wMajorityWriteAvailabilityDate); boost::optional<OpTime> getTargetCatchupOpTime_forTesting(); diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl index c17140b9125..2613eac59b9 100644 --- a/src/mongo/db/repl/replication_metrics.idl +++ b/src/mongo/db/repl/replication_metrics.idl @@ -139,6 +139,10 @@ structs: description: "Time the new term oplog entry was written" type: date optional: true + wMajorityWriteAvailabilityDate: + description: "Time w:majority write concern is satisfied for new term oplog entry" + type: date + optional: true ElectionParticipantMetrics: description: "Stores metrics that are specific to the last election in which the node voted" diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp index b11d0106d77..767a8369d31 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.cpp +++ b/src/mongo/embedded/replication_coordinator_embedded.cpp @@ -464,6 +464,10 @@ size_t ReplicationCoordinatorEmbedded::getNumUncommittedSnapshots() { UASSERT_NOT_IMPLEMENTED; } +void ReplicationCoordinatorEmbedded::createWMajorityWriteAvailabilityDateWaiter(OpTime opTime) { + UASSERT_NOT_IMPLEMENTED; +} + void ReplicationCoordinatorEmbedded::dropAllSnapshots() { UASSERT_NOT_IMPLEMENTED; } diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h index 1246adf7e93..e39955ea365 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.h +++ b/src/mongo/embedded/replication_coordinator_embedded.h @@ -247,6 +247,8 @@ public: size_t getNumUncommittedSnapshots() override; + virtual void createWMajorityWriteAvailabilityDateWaiter(repl::OpTime opTime) override; + Status stepUpIfEligible(bool skipDryRun) override; Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override; |