author    | Dianna Hohensee <dianna.hohensee@mongodb.com> | 2019-11-05 19:58:36 +0000
committer | evergreen <evergreen@mongodb.com>             | 2019-11-05 19:58:36 +0000
commit    | ec44fef95a3e7a6620df1ff67796b9d4566aeecb (patch)
tree      | 7fdc6b1251eeaec380479a3c09081da835f9f182
parent    | 9561ea73bc0004fc1835430f9789546484c1e7e7 (diff)
download  | mongo-ec44fef95a3e7a6620df1ff67796b9d4566aeecb.tar.gz
SERVER-41391 clear the oplogTruncateAfterPoint timestamp on stepdown after there are no more active writes
11 files changed, 83 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_consistency_markers.h b/src/mongo/db/repl/replication_consistency_markers.h
index 894aec89f66..c59abdac678 100644
--- a/src/mongo/db/repl/replication_consistency_markers.h
+++ b/src/mongo/db/repl/replication_consistency_markers.h
@@ -144,6 +144,14 @@ public:
     // -------- Oplog Truncate After Point ----------
 
     /**
+     * Ensures that the fast-count counter for the oplogTruncateAfterPoint collection is properly
+     * set. An unclean shutdown can result in a miscount, if the persisted size store is not updated
+     * before the crash. Rollback usually handles this for user collections, but local, unreplicated
+     * collections are not adjusted.
+     */
+    virtual void ensureFastCountOnOplogTruncateAfterPoint(OperationContext* opCtx) = 0;
+
+    /**
      * The oplog truncate after point is set to the beginning of a batch of oplog entries before
      * the oplog entries are written into the oplog, and reset before we begin applying the batch.
      * On startup all oplog entries with a value >= the oplog truncate after point should be
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.cpp b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
index 406ad96c9fd..d45aa9c492f 100644
--- a/src/mongo/db/repl/replication_consistency_markers_impl.cpp
+++ b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
@@ -308,6 +308,40 @@ ReplicationConsistencyMarkersImpl::_getOplogTruncateAfterPointDocument(
     return oplogTruncateAfterPoint;
 }
 
+void ReplicationConsistencyMarkersImpl::ensureFastCountOnOplogTruncateAfterPoint(
+    OperationContext* opCtx) {
+    LOG(3) << "Updating cached fast-count on collection " << _oplogTruncateAfterPointNss
+           << " in case an unclean shutdown caused it to become incorrect.";
+
+    auto result = _storageInterface->findSingleton(opCtx, _oplogTruncateAfterPointNss);
+
+    if (result.getStatus() == ErrorCodes::NamespaceNotFound) {
+        return;
+    }
+
+    if (result.getStatus() == ErrorCodes::CollectionIsEmpty) {
+        // The count is updated before successful commit of a write, so unclean shutdown can leave
+        // the value incorrectly set to one.
+        invariant(
+            _storageInterface->setCollectionCount(opCtx, _oplogTruncateAfterPointNss, 0).isOK());
+        return;
+    }
+
+    if (result.getStatus() == ErrorCodes::TooManyMatchingDocuments) {
+        fassert(51265,
+                {result.getStatus().code(),
+                 str::stream() << "More than one document was found in the '"
+                               << kDefaultOplogTruncateAfterPointNamespace
+                               << "' collection. Users should not write to this collection. Please "
+                                  "delete the excess documents"});
+    }
+    fassert(51266, result.getStatus());
+
+    // We can safely set a count of one. We know that we only ever write one document, and the
+    // success of findSingleton above confirms only one document exists in the collection.
+    invariant(_storageInterface->setCollectionCount(opCtx, _oplogTruncateAfterPointNss, 1).isOK());
+}
+
 void ReplicationConsistencyMarkersImpl::_upsertOplogTruncateAfterPointDocument(
     OperationContext* opCtx, const BSONObj& updateSpec) {
     fassert(40512,
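The repair logic in the hunk above reduces to a small decision over the outcome of the singleton lookup. The following standalone sketch models only that decision; the SingletonLookup enum and repairedFastCount helper are illustrative stand-ins and not mongod code, whereas the real function branches on the StatusWith result of StorageInterface::findSingleton and calls setCollectionCount as shown in the diff.

```cpp
#include <cassert>
#include <iostream>
#include <optional>
#include <stdexcept>

// Hypothetical, simplified stand-ins for the outcomes the real singleton
// lookup can produce. None of this is mongod code.
enum class SingletonLookup {
    kNamespaceNotFound,        // collection does not exist yet
    kCollectionIsEmpty,        // collection exists but holds no document
    kFoundExactlyOne,          // the expected single document is present
    kTooManyMatchingDocuments  // extra documents were written into it
};

// Returns the fast-count value the collection should be repaired to, or
// std::nullopt when no repair is needed. Mirrors the branch structure of
// ensureFastCountOnOplogTruncateAfterPoint in the hunk above.
std::optional<long long> repairedFastCount(SingletonLookup lookup) {
    switch (lookup) {
        case SingletonLookup::kNamespaceNotFound:
            return std::nullopt;  // nothing to fix; the collection is absent
        case SingletonLookup::kCollectionIsEmpty:
            return 0;  // unclean shutdown may have left the count at one
        case SingletonLookup::kFoundExactlyOne:
            return 1;  // exactly one document is ever written
        case SingletonLookup::kTooManyMatchingDocuments:
            // The real code fasserts here; a plain exception stands in for it.
            throw std::runtime_error(
                "more than one document in the oplogTruncateAfterPoint collection");
    }
    return std::nullopt;
}

int main() {
    assert(repairedFastCount(SingletonLookup::kCollectionIsEmpty) == 0);
    assert(repairedFastCount(SingletonLookup::kFoundExactlyOne) == 1);
    assert(!repairedFastCount(SingletonLookup::kNamespaceNotFound));
    std::cout << "fast-count repair sketch OK\n";
    return 0;
}
```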
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.h b/src/mongo/db/repl/replication_consistency_markers_impl.h
index cb3b8ecf6cc..4cb924eeea0 100644
--- a/src/mongo/db/repl/replication_consistency_markers_impl.h
+++ b/src/mongo/db/repl/replication_consistency_markers_impl.h
@@ -69,6 +69,7 @@ public:
     void setMinValid(OperationContext* opCtx, const OpTime& minValid) override;
     void setMinValidToAtLeast(OperationContext* opCtx, const OpTime& minValid) override;
 
+    void ensureFastCountOnOplogTruncateAfterPoint(OperationContext* opCtx) override;
     void setOplogTruncateAfterPoint(OperationContext* opCtx, const Timestamp& timestamp) override;
     Timestamp getOplogTruncateAfterPoint(OperationContext* opCtx) const override;
 
diff --git a/src/mongo/db/repl/replication_consistency_markers_mock.cpp b/src/mongo/db/repl/replication_consistency_markers_mock.cpp
index 5c698190445..0001bdc2616 100644
--- a/src/mongo/db/repl/replication_consistency_markers_mock.cpp
+++ b/src/mongo/db/repl/replication_consistency_markers_mock.cpp
@@ -80,6 +80,9 @@ void ReplicationConsistencyMarkersMock::setMinValidToAtLeast(OperationContext* o
     _minValid = std::max(_minValid, minValid);
 }
 
+void ReplicationConsistencyMarkersMock::ensureFastCountOnOplogTruncateAfterPoint(
+    OperationContext* opCtx) {}
+
 void ReplicationConsistencyMarkersMock::setOplogTruncateAfterPoint(OperationContext* opCtx,
                                                                    const Timestamp& timestamp) {
     stdx::lock_guard<Latch> lock(_minValidBoundariesMutex);
diff --git a/src/mongo/db/repl/replication_consistency_markers_mock.h b/src/mongo/db/repl/replication_consistency_markers_mock.h
index 3fe3c2670f5..eff8bf2961b 100644
--- a/src/mongo/db/repl/replication_consistency_markers_mock.h
+++ b/src/mongo/db/repl/replication_consistency_markers_mock.h
@@ -61,6 +61,7 @@ public:
    void setMinValid(OperationContext* opCtx, const OpTime& minValid) override;
     void setMinValidToAtLeast(OperationContext* opCtx, const OpTime& minValid) override;
 
+    void ensureFastCountOnOplogTruncateAfterPoint(OperationContext* opCtx) override;
     void setOplogTruncateAfterPoint(OperationContext* opCtx, const Timestamp& timestamp) override;
     Timestamp getOplogTruncateAfterPoint(OperationContext* opCtx) const override;
 
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index dd6f4e507ac..e16c35fb016 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -218,6 +218,12 @@ public:
     virtual void shardingOnStepDownHook() = 0;
 
     /**
+     * Clears oplog visibility state. All of the oplog is safely visible because there are no oplog
+     * writes during stepdown.
+     */
+    virtual void clearOplogVisibilityStateForStepDown() = 0;
+
+    /**
      * Notifies the bgsync and syncSourceFeedback threads to choose a new sync source.
      */
     virtual void signalApplierToChooseNewSyncSource() = 0;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 23611a79206..2d9712e5a11 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -698,6 +698,26 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
     }
 }
 
+void ReplicationCoordinatorExternalStateImpl::clearOplogVisibilityStateForStepDown() {
+    auto opCtx = cc().getOperationContext();
+    // Temporarily turn off flow control ticketing. Getting a ticket can stall on a ticket being
+    // available, which may have to wait for the ticket refresher to run, which in turn blocks on
+    // the repl _mutex to check whether we are primary or not: this is a deadlock because stepdown
+    // already holds the repl _mutex!
+    auto originalFlowControlSetting = opCtx->shouldParticipateInFlowControl();
+    ON_BLOCK_EXIT([&] { opCtx->setShouldParticipateInFlowControl(originalFlowControlSetting); });
+    opCtx->setShouldParticipateInFlowControl(false);
+
+    // We can clear the oplogTruncateAfterPoint because we know there are no concurrent user writes
+    // during stepdown and therefore presently no oplog holes.
+    //
+    // This value is updated periodically while in PRIMARY mode to protect against oplog holes on
+    // unclean shutdown. The value must then be cleared on stepdown because stepup expects the value
+    // to be unset. Batch application, in mode SECONDARY, also uses the value to protect against
+    // unclean shutdown, and will handle both setting AND unsetting the value.
+    _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp());
+}
+
 void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook(
     OperationContext* opCtx) {
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index fc52e77f280..4f25122898f 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -93,6 +93,7 @@ public:
     virtual HostAndPort getClientHostAndPort(const OperationContext* opCtx);
     virtual void closeConnections();
     virtual void shardingOnStepDownHook();
+    virtual void clearOplogVisibilityStateForStepDown() override;
     virtual void signalApplierToChooseNewSyncSource();
     virtual void stopProducer();
     virtual void startProducerIfStopped();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index 75bdac91439..053f4d460a8 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -228,6 +228,8 @@ void ReplicationCoordinatorExternalStateMock::closeConnections() {
 
 void ReplicationCoordinatorExternalStateMock::shardingOnStepDownHook() {}
 
+void ReplicationCoordinatorExternalStateMock::clearOplogVisibilityStateForStepDown() {}
+
 void ReplicationCoordinatorExternalStateMock::signalApplierToChooseNewSyncSource() {}
 
 void ReplicationCoordinatorExternalStateMock::stopProducer() {}
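The flow-control handling in clearOplogVisibilityStateForStepDown above follows a save/disable/restore pattern built on the server's ON_BLOCK_EXIT scope guard. The sketch below shows the same shape with a hand-rolled guard and a hypothetical FakeOpCtx flag; it is a simplified illustration of the pattern, not the server's implementation.

```cpp
#include <cassert>
#include <functional>
#include <utility>

// A plain scope guard standing in for mongo's ON_BLOCK_EXIT macro; it runs its
// callback when the enclosing scope unwinds, even on early return or throw.
class ScopeGuard {
public:
    explicit ScopeGuard(std::function<void()> onExit) : _onExit(std::move(onExit)) {}
    ~ScopeGuard() { _onExit(); }

private:
    std::function<void()> _onExit;
};

// Hypothetical stand-in for the operation context's flow-control flag.
struct FakeOpCtx {
    bool participatesInFlowControl = true;
};

// Mirrors the shape of clearOplogVisibilityStateForStepDown: disable the flag
// for the duration of the call and restore the caller's original setting on exit,
// so the work inside cannot block on flow-control ticketing.
void doStepDownWork(FakeOpCtx* opCtx) {
    const bool original = opCtx->participatesInFlowControl;
    ScopeGuard restore([&] { opCtx->participatesInFlowControl = original; });
    opCtx->participatesInFlowControl = false;

    // ... the actual stepdown work (clearing the truncate-after point) goes here ...
}

int main() {
    FakeOpCtx opCtx;
    doStepDownWork(&opCtx);
    assert(opCtx.participatesInFlowControl);  // setting restored after the call
    return 0;
}
```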
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index 0f1ed300583..afcc16ce995 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -84,6 +84,7 @@ public:
     virtual StatusWith<OpTimeAndWallTime> loadLastOpTimeAndWallTime(OperationContext* opCtx);
     virtual void closeConnections();
     virtual void shardingOnStepDownHook();
+    virtual void clearOplogVisibilityStateForStepDown() override;
     virtual void signalApplierToChooseNewSyncSource();
     virtual void stopProducer();
     virtual void startProducerIfStopped();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 3fe72a5d6f1..b391fa27cdf 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -417,6 +417,10 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(OperationContext* opCtx)
     // initial sync has completed, it also sees these collections.
     fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx));
 
+    // Ensure (update if needed) the in-memory count for the oplogTruncateAfterPoint collection
+    // matches the collection contents.
+    _replicationProcess->getConsistencyMarkers()->ensureFastCountOnOplogTruncateAfterPoint(opCtx);
+
     _replicationProcess->getConsistencyMarkers()->initializeMinValidDocument(opCtx);
 
     fassert(51240, _externalState->createLocalLastVoteCollection(opCtx));
@@ -2992,8 +2996,10 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction(
             _externalState->closeConnections();
         /* FALLTHROUGH */
         case kActionSteppedDown:
+            // This code must be safe to run on node rollback and node removal!
             _externalState->shardingOnStepDownHook();
             _externalState->stopNoopWriter();
+            _externalState->clearOplogVisibilityStateForStepDown();
             break;
         case kActionStartSingleNodeElection:
             // In protocol version 1, single node replset will run an election instead of
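Taken together, the comments in this commit describe a small lifecycle for the truncate-after-point value: it is set while oplog writes could leave holes, cleared once they cannot, and the new stepdown hook clears it because stepup expects it to be unset. The toy model below captures just that invariant under the assumption that timestamp 0 plays the role of the unset Timestamp() sentinel; the type and method names are illustrative and are not mongod APIs.

```cpp
#include <cassert>
#include <cstdint>

// A toy model of the truncate-after-point lifecycle described in the diff's
// comments. Timestamp 0 stands in for the "unset" Timestamp() sentinel.
struct TruncateAfterPointModel {
    std::uint64_t truncateAfterPoint = 0;  // 0 == unset

    // Batch application sets the value ahead of writing a batch of oplog
    // entries so that an unclean shutdown mid-batch can be detected.
    void beginOplogBatch(std::uint64_t firstTimestampInBatch) {
        truncateAfterPoint = firstTimestampInBatch;
    }

    // Once the batch is safely on disk, the marker is cleared again.
    void finishOplogBatch() {
        truncateAfterPoint = 0;
    }

    // The behavior this commit adds: stepdown runs after there are no more
    // active writes, so there can be no oplog holes and the marker is cleared,
    // because the stepup path expects to find it unset.
    void onStepDown() {
        truncateAfterPoint = 0;
    }
};

int main() {
    TruncateAfterPointModel model;
    model.beginOplogBatch(/*firstTimestampInBatch=*/100);
    // Even if the marker was set when stepdown begins...
    model.onStepDown();
    // ...afterwards it is guaranteed to be unset for the next stepup.
    assert(model.truncateAfterPoint == 0);
    return 0;
}
```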