summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Dianna Hohensee <dianna.hohensee@mongodb.com> 2019-11-05 19:58:36 +0000
committer: evergreen <evergreen@mongodb.com> 2019-11-05 19:58:36 +0000
commit: ec44fef95a3e7a6620df1ff67796b9d4566aeecb (patch)
tree: 7fdc6b1251eeaec380479a3c09081da835f9f182
parent: 9561ea73bc0004fc1835430f9789546484c1e7e7 (diff)
download: mongo-ec44fef95a3e7a6620df1ff67796b9d4566aeecb.tar.gz
SERVER-41391 clear the oplogTruncateAfterPoint timestamp on stepdown after there are no more active writes
-rw-r--r--src/mongo/db/repl/replication_consistency_markers.h8
-rw-r--r--src/mongo/db/repl/replication_consistency_markers_impl.cpp34
-rw-r--r--src/mongo/db/repl/replication_consistency_markers_impl.h1
-rw-r--r--src/mongo/db/repl/replication_consistency_markers_mock.cpp3
-rw-r--r--src/mongo/db/repl/replication_consistency_markers_mock.h1
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state.h6
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_impl.cpp20
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_impl.h1
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_mock.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator_external_state_mock.h1
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp6
11 files changed, 83 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_consistency_markers.h b/src/mongo/db/repl/replication_consistency_markers.h
index 894aec89f66..c59abdac678 100644
--- a/src/mongo/db/repl/replication_consistency_markers.h
+++ b/src/mongo/db/repl/replication_consistency_markers.h
@@ -144,6 +144,14 @@ public:
// -------- Oplog Truncate After Point ----------
/**
+ * Ensures that the fast-count counter for the oplogTruncateAfterPoint collection is properly
+ * set. An unclean shutdown can result in a miscount, if the persisted size store is not updated
+ * before the crash. Rollback usually handles this for user collections, but local, unreplicated
+ * collections are not adjusted.
+ */
+ virtual void ensureFastCountOnOplogTruncateAfterPoint(OperationContext* opCtx) = 0;
+
+ /**
* The oplog truncate after point is set to the beginning of a batch of oplog entries before
* the oplog entries are written into the oplog, and reset before we begin applying the batch.
* On startup all oplog entries with a value >= the oplog truncate after point should be
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.cpp b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
index 406ad96c9fd..d45aa9c492f 100644
--- a/src/mongo/db/repl/replication_consistency_markers_impl.cpp
+++ b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
@@ -308,6 +308,40 @@ ReplicationConsistencyMarkersImpl::_getOplogTruncateAfterPointDocument(
return oplogTruncateAfterPoint;
}
+void ReplicationConsistencyMarkersImpl::ensureFastCountOnOplogTruncateAfterPoint(
+ OperationContext* opCtx) {
+ LOG(3) << "Updating cached fast-count on collection " << _oplogTruncateAfterPointNss
+ << " in case an unclean shutdown caused it to become incorrect.";
+
+ auto result = _storageInterface->findSingleton(opCtx, _oplogTruncateAfterPointNss);
+
+ if (result.getStatus() == ErrorCodes::NamespaceNotFound) {
+ return;
+ }
+
+ if (result.getStatus() == ErrorCodes::CollectionIsEmpty) {
+ // The count is updated before successful commit of a write, so unclean shutdown can leave
+ // the value incorrectly set to one.
+ invariant(
+ _storageInterface->setCollectionCount(opCtx, _oplogTruncateAfterPointNss, 0).isOK());
+ return;
+ }
+
+ if (result.getStatus() == ErrorCodes::TooManyMatchingDocuments) {
+ fassert(51265,
+ {result.getStatus().code(),
+ str::stream() << "More than one document was found in the '"
+ << kDefaultOplogTruncateAfterPointNamespace
+ << "' collection. Users should not write to this collection. Please "
+ "delete the excess documents"});
+ }
+ fassert(51266, result.getStatus());
+
+ // We can safely set a count of one. We know that we only ever write one document, and the
+ // success of findSingleton above confirms only one document exists in the collection.
+ invariant(_storageInterface->setCollectionCount(opCtx, _oplogTruncateAfterPointNss, 1).isOK());
+}
+
void ReplicationConsistencyMarkersImpl::_upsertOplogTruncateAfterPointDocument(
OperationContext* opCtx, const BSONObj& updateSpec) {
fassert(40512,
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.h b/src/mongo/db/repl/replication_consistency_markers_impl.h
index cb3b8ecf6cc..4cb924eeea0 100644
--- a/src/mongo/db/repl/replication_consistency_markers_impl.h
+++ b/src/mongo/db/repl/replication_consistency_markers_impl.h
@@ -69,6 +69,7 @@ public:
void setMinValid(OperationContext* opCtx, const OpTime& minValid) override;
void setMinValidToAtLeast(OperationContext* opCtx, const OpTime& minValid) override;
+ void ensureFastCountOnOplogTruncateAfterPoint(OperationContext* opCtx) override;
void setOplogTruncateAfterPoint(OperationContext* opCtx, const Timestamp& timestamp) override;
Timestamp getOplogTruncateAfterPoint(OperationContext* opCtx) const override;
diff --git a/src/mongo/db/repl/replication_consistency_markers_mock.cpp b/src/mongo/db/repl/replication_consistency_markers_mock.cpp
index 5c698190445..0001bdc2616 100644
--- a/src/mongo/db/repl/replication_consistency_markers_mock.cpp
+++ b/src/mongo/db/repl/replication_consistency_markers_mock.cpp
@@ -80,6 +80,9 @@ void ReplicationConsistencyMarkersMock::setMinValidToAtLeast(OperationContext* o
_minValid = std::max(_minValid, minValid);
}
+void ReplicationConsistencyMarkersMock::ensureFastCountOnOplogTruncateAfterPoint(
+ OperationContext* opCtx) {}
+
void ReplicationConsistencyMarkersMock::setOplogTruncateAfterPoint(OperationContext* opCtx,
const Timestamp& timestamp) {
stdx::lock_guard<Latch> lock(_minValidBoundariesMutex);
diff --git a/src/mongo/db/repl/replication_consistency_markers_mock.h b/src/mongo/db/repl/replication_consistency_markers_mock.h
index 3fe3c2670f5..eff8bf2961b 100644
--- a/src/mongo/db/repl/replication_consistency_markers_mock.h
+++ b/src/mongo/db/repl/replication_consistency_markers_mock.h
@@ -61,6 +61,7 @@ public:
void setMinValid(OperationContext* opCtx, const OpTime& minValid) override;
void setMinValidToAtLeast(OperationContext* opCtx, const OpTime& minValid) override;
+ void ensureFastCountOnOplogTruncateAfterPoint(OperationContext* opCtx) override;
void setOplogTruncateAfterPoint(OperationContext* opCtx, const Timestamp& timestamp) override;
Timestamp getOplogTruncateAfterPoint(OperationContext* opCtx) const override;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index dd6f4e507ac..e16c35fb016 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -218,6 +218,12 @@ public:
virtual void shardingOnStepDownHook() = 0;
/**
+ * Clears oplog visibility state. All of the oplog is safely visible because there are no oplog
+ * writes during stepdown.
+ */
+ virtual void clearOplogVisibilityStateForStepDown() = 0;
+
+ /**
* Notifies the bgsync and syncSourceFeedback threads to choose a new sync source.
*/
virtual void signalApplierToChooseNewSyncSource() = 0;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 23611a79206..2d9712e5a11 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -698,6 +698,26 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
}
}
+void ReplicationCoordinatorExternalStateImpl::clearOplogVisibilityStateForStepDown() {
+ auto opCtx = cc().getOperationContext();
+ // Temporarily turn off flow control ticketing. Getting a ticket can stall on a ticket being
+ // available, which may have to wait for the ticket refresher to run, which in turn blocks on
+ // the repl _mutex to check whether we are primary or not: this is a deadlock because stepdown
+ // already holds the repl _mutex!
+ auto originalFlowControlSetting = opCtx->shouldParticipateInFlowControl();
+ ON_BLOCK_EXIT([&] { opCtx->setShouldParticipateInFlowControl(originalFlowControlSetting); });
+ opCtx->setShouldParticipateInFlowControl(false);
+
+ // We can clear the oplogTruncateAfterPoint because we know there are no concurrent user writes
+ // during stepdown and therefore presently no oplog holes.
+ //
+ // This value is updated periodically while in PRIMARY mode to protect against oplog holes on
+ // unclean shutdown. The value must then be cleared on stepdown because stepup expects the value
+ // to be unset. Batch application, in mode SECONDARY, also uses the value to protect against
+ // unclean shutdown, and will handle both setting AND unsetting the value.
+ _replicationProcess->getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp());
+}
+
void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook(
OperationContext* opCtx) {
if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index fc52e77f280..4f25122898f 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -93,6 +93,7 @@ public:
virtual HostAndPort getClientHostAndPort(const OperationContext* opCtx);
virtual void closeConnections();
virtual void shardingOnStepDownHook();
+ virtual void clearOplogVisibilityStateForStepDown() override;
virtual void signalApplierToChooseNewSyncSource();
virtual void stopProducer();
virtual void startProducerIfStopped();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index 75bdac91439..053f4d460a8 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -228,6 +228,8 @@ void ReplicationCoordinatorExternalStateMock::closeConnections() {
void ReplicationCoordinatorExternalStateMock::shardingOnStepDownHook() {}
+void ReplicationCoordinatorExternalStateMock::clearOplogVisibilityStateForStepDown() {}
+
void ReplicationCoordinatorExternalStateMock::signalApplierToChooseNewSyncSource() {}
void ReplicationCoordinatorExternalStateMock::stopProducer() {}
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index 0f1ed300583..afcc16ce995 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -84,6 +84,7 @@ public:
virtual StatusWith<OpTimeAndWallTime> loadLastOpTimeAndWallTime(OperationContext* opCtx);
virtual void closeConnections();
virtual void shardingOnStepDownHook();
+ virtual void clearOplogVisibilityStateForStepDown() override;
virtual void signalApplierToChooseNewSyncSource();
virtual void stopProducer();
virtual void startProducerIfStopped();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 3fe72a5d6f1..b391fa27cdf 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -417,6 +417,10 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(OperationContext* opCtx)
// initial sync has completed, it also sees these collections.
fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx));
+ // Ensure (update if needed) the in-memory count for the oplogTruncateAfterPoint collection
+ // matches the collection contents.
+ _replicationProcess->getConsistencyMarkers()->ensureFastCountOnOplogTruncateAfterPoint(opCtx);
+
_replicationProcess->getConsistencyMarkers()->initializeMinValidDocument(opCtx);
fassert(51240, _externalState->createLocalLastVoteCollection(opCtx));
@@ -2992,8 +2996,10 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction(
_externalState->closeConnections();
/* FALLTHROUGH */
case kActionSteppedDown:
+ // This code must be safe to run on node rollback and node removal!
_externalState->shardingOnStepDownHook();
_externalState->stopNoopWriter();
+ _externalState->clearOplogVisibilityStateForStepDown();
break;
case kActionStartSingleNodeElection:
// In protocol version 1, single node replset will run an election instead of