diff options
author | Dianna Hohensee <dianna.hohensee@mongodb.com> | 2020-01-23 14:40:45 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-01-29 14:39:53 +0000 |
commit | 7442016d9fd45928370084da1899bc63240a3486 (patch) | |
tree | 85b01dcd00e9e0095b9906418753505a0f183049 /src | |
parent | a081c83758521e11a5f285541ea9f13928f3de0f (diff) | |
download | mongo-7442016d9fd45928370084da1899bc63240a3486.tar.gz |
SERVER-45721 truncation using the oplogTruncateAfterPoint is always non-inclusive of the oplogTruncateAfterPoint
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/op_observer_impl.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog_applier_impl.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_consistency_markers.idl | 2 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_recovery.cpp | 49 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_recovery.h | 5 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_recovery_test.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/rollback_impl.cpp | 39 | ||||
-rw-r--r-- | src/mongo/db/repl/rollback_impl.h | 7 | ||||
-rw-r--r-- | src/mongo/db/repl/rollback_impl_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h | 6 |
10 files changed, 55 insertions, 74 deletions
diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp index dcaff636784..b514d532f5e 100644 --- a/src/mongo/db/op_observer_impl.cpp +++ b/src/mongo/db/op_observer_impl.cpp @@ -27,6 +27,8 @@ * it in the license file. */ +#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication + #include "mongo/platform/basic.h" #include "mongo/db/op_observer_impl.h" @@ -64,6 +66,7 @@ #include "mongo/scripting/engine.h" #include "mongo/util/assert_util.h" #include "mongo/util/fail_point.h" +#include "mongo/util/log.h" namespace mongo { using repl::MutableOplogEntry; diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp index fdd6a7a18d0..8b7d7ba3c41 100644 --- a/src/mongo/db/repl/oplog_applier_impl.cpp +++ b/src/mongo/db/repl/oplog_applier_impl.cpp @@ -624,7 +624,8 @@ StatusWith<OpTime> OplogApplierImpl::_applyOplogBatch(OperationContext* opCtx, // Write batch of ops into oplog. if (!getOptions().skipWritesToOplog) { - _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, ops.front().getTimestamp()); + _consistencyMarkers->setOplogTruncateAfterPoint( + opCtx, _replCoord->getMyLastAppliedOpTime().getTimestamp()); scheduleWritesToOplog(opCtx, _storageInterface, _writerPool, ops); } diff --git a/src/mongo/db/repl/replication_consistency_markers.idl b/src/mongo/db/repl/replication_consistency_markers.idl index 6633e6a4fc8..df977970a30 100644 --- a/src/mongo/db/repl/replication_consistency_markers.idl +++ b/src/mongo/db/repl/replication_consistency_markers.idl @@ -66,7 +66,7 @@ structs: fields: oplogTruncateAfterPoint: type: timestamp - description: "The timestamp of the first oplog entry in a batch when we are writing oplog entries to the oplog after which the oplog may be inconsistent." + description: "The timestamp of an oplog entry after which the oplog may be inconsistent (have holes due to parallel writes)." _id: type: string description: "Always set to 'oplogTruncateAfterPoint' to easily retrieve it." diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp index 2882bc0ade0..08aecd19258 100644 --- a/src/mongo/db/repl/replication_recovery.cpp +++ b/src/mongo/db/repl/replication_recovery.cpp @@ -410,7 +410,9 @@ void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCt OpTime topOfOplog) { invariant(!stableTimestamp.isNull()); invariant(!topOfOplog.isNull()); + const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx); + log() << "Recovering from stable timestamp: " << stableTimestamp << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough << ", TruncateAfter: " << truncateAfterPoint << ")"; @@ -568,8 +570,7 @@ StatusWith<OpTime> ReplicationRecoveryImpl::_getTopOfOplog(OperationContext* opC } void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx, - Timestamp truncateTimestamp, - bool inclusive) { + Timestamp truncateTimestamp) { Timer timer; const NamespaceString oplogNss(NamespaceString::kRsOplogNamespace); AutoGetDb autoDb(opCtx, oplogNss.db(), MODE_IX); @@ -583,8 +584,15 @@ void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx, str::stream() << "Can't find " << NamespaceString::kRsOplogNamespace.ns())); } - // Scan through oplog in reverse, from latest entry to first, to find the truncateTimestamp. - RecordId oldestIDToDelete; // Non-null if there is something to delete. + // Truncate the oplog after (non-inclusive of) the truncateTimestamp. Scan through the oplog in + // reverse, from latest entry to first, to find an entry lte truncateTimestamp. Once such an + // entry is found, we will truncate inclusive of the previous entry found that is greater than + // the truncate point. If only one entry is found lte to the truncateTimestamp, then nothing is + // truncated: nothing was found greater than the truncateTimestamp. Note that the + // truncateTimestamp does not have to be an exact match to an oplog entry: any entries after + // that time will be truncated. + RecordId previousRecordId; + Timestamp topOfOplog; auto oplogRs = oplogCollection->getRecordStore(); auto oplogReverseCursor = oplogRs->getCursor(opCtx, /*forward=*/false); size_t count = 0; @@ -595,25 +603,32 @@ void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx, const auto tsElem = entry["ts"]; if (count == 1) { - if (tsElem.eoo()) + if (tsElem.eoo()) { LOG(2) << "Oplog tail entry: " << redact(entry); - else + } else { LOG(2) << "Oplog tail entry ts field: " << tsElem; + topOfOplog = tsElem.timestamp(); + } } - if (tsElem.timestamp() < truncateTimestamp) { + if (tsElem.timestamp() <= truncateTimestamp) { // If count == 1, that means that we have nothing to delete because everything in the - // oplog is < truncateTimestamp. + // oplog is <= truncateTimestamp. if (count != 1) { - invariant(!oldestIDToDelete.isNull()); - oplogCollection->cappedTruncateAfter(opCtx, oldestIDToDelete, inclusive); + log() << "Truncating oplog from [" << Timestamp(previousRecordId.repr()) << " to " + << topOfOplog << "]. Truncate after point is " << truncateTimestamp; + invariant(!previousRecordId.isNull()); + oplogCollection->cappedTruncateAfter(opCtx, previousRecordId, /*inclusive*/ true); + } else { + log() << "There is no oplog after " << truncateTimestamp + << " to truncate. The top of the oplog is " << topOfOplog; } log() << "Replication recovery oplog truncation finished in: " << timer.millis() << "ms"; return; } - oldestIDToDelete = id; + previousRecordId = id; } severe() << "Reached end of oplog looking for oplog entry before " << truncateTimestamp.toBSON() @@ -630,10 +645,6 @@ void ReplicationRecoveryImpl::_truncateOplogIfNeededAndThenClearOplogTruncateAft return; } - // We want to delete oplog inclusive of the 'oplogTruncateAfterPoint', but not inclusive of the - // stable timestamp if we end up using that value instead. - bool inclusive = true; - if (stableTimestamp && !stableTimestamp->isNull() && truncatePoint <= stableTimestamp) { AutoGetCollectionForRead oplog(opCtx, NamespaceString::kRsOplogNamespace); invariant(oplog.getCollection()); @@ -642,17 +653,15 @@ void ReplicationRecoveryImpl::_truncateOplogIfNeededAndThenClearOplogTruncateAft << ") is equal to or earlier than the stable timestamp (" << stableTimestamp.get() << "), so truncating after the stable timestamp instead"; - inclusive = false; truncatePoint = stableTimestamp.get(); } - log() << "Removing unapplied oplog entries starting at: " << truncatePoint.toBSON() << ", " - << (inclusive ? "inclusive" : "not inclusive"); - _truncateOplogTo(opCtx, truncatePoint, inclusive); + log() << "Removing unapplied oplog entries starting after: " << truncatePoint.toBSON(); + _truncateOplogTo(opCtx, truncatePoint); // Clear the oplogTruncateAfterPoint now that we have removed any holes that might exist in the // oplog -- and so that we do not truncate future entries erroneously. - _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {}); + _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, Timestamp()); opCtx->recoveryUnit()->waitUntilDurable(opCtx); } diff --git a/src/mongo/db/repl/replication_recovery.h b/src/mongo/db/repl/replication_recovery.h index 0a891420b54..cef637b1ee8 100644 --- a/src/mongo/db/repl/replication_recovery.h +++ b/src/mongo/db/repl/replication_recovery.h @@ -131,10 +131,9 @@ private: StatusWith<OpTime> _getTopOfOplog(OperationContext* opCtx) const; /** - * Truncates the oplog after the "truncateTimestamp" entry. Includes the "truncateTimestamp" - * entry if "inclusive" is set to true. + * Truncates the oplog after the "truncateTimestamp" entry. */ - void _truncateOplogTo(OperationContext* opCtx, Timestamp truncateTimestamp, bool inclusive); + void _truncateOplogTo(OperationContext* opCtx, Timestamp truncateTimestamp); /** * Uses the oplogTruncateAfterPoint, accessed via '_consistencyMarkers', to decide whether to diff --git a/src/mongo/db/repl/replication_recovery_test.cpp b/src/mongo/db/repl/replication_recovery_test.cpp index 4831a612b7e..c13132a1f32 100644 --- a/src/mongo/db/repl/replication_recovery_test.cpp +++ b/src/mongo/db/repl/replication_recovery_test.cpp @@ -27,6 +27,8 @@ * it in the license file. */ +#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication + #include "mongo/platform/basic.h" #include <memory> @@ -53,6 +55,7 @@ #include "mongo/unittest/death_test.h" #include "mongo/unittest/unittest.h" #include "mongo/util/assert_util.h" +#include "mongo/util/log.h" #include "mongo/util/str.h" namespace { @@ -467,7 +470,7 @@ TEST_F(ReplicationRecoveryTest, RecoveryTruncatesOplogAtOplogTruncateAfterPoint) ReplicationRecoveryImpl recovery(getStorageInterface(), getConsistencyMarkers()); auto opCtx = getOperationContext(); - getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(4, 4)); + getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(3, 3)); getConsistencyMarkers()->setAppliedThrough(opCtx, OpTime(Timestamp(3, 3), 1)); _setUpOplog(opCtx, getStorageInterface(), {1, 2, 3, 4, 5}); @@ -731,7 +734,7 @@ TEST_F(ReplicationRecoveryTest, RecoveryAppliesDocumentsWhenAppliedThroughIsBehi ReplicationRecoveryImpl recovery(getStorageInterface(), getConsistencyMarkers()); auto opCtx = getOperationContext(); - getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(4, 4)); + getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(3, 3)); getConsistencyMarkers()->setAppliedThrough(opCtx, OpTime(Timestamp(1, 1), 1)); _setUpOplog(opCtx, getStorageInterface(), {1, 2, 3, 4, 5}); @@ -748,7 +751,7 @@ void ReplicationRecoveryTest::testRecoveryAppliesDocumentsWithNoAppliedThroughAf ReplicationRecoveryImpl recovery(getStorageInterface(), getConsistencyMarkers()); auto opCtx = getOperationContext(); - getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(4, 4)); + getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(3, 3)); _setUpOplog(opCtx, getStorageInterface(), {1, 2, 3, 4, 5}); auto startingTS = Timestamp(1, 1); diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp index 4bfdb8dd2d8..b61005e8878 100644 --- a/src/mongo/db/repl/rollback_impl.cpp +++ b/src/mongo/db/repl/rollback_impl.cpp @@ -502,11 +502,12 @@ void RollbackImpl::_runPhaseFromAbortToReconstructPreparedTxns( << " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName] << " delete operations."; - // During replication recovery, we truncate all oplog entries with timestamps greater than - // or equal to the oplog truncate after point. As a result, we must find the oplog entry - // after the common point so we do not truncate the common point itself. If we entered - // rollback, we are guaranteed to have at least one oplog entry after the common point. - Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPoint); + // During replication recovery, we truncate all oplog entries with timestamps greater than the + // oplog truncate after point. If we entered rollback, we are guaranteed to have at least one + // oplog entry after the common point. + log() << "Marking to truncate all oplog entries with timestamps greater than " + << commonPoint.getOpTime().getTimestamp(); + Timestamp truncatePoint = commonPoint.getOpTime().getTimestamp(); // Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so // that the replication recovery logic knows where to truncate the oplog. We save this value @@ -1005,34 +1006,6 @@ Status RollbackImpl::_checkAgainstTimeLimit( return Status::OK(); } -Timestamp RollbackImpl::_findTruncateTimestamp( - OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) const { - - AutoGetCollectionForRead oplog(opCtx, NamespaceString::kRsOplogNamespace); - invariant(oplog.getCollection()); - auto oplogCursor = oplog.getCollection()->getCursor(opCtx, /*forward=*/true); - - auto commonPointRecord = oplogCursor->seekExact(commonPoint.getRecordId()); - auto commonPointOpTime = commonPoint.getOpTime(); - // Check that we've found the right document for the common point. - invariant(commonPointRecord); - auto commonPointTime = OpTime::parseFromOplogEntry(commonPointRecord->data.releaseToBson()); - invariant(commonPointTime.getStatus()); - invariant(commonPointTime.getValue() == commonPointOpTime, - str::stream() << "Common point: " << commonPointOpTime.toString() - << ", record found: " << commonPointTime.getValue().toString()); - - // Get the next document, which will be the first document to truncate. - auto truncatePointRecord = oplogCursor->next(); - invariant(truncatePointRecord); - auto truncatePointTime = OpTime::parseFromOplogEntry(truncatePointRecord->data.releaseToBson()); - invariant(truncatePointTime.getStatus()); - - log() << "Marking to truncate all oplog entries with timestamps greater than or equal to " - << truncatePointTime.getValue(); - return truncatePointTime.getValue().getTimestamp(); -} - boost::optional<BSONObj> RollbackImpl::_findDocumentById(OperationContext* opCtx, UUID uuid, NamespaceString nss, diff --git a/src/mongo/db/repl/rollback_impl.h b/src/mongo/db/repl/rollback_impl.h index d2a303c2e2c..450d799a3d5 100644 --- a/src/mongo/db/repl/rollback_impl.h +++ b/src/mongo/db/repl/rollback_impl.h @@ -342,13 +342,6 @@ private: Status _checkAgainstTimeLimit(RollBackLocalOperations::RollbackCommonPoint commonPoint); /** - * Finds the timestamp of the record after the common point to put into the oplog truncate - * after point. - */ - Timestamp _findTruncateTimestamp( - OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) const; - - /** * Kills all user operations currently being performed. Since this node is a secondary, these * operations are all reads. */ diff --git a/src/mongo/db/repl/rollback_impl_test.cpp b/src/mongo/db/repl/rollback_impl_test.cpp index 39073633620..e33b10f60cc 100644 --- a/src/mongo/db/repl/rollback_impl_test.cpp +++ b/src/mongo/db/repl/rollback_impl_test.cpp @@ -576,7 +576,7 @@ TEST_F(RollbackImplTest, RollbackPersistsDocumentAfterCommonPointToOplogTruncate auto coll = _initializeCollection(_opCtx.get(), UUID::gen(), nss); ASSERT_OK(_rollback->runRollback(_opCtx.get())); - ASSERT_EQUALS(_truncatePoint, Timestamp(3, 3)); + ASSERT_EQUALS(_truncatePoint, Timestamp(2, 2)); } TEST_F(RollbackImplTest, RollbackImplResetsOptimesFromOplogAfterRollback) { @@ -839,7 +839,7 @@ TEST_F(RollbackImplTest, truncateAfterPoint = _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get()); ASSERT_EQUALS(Timestamp(), truncateAfterPoint); - ASSERT_EQUALS(_truncatePoint, Timestamp(2, 2)); + ASSERT_EQUALS(_truncatePoint, Timestamp(1, 1)); ASSERT_EQ(_storageInterface->getFinalCollectionCount(uuid), 1); } @@ -863,7 +863,7 @@ TEST_F(RollbackImplTest, RollbackSucceedsAndTruncatesOplog) { _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get()); ASSERT_EQUALS(Timestamp(), truncateAfterPoint); _assertDocsInOplog(_opCtx.get(), {1}); - ASSERT_EQUALS(_truncatePoint, Timestamp(2, 2)); + ASSERT_EQUALS(_truncatePoint, Timestamp(1, 1)); } DEATH_TEST_F(RollbackImplTest, diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h index a7f67537583..03fa957a678 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h @@ -548,8 +548,8 @@ private: extern FailPoint WTWriteConflictException; extern FailPoint WTWriteConflictExceptionForReads; -// Prevents oplog writes from being considered durable on the primary. Once activated, new writes -// will not be considered durable until deactivated. It is unspecified whether writes that commit -// before activation will become visible while active. +// Prevents oplog writes from becoming visible asynchronously. Once activated, new writes will not +// be seen by regular readers until deactivated. It is unspecified whether writes that commit before +// activation will become visible while active. extern FailPoint WTPausePrimaryOplogDurabilityLoop; } // namespace mongo |