summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDianna Hohensee <dianna.hohensee@mongodb.com>2020-01-23 14:40:45 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-01-29 14:39:53 +0000
commit7442016d9fd45928370084da1899bc63240a3486 (patch)
tree85b01dcd00e9e0095b9906418753505a0f183049 /src
parenta081c83758521e11a5f285541ea9f13928f3de0f (diff)
downloadmongo-7442016d9fd45928370084da1899bc63240a3486.tar.gz
SERVER-45721 truncation using the oplogTruncateAfterPoint is always non-inclusive of the oplogTruncateAfterPoint
Diffstat (limited to 'src')
-rw-r--r--src/mongo/db/op_observer_impl.cpp3
-rw-r--r--src/mongo/db/repl/oplog_applier_impl.cpp3
-rw-r--r--src/mongo/db/repl/replication_consistency_markers.idl2
-rw-r--r--src/mongo/db/repl/replication_recovery.cpp49
-rw-r--r--src/mongo/db/repl/replication_recovery.h5
-rw-r--r--src/mongo/db/repl/replication_recovery_test.cpp9
-rw-r--r--src/mongo/db/repl/rollback_impl.cpp39
-rw-r--r--src/mongo/db/repl/rollback_impl.h7
-rw-r--r--src/mongo/db/repl/rollback_impl_test.cpp6
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h6
10 files changed, 55 insertions, 74 deletions
diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp
index dcaff636784..b514d532f5e 100644
--- a/src/mongo/db/op_observer_impl.cpp
+++ b/src/mongo/db/op_observer_impl.cpp
@@ -27,6 +27,8 @@
* it in the license file.
*/
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+
#include "mongo/platform/basic.h"
#include "mongo/db/op_observer_impl.h"
@@ -64,6 +66,7 @@
#include "mongo/scripting/engine.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/fail_point.h"
+#include "mongo/util/log.h"
namespace mongo {
using repl::MutableOplogEntry;
diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp
index fdd6a7a18d0..8b7d7ba3c41 100644
--- a/src/mongo/db/repl/oplog_applier_impl.cpp
+++ b/src/mongo/db/repl/oplog_applier_impl.cpp
@@ -624,7 +624,8 @@ StatusWith<OpTime> OplogApplierImpl::_applyOplogBatch(OperationContext* opCtx,
// Write batch of ops into oplog.
if (!getOptions().skipWritesToOplog) {
- _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, ops.front().getTimestamp());
+ _consistencyMarkers->setOplogTruncateAfterPoint(
+ opCtx, _replCoord->getMyLastAppliedOpTime().getTimestamp());
scheduleWritesToOplog(opCtx, _storageInterface, _writerPool, ops);
}
diff --git a/src/mongo/db/repl/replication_consistency_markers.idl b/src/mongo/db/repl/replication_consistency_markers.idl
index 6633e6a4fc8..df977970a30 100644
--- a/src/mongo/db/repl/replication_consistency_markers.idl
+++ b/src/mongo/db/repl/replication_consistency_markers.idl
@@ -66,7 +66,7 @@ structs:
fields:
oplogTruncateAfterPoint:
type: timestamp
- description: "The timestamp of the first oplog entry in a batch when we are writing oplog entries to the oplog after which the oplog may be inconsistent."
+ description: "The timestamp of an oplog entry after which the oplog may be inconsistent (have holes due to parallel writes)."
_id:
type: string
description: "Always set to 'oplogTruncateAfterPoint' to easily retrieve it."
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 2882bc0ade0..08aecd19258 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -410,7 +410,9 @@ void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCt
OpTime topOfOplog) {
invariant(!stableTimestamp.isNull());
invariant(!topOfOplog.isNull());
+
const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
+
log() << "Recovering from stable timestamp: " << stableTimestamp
<< " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough
<< ", TruncateAfter: " << truncateAfterPoint << ")";
@@ -568,8 +570,7 @@ StatusWith<OpTime> ReplicationRecoveryImpl::_getTopOfOplog(OperationContext* opC
}
void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
- Timestamp truncateTimestamp,
- bool inclusive) {
+ Timestamp truncateTimestamp) {
Timer timer;
const NamespaceString oplogNss(NamespaceString::kRsOplogNamespace);
AutoGetDb autoDb(opCtx, oplogNss.db(), MODE_IX);
@@ -583,8 +584,15 @@ void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
str::stream() << "Can't find " << NamespaceString::kRsOplogNamespace.ns()));
}
- // Scan through oplog in reverse, from latest entry to first, to find the truncateTimestamp.
- RecordId oldestIDToDelete; // Non-null if there is something to delete.
+ // Truncate the oplog after (non-inclusive of) the truncateTimestamp. Scan through the oplog in
+ // reverse, from latest entry to first, to find an entry lte truncateTimestamp. Once such an
+ // entry is found, we will truncate inclusive of the previous entry found that is greater than
+ // the truncate point. If only one entry is found lte to the truncateTimestamp, then nothing is
+ // truncated: nothing was found greater than the truncateTimestamp. Note that the
+ // truncateTimestamp does not have to be an exact match to an oplog entry: any entries after
+ // that time will be truncated.
+ RecordId previousRecordId;
+ Timestamp topOfOplog;
auto oplogRs = oplogCollection->getRecordStore();
auto oplogReverseCursor = oplogRs->getCursor(opCtx, /*forward=*/false);
size_t count = 0;
@@ -595,25 +603,32 @@ void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
const auto tsElem = entry["ts"];
if (count == 1) {
- if (tsElem.eoo())
+ if (tsElem.eoo()) {
LOG(2) << "Oplog tail entry: " << redact(entry);
- else
+ } else {
LOG(2) << "Oplog tail entry ts field: " << tsElem;
+ topOfOplog = tsElem.timestamp();
+ }
}
- if (tsElem.timestamp() < truncateTimestamp) {
+ if (tsElem.timestamp() <= truncateTimestamp) {
// If count == 1, that means that we have nothing to delete because everything in the
- // oplog is < truncateTimestamp.
+ // oplog is <= truncateTimestamp.
if (count != 1) {
- invariant(!oldestIDToDelete.isNull());
- oplogCollection->cappedTruncateAfter(opCtx, oldestIDToDelete, inclusive);
+ log() << "Truncating oplog from [" << Timestamp(previousRecordId.repr()) << " to "
+ << topOfOplog << "]. Truncate after point is " << truncateTimestamp;
+ invariant(!previousRecordId.isNull());
+ oplogCollection->cappedTruncateAfter(opCtx, previousRecordId, /*inclusive*/ true);
+ } else {
+ log() << "There is no oplog after " << truncateTimestamp
+ << " to truncate. The top of the oplog is " << topOfOplog;
}
log() << "Replication recovery oplog truncation finished in: " << timer.millis()
<< "ms";
return;
}
- oldestIDToDelete = id;
+ previousRecordId = id;
}
severe() << "Reached end of oplog looking for oplog entry before " << truncateTimestamp.toBSON()
@@ -630,10 +645,6 @@ void ReplicationRecoveryImpl::_truncateOplogIfNeededAndThenClearOplogTruncateAft
return;
}
- // We want to delete oplog inclusive of the 'oplogTruncateAfterPoint', but not inclusive of the
- // stable timestamp if we end up using that value instead.
- bool inclusive = true;
-
if (stableTimestamp && !stableTimestamp->isNull() && truncatePoint <= stableTimestamp) {
AutoGetCollectionForRead oplog(opCtx, NamespaceString::kRsOplogNamespace);
invariant(oplog.getCollection());
@@ -642,17 +653,15 @@ void ReplicationRecoveryImpl::_truncateOplogIfNeededAndThenClearOplogTruncateAft
<< ") is equal to or earlier than the stable timestamp (" << stableTimestamp.get()
<< "), so truncating after the stable timestamp instead";
- inclusive = false;
truncatePoint = stableTimestamp.get();
}
- log() << "Removing unapplied oplog entries starting at: " << truncatePoint.toBSON() << ", "
- << (inclusive ? "inclusive" : "not inclusive");
- _truncateOplogTo(opCtx, truncatePoint, inclusive);
+ log() << "Removing unapplied oplog entries starting after: " << truncatePoint.toBSON();
+ _truncateOplogTo(opCtx, truncatePoint);
// Clear the oplogTruncateAfterPoint now that we have removed any holes that might exist in the
// oplog -- and so that we do not truncate future entries erroneously.
- _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
+ _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, Timestamp());
opCtx->recoveryUnit()->waitUntilDurable(opCtx);
}
diff --git a/src/mongo/db/repl/replication_recovery.h b/src/mongo/db/repl/replication_recovery.h
index 0a891420b54..cef637b1ee8 100644
--- a/src/mongo/db/repl/replication_recovery.h
+++ b/src/mongo/db/repl/replication_recovery.h
@@ -131,10 +131,9 @@ private:
StatusWith<OpTime> _getTopOfOplog(OperationContext* opCtx) const;
/**
- * Truncates the oplog after the "truncateTimestamp" entry. Includes the "truncateTimestamp"
- * entry if "inclusive" is set to true.
+ * Truncates the oplog after the "truncateTimestamp" entry.
*/
- void _truncateOplogTo(OperationContext* opCtx, Timestamp truncateTimestamp, bool inclusive);
+ void _truncateOplogTo(OperationContext* opCtx, Timestamp truncateTimestamp);
/**
* Uses the oplogTruncateAfterPoint, accessed via '_consistencyMarkers', to decide whether to
diff --git a/src/mongo/db/repl/replication_recovery_test.cpp b/src/mongo/db/repl/replication_recovery_test.cpp
index 4831a612b7e..c13132a1f32 100644
--- a/src/mongo/db/repl/replication_recovery_test.cpp
+++ b/src/mongo/db/repl/replication_recovery_test.cpp
@@ -27,6 +27,8 @@
* it in the license file.
*/
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+
#include "mongo/platform/basic.h"
#include <memory>
@@ -53,6 +55,7 @@
#include "mongo/unittest/death_test.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/assert_util.h"
+#include "mongo/util/log.h"
#include "mongo/util/str.h"
namespace {
@@ -467,7 +470,7 @@ TEST_F(ReplicationRecoveryTest, RecoveryTruncatesOplogAtOplogTruncateAfterPoint)
ReplicationRecoveryImpl recovery(getStorageInterface(), getConsistencyMarkers());
auto opCtx = getOperationContext();
- getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(4, 4));
+ getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(3, 3));
getConsistencyMarkers()->setAppliedThrough(opCtx, OpTime(Timestamp(3, 3), 1));
_setUpOplog(opCtx, getStorageInterface(), {1, 2, 3, 4, 5});
@@ -731,7 +734,7 @@ TEST_F(ReplicationRecoveryTest, RecoveryAppliesDocumentsWhenAppliedThroughIsBehi
ReplicationRecoveryImpl recovery(getStorageInterface(), getConsistencyMarkers());
auto opCtx = getOperationContext();
- getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(4, 4));
+ getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(3, 3));
getConsistencyMarkers()->setAppliedThrough(opCtx, OpTime(Timestamp(1, 1), 1));
_setUpOplog(opCtx, getStorageInterface(), {1, 2, 3, 4, 5});
@@ -748,7 +751,7 @@ void ReplicationRecoveryTest::testRecoveryAppliesDocumentsWithNoAppliedThroughAf
ReplicationRecoveryImpl recovery(getStorageInterface(), getConsistencyMarkers());
auto opCtx = getOperationContext();
- getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(4, 4));
+ getConsistencyMarkers()->setOplogTruncateAfterPoint(opCtx, Timestamp(3, 3));
_setUpOplog(opCtx, getStorageInterface(), {1, 2, 3, 4, 5});
auto startingTS = Timestamp(1, 1);
diff --git a/src/mongo/db/repl/rollback_impl.cpp b/src/mongo/db/repl/rollback_impl.cpp
index 4bfdb8dd2d8..b61005e8878 100644
--- a/src/mongo/db/repl/rollback_impl.cpp
+++ b/src/mongo/db/repl/rollback_impl.cpp
@@ -502,11 +502,12 @@ void RollbackImpl::_runPhaseFromAbortToReconstructPreparedTxns(
<< " update operations and " << _observerInfo.rollbackCommandCounts[kDeleteCmdName]
<< " delete operations.";
- // During replication recovery, we truncate all oplog entries with timestamps greater than
- // or equal to the oplog truncate after point. As a result, we must find the oplog entry
- // after the common point so we do not truncate the common point itself. If we entered
- // rollback, we are guaranteed to have at least one oplog entry after the common point.
- Timestamp truncatePoint = _findTruncateTimestamp(opCtx, commonPoint);
+ // During replication recovery, we truncate all oplog entries with timestamps greater than the
+ // oplog truncate after point. If we entered rollback, we are guaranteed to have at least one
+ // oplog entry after the common point.
+ log() << "Marking to truncate all oplog entries with timestamps greater than "
+ << commonPoint.getOpTime().getTimestamp();
+ Timestamp truncatePoint = commonPoint.getOpTime().getTimestamp();
// Persist the truncate point to the 'oplogTruncateAfterPoint' document. We save this value so
// that the replication recovery logic knows where to truncate the oplog. We save this value
@@ -1005,34 +1006,6 @@ Status RollbackImpl::_checkAgainstTimeLimit(
return Status::OK();
}
-Timestamp RollbackImpl::_findTruncateTimestamp(
- OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) const {
-
- AutoGetCollectionForRead oplog(opCtx, NamespaceString::kRsOplogNamespace);
- invariant(oplog.getCollection());
- auto oplogCursor = oplog.getCollection()->getCursor(opCtx, /*forward=*/true);
-
- auto commonPointRecord = oplogCursor->seekExact(commonPoint.getRecordId());
- auto commonPointOpTime = commonPoint.getOpTime();
- // Check that we've found the right document for the common point.
- invariant(commonPointRecord);
- auto commonPointTime = OpTime::parseFromOplogEntry(commonPointRecord->data.releaseToBson());
- invariant(commonPointTime.getStatus());
- invariant(commonPointTime.getValue() == commonPointOpTime,
- str::stream() << "Common point: " << commonPointOpTime.toString()
- << ", record found: " << commonPointTime.getValue().toString());
-
- // Get the next document, which will be the first document to truncate.
- auto truncatePointRecord = oplogCursor->next();
- invariant(truncatePointRecord);
- auto truncatePointTime = OpTime::parseFromOplogEntry(truncatePointRecord->data.releaseToBson());
- invariant(truncatePointTime.getStatus());
-
- log() << "Marking to truncate all oplog entries with timestamps greater than or equal to "
- << truncatePointTime.getValue();
- return truncatePointTime.getValue().getTimestamp();
-}
-
boost::optional<BSONObj> RollbackImpl::_findDocumentById(OperationContext* opCtx,
UUID uuid,
NamespaceString nss,
diff --git a/src/mongo/db/repl/rollback_impl.h b/src/mongo/db/repl/rollback_impl.h
index d2a303c2e2c..450d799a3d5 100644
--- a/src/mongo/db/repl/rollback_impl.h
+++ b/src/mongo/db/repl/rollback_impl.h
@@ -342,13 +342,6 @@ private:
Status _checkAgainstTimeLimit(RollBackLocalOperations::RollbackCommonPoint commonPoint);
/**
- * Finds the timestamp of the record after the common point to put into the oplog truncate
- * after point.
- */
- Timestamp _findTruncateTimestamp(
- OperationContext* opCtx, RollBackLocalOperations::RollbackCommonPoint commonPoint) const;
-
- /**
* Kills all user operations currently being performed. Since this node is a secondary, these
* operations are all reads.
*/
diff --git a/src/mongo/db/repl/rollback_impl_test.cpp b/src/mongo/db/repl/rollback_impl_test.cpp
index 39073633620..e33b10f60cc 100644
--- a/src/mongo/db/repl/rollback_impl_test.cpp
+++ b/src/mongo/db/repl/rollback_impl_test.cpp
@@ -576,7 +576,7 @@ TEST_F(RollbackImplTest, RollbackPersistsDocumentAfterCommonPointToOplogTruncate
auto coll = _initializeCollection(_opCtx.get(), UUID::gen(), nss);
ASSERT_OK(_rollback->runRollback(_opCtx.get()));
- ASSERT_EQUALS(_truncatePoint, Timestamp(3, 3));
+ ASSERT_EQUALS(_truncatePoint, Timestamp(2, 2));
}
TEST_F(RollbackImplTest, RollbackImplResetsOptimesFromOplogAfterRollback) {
@@ -839,7 +839,7 @@ TEST_F(RollbackImplTest,
truncateAfterPoint =
_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get());
ASSERT_EQUALS(Timestamp(), truncateAfterPoint);
- ASSERT_EQUALS(_truncatePoint, Timestamp(2, 2));
+ ASSERT_EQUALS(_truncatePoint, Timestamp(1, 1));
ASSERT_EQ(_storageInterface->getFinalCollectionCount(uuid), 1);
}
@@ -863,7 +863,7 @@ TEST_F(RollbackImplTest, RollbackSucceedsAndTruncatesOplog) {
_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(_opCtx.get());
ASSERT_EQUALS(Timestamp(), truncateAfterPoint);
_assertDocsInOplog(_opCtx.get(), {1});
- ASSERT_EQUALS(_truncatePoint, Timestamp(2, 2));
+ ASSERT_EQUALS(_truncatePoint, Timestamp(1, 1));
}
DEATH_TEST_F(RollbackImplTest,
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
index a7f67537583..03fa957a678 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
@@ -548,8 +548,8 @@ private:
extern FailPoint WTWriteConflictException;
extern FailPoint WTWriteConflictExceptionForReads;
-// Prevents oplog writes from being considered durable on the primary. Once activated, new writes
-// will not be considered durable until deactivated. It is unspecified whether writes that commit
-// before activation will become visible while active.
+// Prevents oplog writes from becoming visible asynchronously. Once activated, new writes will not
+// be seen by regular readers until deactivated. It is unspecified whether writes that commit before
+// activation will become visible while active.
extern FailPoint WTPausePrimaryOplogDurabilityLoop;
} // namespace mongo