summaryrefslogtreecommitdiff
path: root/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
diff options
context:
space:
mode:
authorDaniel Gottlieb <daniel.gottlieb@mongodb.com>2018-09-17 16:07:11 -0400
committerDaniel Gottlieb <daniel.gottlieb@mongodb.com>2018-09-17 16:07:11 -0400
commitbeba8d70803cc14768c577bc7ec1aff5c0c352ed (patch)
tree2ce61feed4d61c42a437245edfa475354ab5a429 /src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
parent9f634fbb3b49e1a591f054b097caf862f192f0c2 (diff)
downloadmongo-beba8d70803cc14768c577bc7ec1aff5c0c352ed.tar.gz
SERVER-36811: Save oplog dating back to oldest actively prepared transaction.
Diffstat (limited to 'src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp')
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp73
1 files changed, 70 insertions, 3 deletions
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 94eea809a29..79326f3f1cc 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -282,6 +282,26 @@ public:
const Timestamp stableTimestamp = _wiredTigerKVEngine->getStableTimestamp();
const Timestamp initialDataTimestamp = _wiredTigerKVEngine->getInitialDataTimestamp();
+
+ // The amount of oplog to keep is primarily dictated by a user setting. However, in
+ // unexpected cases, durable, recover-to-a-timestamp storage engines may need to play
+ // forward from an oplog entry that would otherwise be truncated by the user
+ // setting. Furthermore, with prepared transactions, oplog entries can refer to
+ // previous oplog entries.
+ //
+ // Live (replication) rollback will replay oplogs from exactly the stable
+ // timestamp. With prepared transactions, it may require some additional entries prior
+ // to the stable timestamp. These requirements are summarized in
+ // `getOplogNeededForRollback`. Truncating the oplog at this point is sufficient for
+ // in-memory configurations, but could cause an unrecoverable scenario if the node
+ // crashes and has to play forward from the last stable checkpoint.
+ //
+ // By recording the oplog needed for rollback "now", then taking a stable checkpoint,
+ // we can safely assume that the oplog needed for crash recovery has caught up to the
+ // recorded value. After the checkpoint, this value will be published such that actors
+ // which truncate the oplog can read an updated value.
+ const Timestamp oplogNeededForRollback =
+ _wiredTigerKVEngine->getOplogNeededForRollback();
try {
// Three cases:
//
@@ -316,8 +336,10 @@ public:
WT_SESSION* s = session->getSession();
invariantWTOK(s->checkpoint(s, "use_timestamp=true"));
- // Publish the checkpoint time after the checkpoint becomes durable.
+ // Now that the checkpoint is durable, publish the previously recorded stable
+ // timestamp and oplog needed to recover from it.
_lastStableCheckpointTimestamp.store(stableTimestamp.asULL());
+ _oplogNeededForCrashRecovery.store(oplogNeededForRollback.asULL());
}
} catch (const WriteConflictException&) {
// Temporary: remove this after WT-3483
@@ -363,6 +385,10 @@ public:
return _lastStableCheckpointTimestamp.load();
}
+ std::uint64_t getOplogNeededForCrashRecovery() const {
+ return _oplogNeededForCrashRecovery.load();
+ }
+
void shutdown() {
_shuttingDown.store(true);
{
@@ -391,6 +417,7 @@ private:
// checkpoint might have used a newer stable timestamp if stable was updated concurrently with
// checkpointing.
AtomicWord<std::uint64_t> _lastStableCheckpointTimestamp;
+ AtomicWord<std::uint64_t> _oplogNeededForCrashRecovery;
};
namespace {
@@ -569,7 +596,8 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
if (!_readOnly && !_ephemeral) {
if (!_recoveryTimestamp.isNull()) {
setInitialDataTimestamp(_recoveryTimestamp);
- setStableTimestamp(_recoveryTimestamp);
+ // The `maximumTruncationTimestamp` is not persisted, so choose a conservative value.
+ setStableTimestamp(_recoveryTimestamp, Timestamp::min());
}
_checkpointThread =
@@ -1314,7 +1342,8 @@ MONGO_FAIL_POINT_DEFINE(WTPreserveSnapshotHistoryIndefinitely);
} // namespace
-void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp) {
+void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp,
+ boost::optional<Timestamp> maximumTruncationTimestamp) {
if (stableTimestamp.isNull()) {
return;
}
@@ -1353,6 +1382,24 @@ void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp) {
_stableTimestamp.store(stableTimestamp.asULL());
}
+ // After publishing a stable timestamp to WT, we can publish the updated value for the
+ // necessary oplog to keep. Calls to this method require the min(stableTimestamp,
+ // maximumTruncationTimestamp) to be monotonically increasing. This allows us to safely record
+ // and publish a single value without additional concurrency control.
+ if (maximumTruncationTimestamp) {
+ // Until we discover otherwise, assume callers expect to obey the contract for proper
+ // oplog truncation.
+ DEV invariant(_oplogNeededForRollback.load() <=
+ std::min(maximumTruncationTimestamp->asULL(), stableTimestamp.asULL()));
+ _oplogNeededForRollback.store(
+ std::min(maximumTruncationTimestamp->asULL(), stableTimestamp.asULL()));
+ } else {
+ // If there is no maximumTruncationTimestamp at this stable timestamp, WT is free to
+ // truncate the oplog to any value behind the last stable timestamp, once it is
+ // checkpointed.
+ _oplogNeededForRollback.store(stableTimestamp.asULL());
+ }
+
if (_checkpointThread && !_checkpointThread->hasTriggeredFirstStableCheckpoint()) {
_checkpointThread->triggerFirstStableCheckpoint(
prevStable, Timestamp(_initialDataTimestamp.load()), stableTimestamp);
@@ -1576,6 +1623,26 @@ boost::optional<Timestamp> WiredTigerKVEngine::getLastStableRecoveryTimestamp()
return boost::none;
}
+Timestamp WiredTigerKVEngine::getOplogNeededForRollback() const {
+ // TODO: SERVER-36982 intends to allow holding onto minimum history (in front of the stable
+ // timestamp). If that results in never calling `StorageEngine::setStableTimestamp`, oplog
+ // will never be truncated. This method will need to be updated to accommodate that case, most
+ // simply by having this return `Timestamp::max()`.
+ return Timestamp(_oplogNeededForRollback.load());
+}
+
+boost::optional<Timestamp> WiredTigerKVEngine::getOplogNeededForCrashRecovery() const {
+ if (_ephemeral) {
+ return boost::none;
+ }
+
+ return Timestamp(_checkpointThread->getOplogNeededForCrashRecovery());
+}
+
+Timestamp WiredTigerKVEngine::getPinnedOplog() const {
+ return getOplogNeededForCrashRecovery().value_or(getOplogNeededForRollback());
+}
+
bool WiredTigerKVEngine::supportsReadConcernSnapshot() const {
return true;
}