summaryrefslogtreecommitdiff
path: root/src/mongo/db/storage
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/storage')
-rw-r--r--src/mongo/db/storage/SConscript17
-rw-r--r--src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp60
-rw-r--r--src/mongo/db/storage/recovery_unit.h24
-rw-r--r--src/mongo/db/storage/snapshot_helper.cpp151
-rw-r--r--src/mongo/db/storage/snapshot_helper.h43
-rw-r--r--src/mongo/db/storage/snapshot_manager.h8
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp6
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp61
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp83
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp33
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h22
11 files changed, 398 insertions, 110 deletions
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index ccd5d048e1d..e32b8ba329a 100644
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -30,6 +30,23 @@ env.Library(
)
env.Library(
+ target='snapshot_helper',
+ source=[
+ 'snapshot_helper.cpp',
+ ],
+ LIBDEPS=[
+ '$BUILD_DIR/mongo/base',
+ '$BUILD_DIR/mongo/db/namespace_string',
+ ],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/concurrency/lock_manager',
+ '$BUILD_DIR/mongo/db/repl/read_concern_args',
+ '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
+ 'recovery_unit_base',
+ ],
+ )
+
+env.Library(
target='duplicate_key_error_info',
source=[
'duplicate_key_error_info.cpp',
diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
index aeef71561ae..baecec3c8af 100644
--- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
+++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
@@ -150,9 +150,9 @@ public:
return itCountOn(op);
}
- int itCountLocal() {
+ int itCountLastApplied() {
auto op = makeOperation();
- op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
+ op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
return itCountOn(op);
}
@@ -176,14 +176,14 @@ public:
return std::string(record->data.data());
}
- boost::optional<Record> readRecordLocal(RecordId id) {
+ boost::optional<Record> readRecordLastApplied(RecordId id) {
auto op = makeOperation();
- op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied);
+ op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
return readRecordOn(op, id);
}
- std::string readStringLocal(RecordId id) {
- auto record = readRecordLocal(id);
+ std::string readStringLastApplied(RecordId id) {
+ auto record = readRecordLastApplied(id);
ASSERT(record);
return std::string(record->data.data());
}
@@ -360,7 +360,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDelete) {
ASSERT(!readRecordCommitted(id));
}
-TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) {
+TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) {
if (!snapshotManager)
return; // This test is only for engines that DO support SnapshotManagers.
@@ -369,7 +369,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) {
auto id = insertRecordAndCommit();
auto afterInsert = fetchAndIncrementTimestamp();
- // Not reading on the last local timestamp returns the most recent data.
+ // Not reading on the last applied timestamp returns the most recent data.
auto op = makeOperation();
auto ru = op->recoveryUnit();
ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
@@ -379,18 +379,18 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) {
deleteRecordAndCommit(id);
auto afterDelete = fetchAndIncrementTimestamp();
- // Reading at the local snapshot timestamps returns data in order.
- snapshotManager->setLocalSnapshot(beforeInsert);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ // Reading at the last applied snapshot timestamps returns data in order.
+ snapshotManager->setLastApplied(beforeInsert);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
- snapshotManager->setLocalSnapshot(afterInsert);
- ASSERT_EQ(itCountLocal(), 1);
- ASSERT(readRecordLocal(id));
+ snapshotManager->setLastApplied(afterInsert);
+ ASSERT_EQ(itCountLastApplied(), 1);
+ ASSERT(readRecordLastApplied(id));
- snapshotManager->setLocalSnapshot(afterDelete);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ snapshotManager->setLastApplied(afterDelete);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
}
TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) {
@@ -416,20 +416,20 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) {
deleteRecordAndCommit(id);
auto afterDelete = fetchAndIncrementTimestamp();
- snapshotManager->setLocalSnapshot(beforeInsert);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ snapshotManager->setLastApplied(beforeInsert);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
- snapshotManager->setLocalSnapshot(afterInsert);
- ASSERT_EQ(itCountLocal(), 1);
- ASSERT_EQ(readStringLocal(id), "Aardvark");
+ snapshotManager->setLastApplied(afterInsert);
+ ASSERT_EQ(itCountLastApplied(), 1);
+ ASSERT_EQ(readStringLastApplied(id), "Aardvark");
- snapshotManager->setLocalSnapshot(afterUpdate);
- ASSERT_EQ(itCountLocal(), 1);
- ASSERT_EQ(readStringLocal(id), "Blue spotted stingray");
+ snapshotManager->setLastApplied(afterUpdate);
+ ASSERT_EQ(itCountLastApplied(), 1);
+ ASSERT_EQ(readStringLastApplied(id), "Blue spotted stingray");
- snapshotManager->setLocalSnapshot(afterDelete);
- ASSERT_EQ(itCountLocal(), 0);
- ASSERT(!readRecordLocal(id));
+ snapshotManager->setLastApplied(afterDelete);
+ ASSERT_EQ(itCountLastApplied(), 0);
+ ASSERT(!readRecordLastApplied(id));
}
} // namespace mongo
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index f8c70309ae2..eeddfafd624 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -253,7 +253,6 @@ public:
* - when using ReadSource::kNoOverlap, the timestamp chosen by the storage engine.
* - when using ReadSource::kAllDurableSnapshot, the timestamp chosen using the storage
* engine's all_durable timestamp.
- * - when using ReadSource::kLastApplied, the timestamp chosen using the storage engine's last
* applied timestamp. Can return boost::none if no timestamp has been established.
* - when using ReadSource::kMajorityCommitted, the majority committed timestamp chosen by the
* storage engine after a transaction has been opened or after a call to
@@ -399,11 +398,6 @@ public:
*/
kNoOverlap,
/**
- * Read from the last applied timestamp. New transactions start at the most up-to-date
- * timestamp.
- */
- kLastApplied,
- /**
* Read from the all_durable timestamp. New transactions will always read from the same
* timestamp and never advance.
*/
@@ -414,6 +408,24 @@ public:
kProvided
};
+ static std::string toString(ReadSource rs) {
+ switch (rs) {
+ case ReadSource::kUnset:
+ return "kUnset";
+ case ReadSource::kNoTimestamp:
+ return "kNoTimestamp";
+ case ReadSource::kMajorityCommitted:
+ return "kMajorityCommitted";
+ case ReadSource::kNoOverlap:
+ return "kNoOverlap";
+ case ReadSource::kAllDurableSnapshot:
+ return "kAllDurableSnapshot";
+ case ReadSource::kProvided:
+ return "kProvided";
+ }
+ MONGO_UNREACHABLE;
+ }
+
/**
* Sets which timestamp to use for read transactions. If 'provided' is supplied, only kProvided
* is an acceptable input.
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
new file mode 100644
index 00000000000..777868f7830
--- /dev/null
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -0,0 +1,151 @@
+/**
+ * Copyright (C) 2020-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kStorage
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/storage/snapshot_helper.h"
+
+#include "mongo/db/repl/read_concern_args.h"
+#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/logv2/log.h"
+
+namespace mongo {
+namespace SnapshotHelper {
+bool canSwitchReadSource(OperationContext* opCtx) {
+
+ // Most readConcerns have behavior controlled at higher levels. Local and available are the only
+ // ReadConcerns that should consider changing, since they read without a timestamp by default.
+ const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel();
+ if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern ||
+ readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) {
+ return true;
+ }
+
+ return false;
+}
+
+bool shouldReadAtNoOverlap(OperationContext* opCtx,
+ const NamespaceString& nss,
+ std::string* reason) {
+
+ // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot read
+ // at no-overlap. It's important to note that it is possible for this to be false, but still be
+ // holding the PBWM lock, explained below.
+ if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) {
+ *reason = "conflicts with batch application";
+ return false;
+ }
+
+ // If we are already holding the PBWM lock, do not read at no-overlap. Snapshots acquired by an
+ // operation after a yield/restore must see all writes in the pre-yield snapshot. Once a
+ // snapshot is reading without a timestamp, we choose to continue acquiring snapshots without a
+ // timestamp. This is done in lieu of determining a timestamp far enough in the future that's
+ // guaranteed to observe all previous writes. This may occur when multiple collection locks are
+ // held concurrently, which is often the case when DBDirectClient is used.
+ if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
+ *reason = "PBWM lock is held";
+ LOGV2_DEBUG(20577, 1, "not reading at no-overlap because the PBWM lock is held");
+ return false;
+ }
+
+ // If we are in a replication state (like secondary or primary catch-up) where we are not
+ // accepting writes, we should read at no-overlap. If this node can accept writes, then no
+ // conflicting replication batches are being applied and we can read from the default snapshot.
+ if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
+ *reason = "primary";
+ return false;
+ }
+
+ // Non-replicated collections do not need to read at no-overlap, as those collections are not
+ // written by the replication system. However, the oplog is special, as it *is* written by the
+ // replication system.
+ if (!nss.isReplicated() && !nss.isOplog()) {
+ *reason = "unreplicated collection";
+ return false;
+ }
+
+ return true;
+}
+boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
+ const NamespaceString& nss) {
+ const bool canSwitch = canSwitchReadSource(opCtx);
+ if (!canSwitch) {
+ return boost::none;
+ }
+
+ const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
+ std::string reason;
+ const bool readAtNoOverlap = shouldReadAtNoOverlap(opCtx, nss, &reason);
+ if (existing == RecoveryUnit::ReadSource::kUnset) {
+ // Shifting from reading without a timestamp to reading with a timestamp can be dangerous
+ // because writes will appear to vanish. This case is intended for new reads on secondaries
+ // and query yield recovery after state transitions from primary to secondary.
+ if (readAtNoOverlap) {
+ LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kNoOverlap", logAttrs(nss));
+ return RecoveryUnit::ReadSource::kNoOverlap;
+ }
+ } else if (existing == RecoveryUnit::ReadSource::kNoOverlap) {
+ // For some reason, we can no longer read at kNoOverlap.
+ // An operation that yields a timestamped snapshot must restore a snapshot with at least as
+ // large of a timestamp, or with proper consideration of rollback scenarios, no timestamp.
+ // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to
+ // reading without one. More writes will become visible.
+ if (!readAtNoOverlap) {
+ LOGV2_DEBUG(
+ 4452902, 2, "Changing ReadSource to kUnset", logAttrs(nss), "reason"_attr = reason);
+ // This shift to kUnset assumes that callers will not make future attempts to manipulate
+ // their ReadSources after performing reads at an un-timetamped snapshot. The only
+ // exception is callers of this function that may need to change from kUnset to
+ // kNoOverlap in the event of a catalog conflict or query yield.
+ return RecoveryUnit::ReadSource::kUnset;
+ }
+ }
+ return boost::none;
+}
+
+bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
+ boost::optional<Timestamp> readTimestamp) {
+ // This is the timestamp of the most recent catalog changes to this collection. If this is
+ // greater than any point in time read timestamps, we should either wait or return an error.
+ if (!collectionMin) {
+ return false;
+ }
+
+ // If we do not have a point in time to conflict with collectionMin, return.
+ if (!readTimestamp || readTimestamp->isNull()) {
+ return false;
+ }
+
+ // Return if there are no conflicting catalog changes with the readTimestamp.
+ return *collectionMin > readTimestamp;
+}
+} // namespace SnapshotHelper
+} // namespace mongo \ No newline at end of file
diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h
new file mode 100644
index 00000000000..fa8fdd85f24
--- /dev/null
+++ b/src/mongo/db/storage/snapshot_helper.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (C) 2020-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/operation_context.h"
+
+namespace mongo {
+namespace SnapshotHelper {
+// Returns a ReadSource if we should change our current ReadSource. Returns boost::none otherwise.
+boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
+ const NamespaceString& nss);
+
+bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
+ boost::optional<Timestamp> readTimestamp);
+} // namespace SnapshotHelper
+} // namespace mongo
diff --git a/src/mongo/db/storage/snapshot_manager.h b/src/mongo/db/storage/snapshot_manager.h
index d41df1c5013..7839ef179ba 100644
--- a/src/mongo/db/storage/snapshot_manager.h
+++ b/src/mongo/db/storage/snapshot_manager.h
@@ -59,14 +59,14 @@ public:
virtual void setCommittedSnapshot(const Timestamp& timestamp) = 0;
/**
- * Sets the snapshot for the last stable timestamp for reading on secondaries.
+ * Sets the lastApplied timestamp.
*/
- virtual void setLocalSnapshot(const Timestamp& timestamp) = 0;
+ virtual void setLastApplied(const Timestamp& timestamp) = 0;
/**
- * Returns the local snapshot timestamp.
+ * Returns the lastApplied timestamp.
*/
- virtual boost::optional<Timestamp> getLocalSnapshot() = 0;
+ virtual boost::optional<Timestamp> getLastApplied() = 0;
/**
* Drops all snapshots and clears the "committed" snapshot.
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 661b0e009c1..352699a9de8 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -668,6 +668,12 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
setInitialDataTimestamp(_recoveryTimestamp);
setOldestTimestamp(_recoveryTimestamp, false);
setStableTimestamp(_recoveryTimestamp, false);
+
+ _sessionCache->snapshotManager().setLastApplied(_recoveryTimestamp);
+ {
+ stdx::lock_guard<Latch> lk(_highestDurableTimestampMutex);
+ _highestSeenDurableTimestamp = _recoveryTimestamp.asULL();
+ }
}
}
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 8cc45e0201e..f9d9751d464 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -453,7 +453,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
// The following ReadSources can only establish a read timestamp when a transaction is
// opened.
case ReadSource::kNoOverlap:
- case ReadSource::kLastApplied:
case ReadSource::kAllDurableSnapshot:
break;
}
@@ -462,14 +461,14 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
getSession();
switch (_timestampReadSource) {
- case ReadSource::kLastApplied:
- // The lastApplied timestamp is not always available, so it is not possible to invariant
- // that it exists as other ReadSources do.
+ case ReadSource::kNoOverlap:
+ // The lastApplied and allDurable timestamps are not always available if the system has
+ // not accepted writes, so it is not possible to invariant that it exists as other
+ // ReadSources do.
if (!_readAtTimestamp.isNull()) {
return _readAtTimestamp;
}
return boost::none;
- case ReadSource::kNoOverlap:
case ReadSource::kAllDurableSnapshot:
invariant(!_readAtTimestamp.isNull());
return _readAtTimestamp;
@@ -515,17 +514,6 @@ void WiredTigerRecoveryUnit::_txnOpen() {
session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
break;
}
- case ReadSource::kLastApplied: {
- if (_sessionCache->snapshotManager().getLocalSnapshot()) {
- _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot(
- session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
- } else {
- WiredTigerBeginTxnBlock(
- session, _prepareConflictBehavior, _roundUpPreparedTimestamps)
- .done();
- }
- break;
- }
case ReadSource::kNoOverlap: {
_readAtTimestamp = _beginTransactionAtNoOverlapTimestamp(session);
break;
@@ -555,8 +543,9 @@ void WiredTigerRecoveryUnit::_txnOpen() {
LOGV2_DEBUG(22414,
3,
- "WT begin_transaction for snapshot id {getSnapshotId_toNumber}",
- "getSnapshotId_toNumber"_attr = getSnapshotId().toNumber());
+ "WT begin_transaction",
+ "snapshotId"_attr = getSnapshotId().toNumber(),
+ "readSource"_attr = toString(_timestampReadSource));
}
Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESSION* session) {
@@ -578,8 +567,8 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESS
Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) {
- auto lastApplied = _sessionCache->snapshotManager().getLocalSnapshot();
- Timestamp allDurable = Timestamp(_oplogManager->fetchAllDurableValue(session->connection));
+ auto lastApplied = _sessionCache->snapshotManager().getLastApplied();
+ Timestamp allDurable = Timestamp(_sessionCache->getKVEngine()->getAllDurableTimestamp());
// When using timestamps for reads and writes, it's important that readers and writers don't
// overlap with the timestamps they use. In other words, at any point in the system there should
@@ -607,17 +596,34 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSI
// should read afterward.
Timestamp readTimestamp = (lastApplied) ? std::min(*lastApplied, allDurable) : allDurable;
+ if (readTimestamp.isNull()) {
+ // When there is not an all_durable or lastApplied timestamp available, read without a
+ // timestamp. Do not round up the read timestamp to the oldest timestamp.
+
+ // There is a race that allows new transactions to start between the time we check for a
+ // read timestamp and start our transaction, which can temporarily violate the contract of
+ // kNoOverlap. That is, writes will be visible that occur after the all_durable time. This
+ // is only possible for readers that start immediately after an initial sync that did not
+ // replicate any oplog entries. Future transactions will start reading at a timestamp once
+ // timestamped writes have been made.
+ WiredTigerBeginTxnBlock txnOpen(
+ session, _prepareConflictBehavior, _roundUpPreparedTimestamps);
+ LOGV2_DEBUG(4452900, 1, "no read timestamp available for kNoOverlap");
+ txnOpen.done();
+ return readTimestamp;
+ }
+
WiredTigerBeginTxnBlock txnOpen(session,
_prepareConflictBehavior,
_roundUpPreparedTimestamps,
RoundUpReadTimestamp::kRound);
auto status = txnOpen.setReadSnapshot(readTimestamp);
fassert(51066, status);
+ txnOpen.done();
- // We might have rounded to oldest between calling getAllDurable and setReadSnapshot. We need
- // to get the actual read timestamp we used.
+ // We might have rounded to oldest between calling getAllDurable and setReadSnapshot. We
+ // need to get the actual read timestamp we used.
readTimestamp = _getTransactionReadTimestamp(session);
- txnOpen.done();
return readTimestamp;
}
@@ -769,15 +775,14 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource,
boost::optional<Timestamp> provided) {
LOGV2_DEBUG(22416,
3,
- "setting timestamp read source: {static_cast_int_readSource}, provided timestamp: "
- "{provided_provided_none}",
- "static_cast_int_readSource"_attr = static_cast<int>(readSource),
- "provided_provided_none"_attr = ((provided) ? provided->toString() : "none"));
+ "setting timestamp read source",
+ "readSource"_attr = toString(readSource),
+ "provided"_attr = ((provided) ? provided->toString() : "none"));
invariant(!_isActive() || _timestampReadSource == readSource,
str::stream() << "Current state: " << toString(_getState())
<< ". Invalid internal state while setting timestamp read source: "
- << static_cast<int>(readSource) << ", provided timestamp: "
+ << toString(readSource) << ", provided timestamp: "
<< (provided ? provided->toString() : "none"));
invariant(!provided == (readSource != ReadSource::kProvided));
invariant(!(provided && provided->isNull()));
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
index 314a524063e..1f440d319b7 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
@@ -163,11 +163,14 @@ public:
clientAndCtx2 = makeClientAndOpCtx(harnessHelper.get(), "reader");
ru1 = checked_cast<WiredTigerRecoveryUnit*>(clientAndCtx1.second->recoveryUnit());
ru2 = checked_cast<WiredTigerRecoveryUnit*>(clientAndCtx2.second->recoveryUnit());
+ snapshotManager = dynamic_cast<WiredTigerSnapshotManager*>(
+ harnessHelper->getEngine()->getSnapshotManager());
}
std::unique_ptr<WiredTigerRecoveryUnitHarnessHelper> harnessHelper;
ClientAndCtx clientAndCtx1, clientAndCtx2;
WiredTigerRecoveryUnit *ru1, *ru2;
+ WiredTigerSnapshotManager* snapshotManager;
private:
const char* wt_uri = "table:prepare_transaction";
@@ -180,6 +183,86 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, SetReadSource) {
ASSERT_EQ(Timestamp(1, 1), ru1->getPointInTimeReadTimestamp());
}
+TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) {
+ OperationContext* opCtx1 = clientAndCtx1.second.get();
+ std::unique_ptr<RecordStore> rs(harnessHelper->createRecordStore(opCtx1, "a.b"));
+
+ const std::string str = str::stream() << "test";
+ const Timestamp ts1{1, 1};
+ const Timestamp ts2{1, 2};
+ const Timestamp ts3{1, 2};
+
+ RecordId rid1;
+ {
+ WriteUnitOfWork wuow(opCtx1);
+ StatusWith<RecordId> res = rs->insertRecord(opCtx1, str.c_str(), str.size() + 1, ts1);
+ ASSERT_OK(res);
+ wuow.commit();
+ rid1 = res.getValue();
+ snapshotManager->setLastApplied(ts1);
+ }
+
+ // Read without a timestamp. The write should be visible.
+ ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset);
+ RecordData unused;
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+
+ // Read with kNoOverlap. The write should be visible.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+
+ RecordId rid2, rid3;
+ {
+ // Start, but do not commit a transaction with opCtx2. This sets a timestamp at ts2, which
+ // creates a hole. kNoOverlap, which is a function of all_durable, will only be able to read
+ // at the time immediately before.
+ OperationContext* opCtx2 = clientAndCtx2.second.get();
+ WriteUnitOfWork wuow(opCtx2);
+ StatusWith<RecordId> res =
+ rs->insertRecord(opCtx2, str.c_str(), str.size() + 1, Timestamp());
+ ASSERT_OK(opCtx2->recoveryUnit()->setTimestamp(ts2));
+ ASSERT_OK(res);
+
+ // While holding open a transaction with opCtx2, perform an insert at ts3 with opCtx1. This
+ // creates a "hole".
+ {
+ WriteUnitOfWork wuow(opCtx1);
+ StatusWith<RecordId> res = rs->insertRecord(opCtx1, str.c_str(), str.size() + 1, ts3);
+ ASSERT_OK(res);
+ wuow.commit();
+ rid3 = res.getValue();
+ snapshotManager->setLastApplied(ts3);
+ }
+
+ // Read without a timestamp, and we should see the first and third records.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+ ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused));
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused));
+
+ // Now read at kNoOverlap. Since the transaction at ts2 has not committed, all_durable is
+ // held back to ts1. LastApplied has advanced to ts3, but because kNoOverlap is the minimum,
+ // we should only see one record.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+ ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused));
+ ASSERT_FALSE(rs->findRecord(opCtx1, rid3, &unused));
+
+ wuow.commit();
+ rid2 = res.getValue();
+ }
+
+ // Now that the hole has been closed, kNoOverlap should see all 3 records.
+ opCtx1->recoveryUnit()->abandonSnapshot();
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid2, &unused));
+ ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused));
+}
+
TEST_F(WiredTigerRecoveryUnitTestFixture, CreateAndCheckForCachePressure) {
int time = 1;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp
index 74988508a4c..efc2d8adfe5 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp
@@ -48,17 +48,17 @@ void WiredTigerSnapshotManager::setCommittedSnapshot(const Timestamp& timestamp)
_committedSnapshot = timestamp;
}
-void WiredTigerSnapshotManager::setLocalSnapshot(const Timestamp& timestamp) {
- stdx::lock_guard<Latch> lock(_localSnapshotMutex);
+void WiredTigerSnapshotManager::setLastApplied(const Timestamp& timestamp) {
+ stdx::lock_guard<Latch> lock(_lastAppliedMutex);
if (timestamp.isNull())
- _localSnapshot = boost::none;
+ _lastApplied = boost::none;
else
- _localSnapshot = timestamp;
+ _lastApplied = timestamp;
}
-boost::optional<Timestamp> WiredTigerSnapshotManager::getLocalSnapshot() {
- stdx::lock_guard<Latch> lock(_localSnapshotMutex);
- return _localSnapshot;
+boost::optional<Timestamp> WiredTigerSnapshotManager::getLastApplied() {
+ stdx::lock_guard<Latch> lock(_lastAppliedMutex);
+ return _lastApplied;
}
void WiredTigerSnapshotManager::dropAllSnapshots() {
@@ -93,23 +93,4 @@ Timestamp WiredTigerSnapshotManager::beginTransactionOnCommittedSnapshot(
return *_committedSnapshot;
}
-Timestamp WiredTigerSnapshotManager::beginTransactionOnLocalSnapshot(
- WT_SESSION* session,
- PrepareConflictBehavior prepareConflictBehavior,
- RoundUpPreparedTimestamps roundUpPreparedTimestamps) const {
- WiredTigerBeginTxnBlock txnOpen(session, prepareConflictBehavior, roundUpPreparedTimestamps);
-
- stdx::lock_guard<Latch> lock(_localSnapshotMutex);
- invariant(_localSnapshot);
- LOGV2_DEBUG(22427,
- 3,
- "begin_transaction on local snapshot {localSnapshot_get}",
- "localSnapshot_get"_attr = _localSnapshot.get().toString());
- auto status = txnOpen.setReadSnapshot(_localSnapshot.get());
- fassert(50775, status);
-
- txnOpen.done();
- return *_localSnapshot;
-}
-
} // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h
index 1726a7d4c2b..b285c694e70 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h
@@ -51,8 +51,8 @@ public:
WiredTigerSnapshotManager() = default;
void setCommittedSnapshot(const Timestamp& timestamp) final;
- void setLocalSnapshot(const Timestamp& timestamp) final;
- boost::optional<Timestamp> getLocalSnapshot() final;
+ void setLastApplied(const Timestamp& timestamp) final;
+ boost::optional<Timestamp> getLastApplied() final;
void dropAllSnapshots() final;
//
@@ -70,16 +70,6 @@ public:
RoundUpPreparedTimestamps roundUpPreparedTimestamps) const;
/**
- * Starts a transaction on the last stable local timestamp, set by setLocalSnapshot.
- *
- * Throws if no local snapshot has been set.
- */
- Timestamp beginTransactionOnLocalSnapshot(
- WT_SESSION* session,
- PrepareConflictBehavior prepareConflictBehavior,
- RoundUpPreparedTimestamps roundUpPreparedTimestamps) const;
-
- /**
* Returns lowest SnapshotName that could possibly be used by a future call to
* beginTransactionOnCommittedSnapshot, or boost::none if there is currently no committed
* snapshot.
@@ -95,9 +85,9 @@ private:
MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_committedSnapshotMutex");
boost::optional<Timestamp> _committedSnapshot;
- // Snapshot to use for reads at a local stable timestamp.
- mutable Mutex _localSnapshotMutex = // Guards _localSnapshot.
- MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_localSnapshotMutex");
- boost::optional<Timestamp> _localSnapshot;
+ // Timestamp to use for reads at a the lastApplied timestamp.
+ mutable Mutex _lastAppliedMutex = // Guards _lastApplied.
+ MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_lastAppliedMutex");
+ boost::optional<Timestamp> _lastApplied;
};
} // namespace mongo