diff options
Diffstat (limited to 'src/mongo/db/storage')
-rw-r--r-- | src/mongo/db/storage/SConscript | 17 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp | 60 | ||||
-rw-r--r-- | src/mongo/db/storage/recovery_unit.h | 24 | ||||
-rw-r--r-- | src/mongo/db/storage/snapshot_helper.cpp | 151 | ||||
-rw-r--r-- | src/mongo/db/storage/snapshot_helper.h | 43 | ||||
-rw-r--r-- | src/mongo/db/storage/snapshot_manager.h | 8 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp | 61 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp | 83 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp | 33 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h | 22 |
11 files changed, 398 insertions, 110 deletions
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript index ccd5d048e1d..e32b8ba329a 100644 --- a/src/mongo/db/storage/SConscript +++ b/src/mongo/db/storage/SConscript @@ -30,6 +30,23 @@ env.Library( ) env.Library( + target='snapshot_helper', + source=[ + 'snapshot_helper.cpp', + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/db/namespace_string', + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/concurrency/lock_manager', + '$BUILD_DIR/mongo/db/repl/read_concern_args', + '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', + 'recovery_unit_base', + ], + ) + +env.Library( target='duplicate_key_error_info', source=[ 'duplicate_key_error_info.cpp', diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp index aeef71561ae..baecec3c8af 100644 --- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp +++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp @@ -150,9 +150,9 @@ public: return itCountOn(op); } - int itCountLocal() { + int itCountLastApplied() { auto op = makeOperation(); - op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied); + op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap); return itCountOn(op); } @@ -176,14 +176,14 @@ public: return std::string(record->data.data()); } - boost::optional<Record> readRecordLocal(RecordId id) { + boost::optional<Record> readRecordLastApplied(RecordId id) { auto op = makeOperation(); - op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kLastApplied); + op->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap); return readRecordOn(op, id); } - std::string readStringLocal(RecordId id) { - auto record = readRecordLocal(id); + std::string readStringLastApplied(RecordId id) { + auto record = readRecordLastApplied(id); ASSERT(record); return std::string(record->data.data()); } @@ -360,7 +360,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDelete) { ASSERT(!readRecordCommitted(id)); } -TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) { +TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) { if (!snapshotManager) return; // This test is only for engines that DO support SnapshotManagers. @@ -369,7 +369,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) { auto id = insertRecordAndCommit(); auto afterInsert = fetchAndIncrementTimestamp(); - // Not reading on the last local timestamp returns the most recent data. + // Not reading on the last applied timestamp returns the most recent data. auto op = makeOperation(); auto ru = op->recoveryUnit(); ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); @@ -379,18 +379,18 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLocalSnapshot) { deleteRecordAndCommit(id); auto afterDelete = fetchAndIncrementTimestamp(); - // Reading at the local snapshot timestamps returns data in order. - snapshotManager->setLocalSnapshot(beforeInsert); - ASSERT_EQ(itCountLocal(), 0); - ASSERT(!readRecordLocal(id)); + // Reading at the last applied snapshot timestamps returns data in order. + snapshotManager->setLastApplied(beforeInsert); + ASSERT_EQ(itCountLastApplied(), 0); + ASSERT(!readRecordLastApplied(id)); - snapshotManager->setLocalSnapshot(afterInsert); - ASSERT_EQ(itCountLocal(), 1); - ASSERT(readRecordLocal(id)); + snapshotManager->setLastApplied(afterInsert); + ASSERT_EQ(itCountLastApplied(), 1); + ASSERT(readRecordLastApplied(id)); - snapshotManager->setLocalSnapshot(afterDelete); - ASSERT_EQ(itCountLocal(), 0); - ASSERT(!readRecordLocal(id)); + snapshotManager->setLastApplied(afterDelete); + ASSERT_EQ(itCountLastApplied(), 0); + ASSERT(!readRecordLastApplied(id)); } TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) { @@ -416,20 +416,20 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) { deleteRecordAndCommit(id); auto afterDelete = fetchAndIncrementTimestamp(); - snapshotManager->setLocalSnapshot(beforeInsert); - ASSERT_EQ(itCountLocal(), 0); - ASSERT(!readRecordLocal(id)); + snapshotManager->setLastApplied(beforeInsert); + ASSERT_EQ(itCountLastApplied(), 0); + ASSERT(!readRecordLastApplied(id)); - snapshotManager->setLocalSnapshot(afterInsert); - ASSERT_EQ(itCountLocal(), 1); - ASSERT_EQ(readStringLocal(id), "Aardvark"); + snapshotManager->setLastApplied(afterInsert); + ASSERT_EQ(itCountLastApplied(), 1); + ASSERT_EQ(readStringLastApplied(id), "Aardvark"); - snapshotManager->setLocalSnapshot(afterUpdate); - ASSERT_EQ(itCountLocal(), 1); - ASSERT_EQ(readStringLocal(id), "Blue spotted stingray"); + snapshotManager->setLastApplied(afterUpdate); + ASSERT_EQ(itCountLastApplied(), 1); + ASSERT_EQ(readStringLastApplied(id), "Blue spotted stingray"); - snapshotManager->setLocalSnapshot(afterDelete); - ASSERT_EQ(itCountLocal(), 0); - ASSERT(!readRecordLocal(id)); + snapshotManager->setLastApplied(afterDelete); + ASSERT_EQ(itCountLastApplied(), 0); + ASSERT(!readRecordLastApplied(id)); } } // namespace mongo diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index f8c70309ae2..eeddfafd624 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -253,7 +253,6 @@ public: * - when using ReadSource::kNoOverlap, the timestamp chosen by the storage engine. * - when using ReadSource::kAllDurableSnapshot, the timestamp chosen using the storage * engine's all_durable timestamp. - * - when using ReadSource::kLastApplied, the timestamp chosen using the storage engine's last * applied timestamp. Can return boost::none if no timestamp has been established. * - when using ReadSource::kMajorityCommitted, the majority committed timestamp chosen by the * storage engine after a transaction has been opened or after a call to @@ -399,11 +398,6 @@ public: */ kNoOverlap, /** - * Read from the last applied timestamp. New transactions start at the most up-to-date - * timestamp. - */ - kLastApplied, - /** * Read from the all_durable timestamp. New transactions will always read from the same * timestamp and never advance. */ @@ -414,6 +408,24 @@ public: kProvided }; + static std::string toString(ReadSource rs) { + switch (rs) { + case ReadSource::kUnset: + return "kUnset"; + case ReadSource::kNoTimestamp: + return "kNoTimestamp"; + case ReadSource::kMajorityCommitted: + return "kMajorityCommitted"; + case ReadSource::kNoOverlap: + return "kNoOverlap"; + case ReadSource::kAllDurableSnapshot: + return "kAllDurableSnapshot"; + case ReadSource::kProvided: + return "kProvided"; + } + MONGO_UNREACHABLE; + } + /** * Sets which timestamp to use for read transactions. If 'provided' is supplied, only kProvided * is an acceptable input. diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp new file mode 100644 index 00000000000..777868f7830 --- /dev/null +++ b/src/mongo/db/storage/snapshot_helper.cpp @@ -0,0 +1,151 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kStorage + +#include "mongo/platform/basic.h" + +#include "mongo/db/storage/snapshot_helper.h" + +#include "mongo/db/repl/read_concern_args.h" +#include "mongo/db/repl/replication_coordinator.h" +#include "mongo/logv2/log.h" + +namespace mongo { +namespace SnapshotHelper { +bool canSwitchReadSource(OperationContext* opCtx) { + + // Most readConcerns have behavior controlled at higher levels. Local and available are the only + // ReadConcerns that should consider changing, since they read without a timestamp by default. + const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel(); + if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern || + readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) { + return true; + } + + return false; +} + +bool shouldReadAtNoOverlap(OperationContext* opCtx, + const NamespaceString& nss, + std::string* reason) { + + // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot read + // at no-overlap. It's important to note that it is possible for this to be false, but still be + // holding the PBWM lock, explained below. + if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) { + *reason = "conflicts with batch application"; + return false; + } + + // If we are already holding the PBWM lock, do not read at no-overlap. Snapshots acquired by an + // operation after a yield/restore must see all writes in the pre-yield snapshot. Once a + // snapshot is reading without a timestamp, we choose to continue acquiring snapshots without a + // timestamp. This is done in lieu of determining a timestamp far enough in the future that's + // guaranteed to observe all previous writes. This may occur when multiple collection locks are + // held concurrently, which is often the case when DBDirectClient is used. + if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) { + *reason = "PBWM lock is held"; + LOGV2_DEBUG(20577, 1, "not reading at no-overlap because the PBWM lock is held"); + return false; + } + + // If we are in a replication state (like secondary or primary catch-up) where we are not + // accepting writes, we should read at no-overlap. If this node can accept writes, then no + // conflicting replication batches are being applied and we can read from the default snapshot. + if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) { + *reason = "primary"; + return false; + } + + // Non-replicated collections do not need to read at no-overlap, as those collections are not + // written by the replication system. However, the oplog is special, as it *is* written by the + // replication system. + if (!nss.isReplicated() && !nss.isOplog()) { + *reason = "unreplicated collection"; + return false; + } + + return true; +} +boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx, + const NamespaceString& nss) { + const bool canSwitch = canSwitchReadSource(opCtx); + if (!canSwitch) { + return boost::none; + } + + const auto existing = opCtx->recoveryUnit()->getTimestampReadSource(); + std::string reason; + const bool readAtNoOverlap = shouldReadAtNoOverlap(opCtx, nss, &reason); + if (existing == RecoveryUnit::ReadSource::kUnset) { + // Shifting from reading without a timestamp to reading with a timestamp can be dangerous + // because writes will appear to vanish. This case is intended for new reads on secondaries + // and query yield recovery after state transitions from primary to secondary. + if (readAtNoOverlap) { + LOGV2_DEBUG(4452901, 2, "Changing ReadSource to kNoOverlap", logAttrs(nss)); + return RecoveryUnit::ReadSource::kNoOverlap; + } + } else if (existing == RecoveryUnit::ReadSource::kNoOverlap) { + // For some reason, we can no longer read at kNoOverlap. + // An operation that yields a timestamped snapshot must restore a snapshot with at least as + // large of a timestamp, or with proper consideration of rollback scenarios, no timestamp. + // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to + // reading without one. More writes will become visible. + if (!readAtNoOverlap) { + LOGV2_DEBUG( + 4452902, 2, "Changing ReadSource to kUnset", logAttrs(nss), "reason"_attr = reason); + // This shift to kUnset assumes that callers will not make future attempts to manipulate + // their ReadSources after performing reads at an un-timetamped snapshot. The only + // exception is callers of this function that may need to change from kUnset to + // kNoOverlap in the event of a catalog conflict or query yield. + return RecoveryUnit::ReadSource::kUnset; + } + } + return boost::none; +} + +bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin, + boost::optional<Timestamp> readTimestamp) { + // This is the timestamp of the most recent catalog changes to this collection. If this is + // greater than any point in time read timestamps, we should either wait or return an error. + if (!collectionMin) { + return false; + } + + // If we do not have a point in time to conflict with collectionMin, return. + if (!readTimestamp || readTimestamp->isNull()) { + return false; + } + + // Return if there are no conflicting catalog changes with the readTimestamp. + return *collectionMin > readTimestamp; +} +} // namespace SnapshotHelper +} // namespace mongo
\ No newline at end of file diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h new file mode 100644 index 00000000000..fa8fdd85f24 --- /dev/null +++ b/src/mongo/db/storage/snapshot_helper.h @@ -0,0 +1,43 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/operation_context.h" + +namespace mongo { +namespace SnapshotHelper { +// Returns a ReadSource if we should change our current ReadSource. Returns boost::none otherwise. +boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx, + const NamespaceString& nss); + +bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin, + boost::optional<Timestamp> readTimestamp); +} // namespace SnapshotHelper +} // namespace mongo diff --git a/src/mongo/db/storage/snapshot_manager.h b/src/mongo/db/storage/snapshot_manager.h index d41df1c5013..7839ef179ba 100644 --- a/src/mongo/db/storage/snapshot_manager.h +++ b/src/mongo/db/storage/snapshot_manager.h @@ -59,14 +59,14 @@ public: virtual void setCommittedSnapshot(const Timestamp& timestamp) = 0; /** - * Sets the snapshot for the last stable timestamp for reading on secondaries. + * Sets the lastApplied timestamp. */ - virtual void setLocalSnapshot(const Timestamp& timestamp) = 0; + virtual void setLastApplied(const Timestamp& timestamp) = 0; /** - * Returns the local snapshot timestamp. + * Returns the lastApplied timestamp. */ - virtual boost::optional<Timestamp> getLocalSnapshot() = 0; + virtual boost::optional<Timestamp> getLastApplied() = 0; /** * Drops all snapshots and clears the "committed" snapshot. diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 661b0e009c1..352699a9de8 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -668,6 +668,12 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName, setInitialDataTimestamp(_recoveryTimestamp); setOldestTimestamp(_recoveryTimestamp, false); setStableTimestamp(_recoveryTimestamp, false); + + _sessionCache->snapshotManager().setLastApplied(_recoveryTimestamp); + { + stdx::lock_guard<Latch> lk(_highestDurableTimestampMutex); + _highestSeenDurableTimestamp = _recoveryTimestamp.asULL(); + } } } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp index 8cc45e0201e..f9d9751d464 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp @@ -453,7 +453,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() // The following ReadSources can only establish a read timestamp when a transaction is // opened. case ReadSource::kNoOverlap: - case ReadSource::kLastApplied: case ReadSource::kAllDurableSnapshot: break; } @@ -462,14 +461,14 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() getSession(); switch (_timestampReadSource) { - case ReadSource::kLastApplied: - // The lastApplied timestamp is not always available, so it is not possible to invariant - // that it exists as other ReadSources do. + case ReadSource::kNoOverlap: + // The lastApplied and allDurable timestamps are not always available if the system has + // not accepted writes, so it is not possible to invariant that it exists as other + // ReadSources do. if (!_readAtTimestamp.isNull()) { return _readAtTimestamp; } return boost::none; - case ReadSource::kNoOverlap: case ReadSource::kAllDurableSnapshot: invariant(!_readAtTimestamp.isNull()); return _readAtTimestamp; @@ -515,17 +514,6 @@ void WiredTigerRecoveryUnit::_txnOpen() { session, _prepareConflictBehavior, _roundUpPreparedTimestamps); break; } - case ReadSource::kLastApplied: { - if (_sessionCache->snapshotManager().getLocalSnapshot()) { - _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot( - session, _prepareConflictBehavior, _roundUpPreparedTimestamps); - } else { - WiredTigerBeginTxnBlock( - session, _prepareConflictBehavior, _roundUpPreparedTimestamps) - .done(); - } - break; - } case ReadSource::kNoOverlap: { _readAtTimestamp = _beginTransactionAtNoOverlapTimestamp(session); break; @@ -555,8 +543,9 @@ void WiredTigerRecoveryUnit::_txnOpen() { LOGV2_DEBUG(22414, 3, - "WT begin_transaction for snapshot id {getSnapshotId_toNumber}", - "getSnapshotId_toNumber"_attr = getSnapshotId().toNumber()); + "WT begin_transaction", + "snapshotId"_attr = getSnapshotId().toNumber(), + "readSource"_attr = toString(_timestampReadSource)); } Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESSION* session) { @@ -578,8 +567,8 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtAllDurableTimestamp(WT_SESS Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSION* session) { - auto lastApplied = _sessionCache->snapshotManager().getLocalSnapshot(); - Timestamp allDurable = Timestamp(_oplogManager->fetchAllDurableValue(session->connection)); + auto lastApplied = _sessionCache->snapshotManager().getLastApplied(); + Timestamp allDurable = Timestamp(_sessionCache->getKVEngine()->getAllDurableTimestamp()); // When using timestamps for reads and writes, it's important that readers and writers don't // overlap with the timestamps they use. In other words, at any point in the system there should @@ -607,17 +596,34 @@ Timestamp WiredTigerRecoveryUnit::_beginTransactionAtNoOverlapTimestamp(WT_SESSI // should read afterward. Timestamp readTimestamp = (lastApplied) ? std::min(*lastApplied, allDurable) : allDurable; + if (readTimestamp.isNull()) { + // When there is not an all_durable or lastApplied timestamp available, read without a + // timestamp. Do not round up the read timestamp to the oldest timestamp. + + // There is a race that allows new transactions to start between the time we check for a + // read timestamp and start our transaction, which can temporarily violate the contract of + // kNoOverlap. That is, writes will be visible that occur after the all_durable time. This + // is only possible for readers that start immediately after an initial sync that did not + // replicate any oplog entries. Future transactions will start reading at a timestamp once + // timestamped writes have been made. + WiredTigerBeginTxnBlock txnOpen( + session, _prepareConflictBehavior, _roundUpPreparedTimestamps); + LOGV2_DEBUG(4452900, 1, "no read timestamp available for kNoOverlap"); + txnOpen.done(); + return readTimestamp; + } + WiredTigerBeginTxnBlock txnOpen(session, _prepareConflictBehavior, _roundUpPreparedTimestamps, RoundUpReadTimestamp::kRound); auto status = txnOpen.setReadSnapshot(readTimestamp); fassert(51066, status); + txnOpen.done(); - // We might have rounded to oldest between calling getAllDurable and setReadSnapshot. We need - // to get the actual read timestamp we used. + // We might have rounded to oldest between calling getAllDurable and setReadSnapshot. We + // need to get the actual read timestamp we used. readTimestamp = _getTransactionReadTimestamp(session); - txnOpen.done(); return readTimestamp; } @@ -769,15 +775,14 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource, boost::optional<Timestamp> provided) { LOGV2_DEBUG(22416, 3, - "setting timestamp read source: {static_cast_int_readSource}, provided timestamp: " - "{provided_provided_none}", - "static_cast_int_readSource"_attr = static_cast<int>(readSource), - "provided_provided_none"_attr = ((provided) ? provided->toString() : "none")); + "setting timestamp read source", + "readSource"_attr = toString(readSource), + "provided"_attr = ((provided) ? provided->toString() : "none")); invariant(!_isActive() || _timestampReadSource == readSource, str::stream() << "Current state: " << toString(_getState()) << ". Invalid internal state while setting timestamp read source: " - << static_cast<int>(readSource) << ", provided timestamp: " + << toString(readSource) << ", provided timestamp: " << (provided ? provided->toString() : "none")); invariant(!provided == (readSource != ReadSource::kProvided)); invariant(!(provided && provided->isNull())); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp index 314a524063e..1f440d319b7 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp @@ -163,11 +163,14 @@ public: clientAndCtx2 = makeClientAndOpCtx(harnessHelper.get(), "reader"); ru1 = checked_cast<WiredTigerRecoveryUnit*>(clientAndCtx1.second->recoveryUnit()); ru2 = checked_cast<WiredTigerRecoveryUnit*>(clientAndCtx2.second->recoveryUnit()); + snapshotManager = dynamic_cast<WiredTigerSnapshotManager*>( + harnessHelper->getEngine()->getSnapshotManager()); } std::unique_ptr<WiredTigerRecoveryUnitHarnessHelper> harnessHelper; ClientAndCtx clientAndCtx1, clientAndCtx2; WiredTigerRecoveryUnit *ru1, *ru2; + WiredTigerSnapshotManager* snapshotManager; private: const char* wt_uri = "table:prepare_transaction"; @@ -180,6 +183,86 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, SetReadSource) { ASSERT_EQ(Timestamp(1, 1), ru1->getPointInTimeReadTimestamp()); } +TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) { + OperationContext* opCtx1 = clientAndCtx1.second.get(); + std::unique_ptr<RecordStore> rs(harnessHelper->createRecordStore(opCtx1, "a.b")); + + const std::string str = str::stream() << "test"; + const Timestamp ts1{1, 1}; + const Timestamp ts2{1, 2}; + const Timestamp ts3{1, 2}; + + RecordId rid1; + { + WriteUnitOfWork wuow(opCtx1); + StatusWith<RecordId> res = rs->insertRecord(opCtx1, str.c_str(), str.size() + 1, ts1); + ASSERT_OK(res); + wuow.commit(); + rid1 = res.getValue(); + snapshotManager->setLastApplied(ts1); + } + + // Read without a timestamp. The write should be visible. + ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset); + RecordData unused; + ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); + + // Read with kNoOverlap. The write should be visible. + opCtx1->recoveryUnit()->abandonSnapshot(); + opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap); + ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); + + RecordId rid2, rid3; + { + // Start, but do not commit a transaction with opCtx2. This sets a timestamp at ts2, which + // creates a hole. kNoOverlap, which is a function of all_durable, will only be able to read + // at the time immediately before. + OperationContext* opCtx2 = clientAndCtx2.second.get(); + WriteUnitOfWork wuow(opCtx2); + StatusWith<RecordId> res = + rs->insertRecord(opCtx2, str.c_str(), str.size() + 1, Timestamp()); + ASSERT_OK(opCtx2->recoveryUnit()->setTimestamp(ts2)); + ASSERT_OK(res); + + // While holding open a transaction with opCtx2, perform an insert at ts3 with opCtx1. This + // creates a "hole". + { + WriteUnitOfWork wuow(opCtx1); + StatusWith<RecordId> res = rs->insertRecord(opCtx1, str.c_str(), str.size() + 1, ts3); + ASSERT_OK(res); + wuow.commit(); + rid3 = res.getValue(); + snapshotManager->setLastApplied(ts3); + } + + // Read without a timestamp, and we should see the first and third records. + opCtx1->recoveryUnit()->abandonSnapshot(); + opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); + ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused)); + ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused)); + + // Now read at kNoOverlap. Since the transaction at ts2 has not committed, all_durable is + // held back to ts1. LastApplied has advanced to ts3, but because kNoOverlap is the minimum, + // we should only see one record. + opCtx1->recoveryUnit()->abandonSnapshot(); + opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap); + ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); + ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused)); + ASSERT_FALSE(rs->findRecord(opCtx1, rid3, &unused)); + + wuow.commit(); + rid2 = res.getValue(); + } + + // Now that the hole has been closed, kNoOverlap should see all 3 records. + opCtx1->recoveryUnit()->abandonSnapshot(); + opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap); + ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); + ASSERT_TRUE(rs->findRecord(opCtx1, rid2, &unused)); + ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused)); +} + TEST_F(WiredTigerRecoveryUnitTestFixture, CreateAndCheckForCachePressure) { int time = 1; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp index 74988508a4c..efc2d8adfe5 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.cpp @@ -48,17 +48,17 @@ void WiredTigerSnapshotManager::setCommittedSnapshot(const Timestamp& timestamp) _committedSnapshot = timestamp; } -void WiredTigerSnapshotManager::setLocalSnapshot(const Timestamp& timestamp) { - stdx::lock_guard<Latch> lock(_localSnapshotMutex); +void WiredTigerSnapshotManager::setLastApplied(const Timestamp& timestamp) { + stdx::lock_guard<Latch> lock(_lastAppliedMutex); if (timestamp.isNull()) - _localSnapshot = boost::none; + _lastApplied = boost::none; else - _localSnapshot = timestamp; + _lastApplied = timestamp; } -boost::optional<Timestamp> WiredTigerSnapshotManager::getLocalSnapshot() { - stdx::lock_guard<Latch> lock(_localSnapshotMutex); - return _localSnapshot; +boost::optional<Timestamp> WiredTigerSnapshotManager::getLastApplied() { + stdx::lock_guard<Latch> lock(_lastAppliedMutex); + return _lastApplied; } void WiredTigerSnapshotManager::dropAllSnapshots() { @@ -93,23 +93,4 @@ Timestamp WiredTigerSnapshotManager::beginTransactionOnCommittedSnapshot( return *_committedSnapshot; } -Timestamp WiredTigerSnapshotManager::beginTransactionOnLocalSnapshot( - WT_SESSION* session, - PrepareConflictBehavior prepareConflictBehavior, - RoundUpPreparedTimestamps roundUpPreparedTimestamps) const { - WiredTigerBeginTxnBlock txnOpen(session, prepareConflictBehavior, roundUpPreparedTimestamps); - - stdx::lock_guard<Latch> lock(_localSnapshotMutex); - invariant(_localSnapshot); - LOGV2_DEBUG(22427, - 3, - "begin_transaction on local snapshot {localSnapshot_get}", - "localSnapshot_get"_attr = _localSnapshot.get().toString()); - auto status = txnOpen.setReadSnapshot(_localSnapshot.get()); - fassert(50775, status); - - txnOpen.done(); - return *_localSnapshot; -} - } // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h index 1726a7d4c2b..b285c694e70 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_snapshot_manager.h @@ -51,8 +51,8 @@ public: WiredTigerSnapshotManager() = default; void setCommittedSnapshot(const Timestamp& timestamp) final; - void setLocalSnapshot(const Timestamp& timestamp) final; - boost::optional<Timestamp> getLocalSnapshot() final; + void setLastApplied(const Timestamp& timestamp) final; + boost::optional<Timestamp> getLastApplied() final; void dropAllSnapshots() final; // @@ -70,16 +70,6 @@ public: RoundUpPreparedTimestamps roundUpPreparedTimestamps) const; /** - * Starts a transaction on the last stable local timestamp, set by setLocalSnapshot. - * - * Throws if no local snapshot has been set. - */ - Timestamp beginTransactionOnLocalSnapshot( - WT_SESSION* session, - PrepareConflictBehavior prepareConflictBehavior, - RoundUpPreparedTimestamps roundUpPreparedTimestamps) const; - - /** * Returns lowest SnapshotName that could possibly be used by a future call to * beginTransactionOnCommittedSnapshot, or boost::none if there is currently no committed * snapshot. @@ -95,9 +85,9 @@ private: MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_committedSnapshotMutex"); boost::optional<Timestamp> _committedSnapshot; - // Snapshot to use for reads at a local stable timestamp. - mutable Mutex _localSnapshotMutex = // Guards _localSnapshot. - MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_localSnapshotMutex"); - boost::optional<Timestamp> _localSnapshot; + // Timestamp to use for reads at a the lastApplied timestamp. + mutable Mutex _lastAppliedMutex = // Guards _lastApplied. + MONGO_MAKE_LATCH("WiredTigerSnapshotManager::_lastAppliedMutex"); + boost::optional<Timestamp> _lastApplied; }; } // namespace mongo |