diff options
author | Gregory Noma <gregory.noma@gmail.com> | 2022-10-14 13:00:45 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-10-14 14:05:52 +0000 |
commit | a81af278bf063f677c6b23ae4c50c537edf7b07a (patch) | |
tree | 14c25e54851eb2b41b813ee668b6e5873caaf98d | |
parent | 95180bdfb528456a3a6b19ab185282f57cc3c926 (diff) | |
download | mongo-a81af278bf063f677c6b23ae4c50c537edf7b07a.tar.gz |
SERVER-70431 Use `ResourceMutex` for checkpointing
-rw-r--r-- | src/mongo/db/catalog/validate_state.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_engine.h | 6 | ||||
-rw-r--r-- | src/mongo/db/storage/storage_engine.h | 28 | ||||
-rw-r--r-- | src/mongo/db/storage/storage_engine_impl.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/storage/storage_engine_impl.h | 3 | ||||
-rw-r--r-- | src/mongo/db/storage/storage_engine_mock.h | 5 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 129 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h | 8 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp | 7 |
11 files changed, 140 insertions, 70 deletions
diff --git a/src/mongo/db/catalog/validate_state.cpp b/src/mongo/db/catalog/validate_state.cpp index 862eff71fac..07d6e462549 100644 --- a/src/mongo/db/catalog/validate_state.cpp +++ b/src/mongo/db/catalog/validate_state.cpp @@ -229,12 +229,13 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { _dataThrottle.turnThrottlingOff(); } - boost::optional<Lock::ResourceLock> checkpointLock; - if (isBackground()) { - // Acquires a resource mutex to prevent taking a checkpoint during opening the checkpoint - // cursors to make sure all cursors are reading from the same point in time. - checkpointLock.emplace(opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X); - } + // Acquire the checkpoint lock to prevent a checkpoint from being taken while we are opening our + // checkpoint cursors. This ensures all cursors are reading from the same point in time. + auto checkpointLock = isBackground() + ? opCtx->getServiceContext()->getStorageEngine()->getCheckpointLock( + opCtx, StorageEngine::CheckpointLock::Mode::kShared) + : nullptr; + StringSet readyDurableIndexes; try { _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( diff --git a/src/mongo/db/storage/kv/kv_engine.h b/src/mongo/db/storage/kv/kv_engine.h index 44e212172c0..d3ce95f8c58 100644 --- a/src/mongo/db/storage/kv/kv_engine.h +++ b/src/mongo/db/storage/kv/kv_engine.h @@ -263,6 +263,12 @@ public: virtual void checkpoint(OperationContext* opCtx) {} + virtual std::unique_ptr<StorageEngine::CheckpointLock> getCheckpointLock( + OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) { + uasserted(ErrorCodes::CommandNotSupported, + "The current storage engine does not support checkpoints"); + } + /** * Returns true if the KVEngine is ephemeral -- that is, it is NOT persistent and all data is * lost after shutdown. Otherwise, returns false. diff --git a/src/mongo/db/storage/storage_engine.h b/src/mongo/db/storage/storage_engine.h index 8f221ebc50e..da139aa8e9f 100644 --- a/src/mongo/db/storage/storage_engine.h +++ b/src/mongo/db/storage/storage_engine.h @@ -170,6 +170,28 @@ public: }; /** + * RAII lock to protect checkpointing. Operations performing a checkpoint take this lock in + * exclusive mode. Operations which need to conflict with checkpointing but do not need to + * conflict with each other may take this lock in shared mode. + */ + class CheckpointLock { + public: + enum class Mode { + kShared, + kExclusive, + }; + + CheckpointLock(const CheckpointLock&) = delete; + CheckpointLock(CheckpointLock&&) = delete; + CheckpointLock& operator=(const CheckpointLock&) = delete; + + virtual ~CheckpointLock() = default; + + protected: + CheckpointLock() = default; + }; + + /** * The destructor should only be called if we are tearing down but not exiting the process. */ virtual ~StorageEngine() {} @@ -484,6 +506,12 @@ public: virtual void checkpoint(OperationContext* opCtx) = 0; /** + * Returns a checkpoint lock in the requested mode. + */ + virtual std::unique_ptr<CheckpointLock> getCheckpointLock(OperationContext* opCtx, + CheckpointLock::Mode mode) = 0; + + /** * Recovers the storage engine state to the last stable timestamp. "Stable" in this case * refers to a timestamp that is guaranteed to never be rolled back. The stable timestamp * used should be one provided by StorageEngine::setStableTimestamp(). diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index d29e02122cc..de3df2908ee 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -1242,6 +1242,11 @@ void StorageEngineImpl::checkpoint(OperationContext* opCtx) { _engine->checkpoint(opCtx); } +std::unique_ptr<StorageEngine::CheckpointLock> StorageEngineImpl::getCheckpointLock( + OperationContext* opCtx, CheckpointLock::Mode mode) { + return _engine->getCheckpointLock(opCtx, mode); +} + void StorageEngineImpl::_onMinOfCheckpointAndOldestTimestampChanged(const Timestamp& timestamp) { // No drop-pending idents present if getEarliestDropTimestamp() returns boost::none. if (auto earliestDropTimestamp = _dropPendingIdentReaper.getEarliestDropTimestamp()) { diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h index be302bed7e2..ed75065cfaa 100644 --- a/src/mongo/db/storage/storage_engine_impl.h +++ b/src/mongo/db/storage/storage_engine_impl.h @@ -327,6 +327,9 @@ public: void checkpoint(OperationContext* opCtx) override; + std::unique_ptr<CheckpointLock> getCheckpointLock(OperationContext* opCtx, + CheckpointLock::Mode mode) override; + StatusWith<ReconcileResult> reconcileCatalogAndIdents( OperationContext* opCtx, LastShutdownState lastShutdownState) override; diff --git a/src/mongo/db/storage/storage_engine_mock.h b/src/mongo/db/storage/storage_engine_mock.h index 91fefa20355..897dde20b33 100644 --- a/src/mongo/db/storage/storage_engine_mock.h +++ b/src/mongo/db/storage/storage_engine_mock.h @@ -181,6 +181,11 @@ public: void checkpoint(OperationContext* opCtx) final {} + std::unique_ptr<CheckpointLock> getCheckpointLock(OperationContext* opCtx, + CheckpointLock::Mode mode) final { + return nullptr; + } + int64_t sizeOnDiskForDb(OperationContext* opCtx, const DatabaseName& dbName) final { return 0; } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp index 70819b920ef..147c61a7c84 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp @@ -504,7 +504,8 @@ Status WiredTigerIndex::compact(OperationContext* opCtx) { opCtx->recoveryUnit()->abandonSnapshot(); // WT compact prompts WT to take checkpoints, so we need to take the checkpoint lock around // WT compact calls. - Lock::ResourceLock checkpointLock{opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X}; + auto checkpointLock = opCtx->getServiceContext()->getStorageEngine()->getCheckpointLock( + opCtx, StorageEngine::CheckpointLock::Mode::kExclusive); int ret = s->compact(s, uri().c_str(), "timeout=0"); if (MONGO_unlikely(WTCompactIndexEBUSY.shouldFail())) { ret = EBUSY; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 0b42e60183b..73041899938 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -141,6 +141,24 @@ constexpr bool kThreadSanitizerEnabled = true; constexpr bool kThreadSanitizerEnabled = false; #endif +class WiredTigerCheckpointLock : public StorageEngine::CheckpointLock { +public: + WiredTigerCheckpointLock(OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) + : _lock([&]() -> stdx::variant<Lock::SharedLock, Lock::ExclusiveLock> { + static Lock::ResourceMutex mutex{"checkpoint"}; + switch (mode) { + case StorageEngine::CheckpointLock::Mode::kShared: + return Lock::SharedLock{opCtx, mutex}; + case StorageEngine::CheckpointLock::Mode::kExclusive: + return Lock::ExclusiveLock{opCtx, mutex}; + } + MONGO_UNREACHABLE; + }()) {} + +private: + stdx::variant<Lock::SharedLock, Lock::ExclusiveLock> _lock; +}; + boost::filesystem::path getOngoingBackupPath() { return boost::filesystem::path(storageGlobalParams.dbpath) / WiredTigerBackup::kOngoingBackupFile; @@ -1892,15 +1910,19 @@ bool WiredTigerKVEngine::supportsDirectoryPerDB() const { return true; } -void WiredTigerKVEngine::_checkpoint(OperationContext* opCtx, WT_SESSION* session) { +void WiredTigerKVEngine::_checkpoint(OperationContext* opCtx, WT_SESSION* session) try { // Ephemeral WiredTiger instances cannot do a checkpoint to disk as there is no disk backing // the data. if (_ephemeral) { return; } + + // Limits the actions of concurrent checkpoint callers as we update some internal data during a + // checkpoint. WT has a mutex of its own to only have one checkpoint active at all times so this + // is only to protect our internal updates. // TODO: SERVER-64507: Investigate whether we can smartly rely on one checkpointer if two or // more threads checkpoint at the same time. - stdx::lock_guard lk(_checkpointMutex); + auto checkpointLock = getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive); const Timestamp stableTimestamp = getStableTimestamp(); const Timestamp initialDataTimestamp = getInitialDataTimestamp(); @@ -1920,63 +1942,57 @@ void WiredTigerKVEngine::_checkpoint(OperationContext* opCtx, WT_SESSION* sessio // safely assume that the oplog needed for crash recovery has caught up to the recorded value. // After the checkpoint, this value will be published such that actors which truncate the oplog // can read an updated value. - try { - // Three cases: - // - // First, initialDataTimestamp is Timestamp(0, 1) -> Take full checkpoint. This is when - // there is no consistent view of the data (e.g: during initial sync). - // - // Second, stableTimestamp < initialDataTimestamp: Skip checkpoints. The data on disk is - // prone to being rolled back. Hold off on checkpoints. Hope that the stable timestamp - // surpasses the data on disk, allowing storage to persist newer copies to disk. - // - // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady state - // case. - if (initialDataTimestamp.asULL() <= 1) { - Lock::ResourceLock checkpointLock{ - opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X}; - invariantWTOK(session->checkpoint(session, "use_timestamp=false"), session); - LOGV2_FOR_RECOVERY(5576602, - 2, - "Completed unstable checkpoint.", - "initialDataTimestamp"_attr = initialDataTimestamp.toString()); - clearIndividuallyCheckpointedIndexes(); - } else if (stableTimestamp < initialDataTimestamp) { - LOGV2_FOR_RECOVERY( - 23985, - 2, - "Stable timestamp is behind the initial data timestamp, skipping a checkpoint.", - "stableTimestamp"_attr = stableTimestamp.toString(), - "initialDataTimestamp"_attr = initialDataTimestamp.toString()); - } else { - auto oplogNeededForRollback = getOplogNeededForRollback(); - LOGV2_FOR_RECOVERY(23986, - 2, - "Performing stable checkpoint.", - "stableTimestamp"_attr = stableTimestamp, - "oplogNeededForRollback"_attr = toString(oplogNeededForRollback)); + // Three cases: + // + // First, initialDataTimestamp is Timestamp(0, 1) -> Take full checkpoint. This is when there is + // no consistent view of the data (e.g: during initial sync). + // + // Second, stableTimestamp < initialDataTimestamp: Skip checkpoints. The data on disk is prone + // to being rolled back. Hold off on checkpoints. Hope that the stable timestamp surpasses the + // data on disk, allowing storage to persist newer copies to disk. + // + // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady state case. + if (initialDataTimestamp.asULL() <= 1) { + invariantWTOK(session->checkpoint(session, "use_timestamp=false"), session); + LOGV2_FOR_RECOVERY(5576602, + 2, + "Completed unstable checkpoint.", + "initialDataTimestamp"_attr = initialDataTimestamp.toString()); + clearIndividuallyCheckpointedIndexes(); + } else if (stableTimestamp < initialDataTimestamp) { + LOGV2_FOR_RECOVERY( + 23985, + 2, + "Stable timestamp is behind the initial data timestamp, skipping a checkpoint.", + "stableTimestamp"_attr = stableTimestamp.toString(), + "initialDataTimestamp"_attr = initialDataTimestamp.toString()); + } else { + auto oplogNeededForRollback = getOplogNeededForRollback(); - { - Lock::ResourceLock checkpointLock{ - opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X}; - invariantWTOK(session->checkpoint(session, "use_timestamp=true"), session); - clearIndividuallyCheckpointedIndexes(); - } + LOGV2_FOR_RECOVERY(23986, + 2, + "Performing stable checkpoint.", + "stableTimestamp"_attr = stableTimestamp, + "oplogNeededForRollback"_attr = toString(oplogNeededForRollback)); - if (oplogNeededForRollback.isOK()) { - // Now that the checkpoint is durable, publish the oplog needed to recover from it. - _oplogNeededForCrashRecovery.store(oplogNeededForRollback.getValue().asULL()); - } + { + invariantWTOK(session->checkpoint(session, "use_timestamp=true"), session); + clearIndividuallyCheckpointedIndexes(); + } + + if (oplogNeededForRollback.isOK()) { + // Now that the checkpoint is durable, publish the oplog needed to recover from it. + _oplogNeededForCrashRecovery.store(oplogNeededForRollback.getValue().asULL()); } - } catch (const WriteConflictException&) { - LOGV2_WARNING(22346, "Checkpoint encountered a write conflict exception."); - } catch (const AssertionException& exc) { - invariant(exc.code() == ErrorCodes::InterruptedAtShutdown || - exc.code() == ErrorCodes::Interrupted, - exc.toString()); - LOGV2(7021300, "Skipping checkpoint due to exception", "exception"_attr = exc.toStatus()); } +} catch (const WriteConflictException&) { + LOGV2_WARNING(22346, "Checkpoint encountered a write conflict exception."); +} catch (const AssertionException& exc) { + invariant(exc.code() == ErrorCodes::InterruptedAtShutdown || + exc.code() == ErrorCodes::Interrupted, + exc.toString()); + LOGV2(7021300, "Skipping checkpoint due to exception", "exception"_attr = exc.toStatus()); } void WiredTigerKVEngine::checkpoint(OperationContext* opCtx) { @@ -1985,6 +2001,11 @@ void WiredTigerKVEngine::checkpoint(OperationContext* opCtx) { return _checkpoint(opCtx, s); } +std::unique_ptr<StorageEngine::CheckpointLock> WiredTigerKVEngine::getCheckpointLock( + OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) { + return std::make_unique<WiredTigerCheckpointLock>(opCtx, mode); +} + bool WiredTigerKVEngine::hasIdent(OperationContext* opCtx, StringData ident) const { return _hasUri(WiredTigerRecoveryUnit::get(opCtx)->getSession()->getSession(), _uri(ident)); } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h index 705f0dc04cb..c1128d6292b 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h @@ -128,6 +128,9 @@ public: void checkpoint(OperationContext* opCtx) override; + std::unique_ptr<StorageEngine::CheckpointLock> getCheckpointLock( + OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) override; + bool isEphemeral() const override { return _ephemeral; } @@ -553,10 +556,5 @@ private: // Pins the oplog so that OplogStones will not truncate oplog history equal or newer to this // timestamp. AtomicWord<std::uint64_t> _pinnedOplogTimestamp; - - // Limits the actions of concurrent checkpoint callers as we update some internal data during a - // checkpoint. WT has a mutex of its own to only have one checkpoint active at all times so this - // is only to protect our internal updates. - Mutex _checkpointMutex = MONGO_MAKE_LATCH("WiredTigerKVEngine::_checkpointMutex"); }; } // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 8bb44e80162..d3f8d26b3f5 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -1738,7 +1738,8 @@ Status WiredTigerRecordStore::doCompact(OperationContext* opCtx) { opCtx->recoveryUnit()->abandonSnapshot(); // WT compact prompts WT to take checkpoints, so we need to take the checkpoint lock around // WT compact calls. - Lock::ResourceLock checkpointLock{opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X}; + auto checkpointLock = + _kvEngine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive); int ret = s->compact(s, getURI().c_str(), "timeout=0"); if (MONGO_unlikely(WTCompactRecordStoreEBUSY.shouldFail())) { ret = EBUSY; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp index 5e8337030d1..5fbfe2b8a7c 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp @@ -296,8 +296,8 @@ void WiredTigerSessionCache::waitUntilDurable(OperationContext* opCtx, auto config = syncType == Fsync::kCheckpointStableTimestamp ? "use_timestamp=true" : "use_timestamp=false"; { - Lock::ResourceLock checkpointLock{ - opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X}; + auto checkpointLock = _engine->getCheckpointLock( + opCtx, StorageEngine::CheckpointLock::Mode::kExclusive); invariantWTOK(s->checkpoint(s, config), s); _engine->clearIndividuallyCheckpointedIndexes(); } @@ -352,7 +352,8 @@ void WiredTigerSessionCache::waitUntilDurable(OperationContext* opCtx, _waitUntilDurableSession); LOGV2_DEBUG(22419, 4, "flushed journal"); } else { - Lock::ResourceLock checkpointLock{opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X}; + auto checkpointLock = + _engine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive); invariantWTOK(_waitUntilDurableSession->checkpoint(_waitUntilDurableSession, nullptr), _waitUntilDurableSession); LOGV2_DEBUG(22420, 4, "created checkpoint"); |