author     Gregory Noma <gregory.noma@gmail.com>             2022-10-14 13:00:45 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2022-10-14 14:05:52 +0000
commit     a81af278bf063f677c6b23ae4c50c537edf7b07a (patch)
tree       14c25e54851eb2b41b813ee668b6e5873caaf98d
parent     95180bdfb528456a3a6b19ab185282f57cc3c926 (diff)
download   mongo-a81af278bf063f677c6b23ae4c50c537edf7b07a.tar.gz
SERVER-70431 Use `ResourceMutex` for checkpointing
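
Replace the ad hoc acquisitions of ResourceId(RESOURCE_MUTEX, "checkpoint") and the
WiredTigerKVEngine::_checkpointMutex latch with a single RAII type,
StorageEngine::CheckpointLock, handed out by StorageEngine::getCheckpointLock().
Checkpointing code paths take the lock in exclusive mode; operations that must not
overlap a checkpoint but may overlap each other (such as background validation opening
checkpoint cursors) take it in shared mode. The WiredTiger implementation wraps a
Lock::SharedLock or Lock::ExclusiveLock over a ResourceMutex; the base KVEngine default
uasserts with CommandNotSupported, and the mock storage engine returns nullptr.

A minimal sketch of the new call pattern (the variable names are illustrative, not from
the patch):

    // Checkpointing paths take the lock exclusively.
    auto exclusiveLock = opCtx->getServiceContext()->getStorageEngine()->getCheckpointLock(
        opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);

    // Paths that only need to keep a checkpoint from starting take it shared.
    auto sharedLock = opCtx->getServiceContext()->getStorageEngine()->getCheckpointLock(
        opCtx, StorageEngine::CheckpointLock::Mode::kShared);

    // Both are RAII: the underlying lock is released when the returned
    // CheckpointLock is destroyed.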
-rw-r--r--  src/mongo/db/catalog/validate_state.cpp                       |  13
-rw-r--r--  src/mongo/db/storage/kv/kv_engine.h                           |   6
-rw-r--r--  src/mongo/db/storage/storage_engine.h                         |  28
-rw-r--r--  src/mongo/db/storage/storage_engine_impl.cpp                  |   5
-rw-r--r--  src/mongo/db/storage/storage_engine_impl.h                    |   3
-rw-r--r--  src/mongo/db/storage/storage_engine_mock.h                    |   5
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp          |   3
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp      | 129
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h        |   8
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp   |   3
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp  |   7
11 files changed, 140 insertions, 70 deletions
diff --git a/src/mongo/db/catalog/validate_state.cpp b/src/mongo/db/catalog/validate_state.cpp
index 862eff71fac..07d6e462549 100644
--- a/src/mongo/db/catalog/validate_state.cpp
+++ b/src/mongo/db/catalog/validate_state.cpp
@@ -229,12 +229,13 @@ void ValidateState::initializeCursors(OperationContext* opCtx) {
_dataThrottle.turnThrottlingOff();
}
- boost::optional<Lock::ResourceLock> checkpointLock;
- if (isBackground()) {
- // Acquires a resource mutex to prevent taking a checkpoint during opening the checkpoint
- // cursors to make sure all cursors are reading from the same point in time.
- checkpointLock.emplace(opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X);
- }
+ // Acquire the checkpoint lock to prevent a checkpoint from being taken while we are opening our
+ // checkpoint cursors. This ensures all cursors are reading from the same point in time.
+ auto checkpointLock = isBackground()
+ ? opCtx->getServiceContext()->getStorageEngine()->getCheckpointLock(
+ opCtx, StorageEngine::CheckpointLock::Mode::kShared)
+ : nullptr;
+
StringSet readyDurableIndexes;
try {
_traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>(
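
With the shared acquisition above, two background validations can hold the checkpoint
lock at the same time, while a checkpoint requested on another thread waits for
exclusive access until every shared holder has released. A rough sketch of that
interaction (illustrative only; the variable names are not from the patch):

    auto* engine = opCtx->getServiceContext()->getStorageEngine();

    // Background validation: shared mode, compatible with other shared holders.
    auto cursorLock = engine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kShared);
    // ... open the record store and index checkpoint cursors at one point in time ...

    // A concurrent checkpointer blocks here until all shared holders are destroyed:
    //   auto ckptLock = engine->getCheckpointLock(checkpointOpCtx,
    //                                             StorageEngine::CheckpointLock::Mode::kExclusive);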
diff --git a/src/mongo/db/storage/kv/kv_engine.h b/src/mongo/db/storage/kv/kv_engine.h
index 44e212172c0..d3ce95f8c58 100644
--- a/src/mongo/db/storage/kv/kv_engine.h
+++ b/src/mongo/db/storage/kv/kv_engine.h
@@ -263,6 +263,12 @@ public:
virtual void checkpoint(OperationContext* opCtx) {}
+ virtual std::unique_ptr<StorageEngine::CheckpointLock> getCheckpointLock(
+ OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) {
+ uasserted(ErrorCodes::CommandNotSupported,
+ "The current storage engine does not support checkpoints");
+ }
+
/**
* Returns true if the KVEngine is ephemeral -- that is, it is NOT persistent and all data is
* lost after shutdown. Otherwise, returns false.
diff --git a/src/mongo/db/storage/storage_engine.h b/src/mongo/db/storage/storage_engine.h
index 8f221ebc50e..da139aa8e9f 100644
--- a/src/mongo/db/storage/storage_engine.h
+++ b/src/mongo/db/storage/storage_engine.h
@@ -170,6 +170,28 @@ public:
};
/**
+ * RAII lock to protect checkpointing. Operations performing a checkpoint take this lock in
+ * exclusive mode. Operations which need to conflict with checkpointing but do not need to
+ * conflict with each other may take this lock in shared mode.
+ */
+ class CheckpointLock {
+ public:
+ enum class Mode {
+ kShared,
+ kExclusive,
+ };
+
+ CheckpointLock(const CheckpointLock&) = delete;
+ CheckpointLock(CheckpointLock&&) = delete;
+ CheckpointLock& operator=(const CheckpointLock&) = delete;
+
+ virtual ~CheckpointLock() = default;
+
+ protected:
+ CheckpointLock() = default;
+ };
+
+ /**
* The destructor should only be called if we are tearing down but not exiting the process.
*/
virtual ~StorageEngine() {}
@@ -484,6 +506,12 @@ public:
virtual void checkpoint(OperationContext* opCtx) = 0;
/**
+ * Returns a checkpoint lock in the requested mode.
+ */
+ virtual std::unique_ptr<CheckpointLock> getCheckpointLock(OperationContext* opCtx,
+ CheckpointLock::Mode mode) = 0;
+
+ /**
* Recovers the storage engine state to the last stable timestamp. "Stable" in this case
* refers to a timestamp that is guaranteed to never be rolled back. The stable timestamp
* used should be one provided by StorageEngine::setStableTimestamp().
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index d29e02122cc..de3df2908ee 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -1242,6 +1242,11 @@ void StorageEngineImpl::checkpoint(OperationContext* opCtx) {
_engine->checkpoint(opCtx);
}
+std::unique_ptr<StorageEngine::CheckpointLock> StorageEngineImpl::getCheckpointLock(
+ OperationContext* opCtx, CheckpointLock::Mode mode) {
+ return _engine->getCheckpointLock(opCtx, mode);
+}
+
void StorageEngineImpl::_onMinOfCheckpointAndOldestTimestampChanged(const Timestamp& timestamp) {
// No drop-pending idents present if getEarliestDropTimestamp() returns boost::none.
if (auto earliestDropTimestamp = _dropPendingIdentReaper.getEarliestDropTimestamp()) {
diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h
index be302bed7e2..ed75065cfaa 100644
--- a/src/mongo/db/storage/storage_engine_impl.h
+++ b/src/mongo/db/storage/storage_engine_impl.h
@@ -327,6 +327,9 @@ public:
void checkpoint(OperationContext* opCtx) override;
+ std::unique_ptr<CheckpointLock> getCheckpointLock(OperationContext* opCtx,
+ CheckpointLock::Mode mode) override;
+
StatusWith<ReconcileResult> reconcileCatalogAndIdents(
OperationContext* opCtx, LastShutdownState lastShutdownState) override;
diff --git a/src/mongo/db/storage/storage_engine_mock.h b/src/mongo/db/storage/storage_engine_mock.h
index 91fefa20355..897dde20b33 100644
--- a/src/mongo/db/storage/storage_engine_mock.h
+++ b/src/mongo/db/storage/storage_engine_mock.h
@@ -181,6 +181,11 @@ public:
void checkpoint(OperationContext* opCtx) final {}
+ std::unique_ptr<CheckpointLock> getCheckpointLock(OperationContext* opCtx,
+ CheckpointLock::Mode mode) final {
+ return nullptr;
+ }
+
int64_t sizeOnDiskForDb(OperationContext* opCtx, const DatabaseName& dbName) final {
return 0;
}
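
Both the mock above and the non-background branch in ValidateState hand back a null
unique_ptr, so callers treat the CheckpointLock as optional and simply let whatever they
hold go out of scope. A small sketch of that convention (illustrative only; `engine` and
`needsCheckpointLock` are placeholder names, not from the patch):

    std::unique_ptr<StorageEngine::CheckpointLock> checkpointLock;  // null means "no lock held"
    if (needsCheckpointLock) {
        checkpointLock = engine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kShared);
    }
    // ... do the work ...
    // If a lock was taken, it is released when checkpointLock is destroyed.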
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp
index 70819b920ef..147c61a7c84 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_index.cpp
@@ -504,7 +504,8 @@ Status WiredTigerIndex::compact(OperationContext* opCtx) {
opCtx->recoveryUnit()->abandonSnapshot();
// WT compact prompts WT to take checkpoints, so we need to take the checkpoint lock around
// WT compact calls.
- Lock::ResourceLock checkpointLock{opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X};
+ auto checkpointLock = opCtx->getServiceContext()->getStorageEngine()->getCheckpointLock(
+ opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);
int ret = s->compact(s, uri().c_str(), "timeout=0");
if (MONGO_unlikely(WTCompactIndexEBUSY.shouldFail())) {
ret = EBUSY;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 0b42e60183b..73041899938 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -141,6 +141,24 @@ constexpr bool kThreadSanitizerEnabled = true;
constexpr bool kThreadSanitizerEnabled = false;
#endif
+class WiredTigerCheckpointLock : public StorageEngine::CheckpointLock {
+public:
+ WiredTigerCheckpointLock(OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode)
+ : _lock([&]() -> stdx::variant<Lock::SharedLock, Lock::ExclusiveLock> {
+ static Lock::ResourceMutex mutex{"checkpoint"};
+ switch (mode) {
+ case StorageEngine::CheckpointLock::Mode::kShared:
+ return Lock::SharedLock{opCtx, mutex};
+ case StorageEngine::CheckpointLock::Mode::kExclusive:
+ return Lock::ExclusiveLock{opCtx, mutex};
+ }
+ MONGO_UNREACHABLE;
+ }()) {}
+
+private:
+ stdx::variant<Lock::SharedLock, Lock::ExclusiveLock> _lock;
+};
+
boost::filesystem::path getOngoingBackupPath() {
return boost::filesystem::path(storageGlobalParams.dbpath) /
WiredTigerBackup::kOngoingBackupFile;
@@ -1892,15 +1910,19 @@ bool WiredTigerKVEngine::supportsDirectoryPerDB() const {
return true;
}
-void WiredTigerKVEngine::_checkpoint(OperationContext* opCtx, WT_SESSION* session) {
+void WiredTigerKVEngine::_checkpoint(OperationContext* opCtx, WT_SESSION* session) try {
// Ephemeral WiredTiger instances cannot do a checkpoint to disk as there is no disk backing
// the data.
if (_ephemeral) {
return;
}
+
+ // Limits the actions of concurrent checkpoint callers as we update some internal data during a
+ // checkpoint. WT has a mutex of its own to only have one checkpoint active at all times so this
+ // is only to protect our internal updates.
// TODO: SERVER-64507: Investigate whether we can smartly rely on one checkpointer if two or
// more threads checkpoint at the same time.
- stdx::lock_guard lk(_checkpointMutex);
+ auto checkpointLock = getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);
const Timestamp stableTimestamp = getStableTimestamp();
const Timestamp initialDataTimestamp = getInitialDataTimestamp();
@@ -1920,63 +1942,57 @@ void WiredTigerKVEngine::_checkpoint(OperationContext* opCtx, WT_SESSION* sessio
// safely assume that the oplog needed for crash recovery has caught up to the recorded value.
// After the checkpoint, this value will be published such that actors which truncate the oplog
// can read an updated value.
- try {
- // Three cases:
- //
- // First, initialDataTimestamp is Timestamp(0, 1) -> Take full checkpoint. This is when
- // there is no consistent view of the data (e.g: during initial sync).
- //
- // Second, stableTimestamp < initialDataTimestamp: Skip checkpoints. The data on disk is
- // prone to being rolled back. Hold off on checkpoints. Hope that the stable timestamp
- // surpasses the data on disk, allowing storage to persist newer copies to disk.
- //
- // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady state
- // case.
- if (initialDataTimestamp.asULL() <= 1) {
- Lock::ResourceLock checkpointLock{
- opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X};
- invariantWTOK(session->checkpoint(session, "use_timestamp=false"), session);
- LOGV2_FOR_RECOVERY(5576602,
- 2,
- "Completed unstable checkpoint.",
- "initialDataTimestamp"_attr = initialDataTimestamp.toString());
- clearIndividuallyCheckpointedIndexes();
- } else if (stableTimestamp < initialDataTimestamp) {
- LOGV2_FOR_RECOVERY(
- 23985,
- 2,
- "Stable timestamp is behind the initial data timestamp, skipping a checkpoint.",
- "stableTimestamp"_attr = stableTimestamp.toString(),
- "initialDataTimestamp"_attr = initialDataTimestamp.toString());
- } else {
- auto oplogNeededForRollback = getOplogNeededForRollback();
- LOGV2_FOR_RECOVERY(23986,
- 2,
- "Performing stable checkpoint.",
- "stableTimestamp"_attr = stableTimestamp,
- "oplogNeededForRollback"_attr = toString(oplogNeededForRollback));
+ // Three cases:
+ //
+ // First, initialDataTimestamp is Timestamp(0, 1) -> Take full checkpoint. This is when there is
+ // no consistent view of the data (e.g: during initial sync).
+ //
+ // Second, stableTimestamp < initialDataTimestamp: Skip checkpoints. The data on disk is prone
+ // to being rolled back. Hold off on checkpoints. Hope that the stable timestamp surpasses the
+ // data on disk, allowing storage to persist newer copies to disk.
+ //
+ // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady state case.
+ if (initialDataTimestamp.asULL() <= 1) {
+ invariantWTOK(session->checkpoint(session, "use_timestamp=false"), session);
+ LOGV2_FOR_RECOVERY(5576602,
+ 2,
+ "Completed unstable checkpoint.",
+ "initialDataTimestamp"_attr = initialDataTimestamp.toString());
+ clearIndividuallyCheckpointedIndexes();
+ } else if (stableTimestamp < initialDataTimestamp) {
+ LOGV2_FOR_RECOVERY(
+ 23985,
+ 2,
+ "Stable timestamp is behind the initial data timestamp, skipping a checkpoint.",
+ "stableTimestamp"_attr = stableTimestamp.toString(),
+ "initialDataTimestamp"_attr = initialDataTimestamp.toString());
+ } else {
+ auto oplogNeededForRollback = getOplogNeededForRollback();
- {
- Lock::ResourceLock checkpointLock{
- opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X};
- invariantWTOK(session->checkpoint(session, "use_timestamp=true"), session);
- clearIndividuallyCheckpointedIndexes();
- }
+ LOGV2_FOR_RECOVERY(23986,
+ 2,
+ "Performing stable checkpoint.",
+ "stableTimestamp"_attr = stableTimestamp,
+ "oplogNeededForRollback"_attr = toString(oplogNeededForRollback));
- if (oplogNeededForRollback.isOK()) {
- // Now that the checkpoint is durable, publish the oplog needed to recover from it.
- _oplogNeededForCrashRecovery.store(oplogNeededForRollback.getValue().asULL());
- }
+ {
+ invariantWTOK(session->checkpoint(session, "use_timestamp=true"), session);
+ clearIndividuallyCheckpointedIndexes();
+ }
+
+ if (oplogNeededForRollback.isOK()) {
+ // Now that the checkpoint is durable, publish the oplog needed to recover from it.
+ _oplogNeededForCrashRecovery.store(oplogNeededForRollback.getValue().asULL());
}
- } catch (const WriteConflictException&) {
- LOGV2_WARNING(22346, "Checkpoint encountered a write conflict exception.");
- } catch (const AssertionException& exc) {
- invariant(exc.code() == ErrorCodes::InterruptedAtShutdown ||
- exc.code() == ErrorCodes::Interrupted,
- exc.toString());
- LOGV2(7021300, "Skipping checkpoint due to exception", "exception"_attr = exc.toStatus());
}
+} catch (const WriteConflictException&) {
+ LOGV2_WARNING(22346, "Checkpoint encountered a write conflict exception.");
+} catch (const AssertionException& exc) {
+ invariant(exc.code() == ErrorCodes::InterruptedAtShutdown ||
+ exc.code() == ErrorCodes::Interrupted,
+ exc.toString());
+ LOGV2(7021300, "Skipping checkpoint due to exception", "exception"_attr = exc.toStatus());
}
void WiredTigerKVEngine::checkpoint(OperationContext* opCtx) {
@@ -1985,6 +2001,11 @@ void WiredTigerKVEngine::checkpoint(OperationContext* opCtx) {
return _checkpoint(opCtx, s);
}
+std::unique_ptr<StorageEngine::CheckpointLock> WiredTigerKVEngine::getCheckpointLock(
+ OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) {
+ return std::make_unique<WiredTigerCheckpointLock>(opCtx, mode);
+}
+
bool WiredTigerKVEngine::hasIdent(OperationContext* opCtx, StringData ident) const {
return _hasUri(WiredTigerRecoveryUnit::get(opCtx)->getSession()->getSession(), _uri(ident));
}
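
Engines that do not override the new KVEngine::getCheckpointLock() virtual get the
default shown earlier, which uasserts with CommandNotSupported instead of returning a
lock. A hedged sketch of how a caller that cannot assume a checkpoint-capable engine
might cope (the error handling here is illustrative, not from the patch; `engine` is a
placeholder name):

    try {
        auto checkpointLock =
            engine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);
        // ... work that must not race with a checkpoint ...
    } catch (const DBException& ex) {
        invariant(ex.code() == ErrorCodes::CommandNotSupported);
        // No checkpoints to coordinate with on this engine; proceed without the lock.
    }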
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
index 705f0dc04cb..c1128d6292b 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
@@ -128,6 +128,9 @@ public:
void checkpoint(OperationContext* opCtx) override;
+ std::unique_ptr<StorageEngine::CheckpointLock> getCheckpointLock(
+ OperationContext* opCtx, StorageEngine::CheckpointLock::Mode mode) override;
+
bool isEphemeral() const override {
return _ephemeral;
}
@@ -553,10 +556,5 @@ private:
// Pins the oplog so that OplogStones will not truncate oplog history equal or newer to this
// timestamp.
AtomicWord<std::uint64_t> _pinnedOplogTimestamp;
-
- // Limits the actions of concurrent checkpoint callers as we update some internal data during a
- // checkpoint. WT has a mutex of its own to only have one checkpoint active at all times so this
- // is only to protect our internal updates.
- Mutex _checkpointMutex = MONGO_MAKE_LATCH("WiredTigerKVEngine::_checkpointMutex");
};
} // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 8bb44e80162..d3f8d26b3f5 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -1738,7 +1738,8 @@ Status WiredTigerRecordStore::doCompact(OperationContext* opCtx) {
opCtx->recoveryUnit()->abandonSnapshot();
// WT compact prompts WT to take checkpoints, so we need to take the checkpoint lock around
// WT compact calls.
- Lock::ResourceLock checkpointLock{opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X};
+ auto checkpointLock =
+ _kvEngine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);
int ret = s->compact(s, getURI().c_str(), "timeout=0");
if (MONGO_unlikely(WTCompactRecordStoreEBUSY.shouldFail())) {
ret = EBUSY;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp
index 5e8337030d1..5fbfe2b8a7c 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp
@@ -296,8 +296,8 @@ void WiredTigerSessionCache::waitUntilDurable(OperationContext* opCtx,
auto config = syncType == Fsync::kCheckpointStableTimestamp ? "use_timestamp=true"
: "use_timestamp=false";
{
- Lock::ResourceLock checkpointLock{
- opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X};
+ auto checkpointLock = _engine->getCheckpointLock(
+ opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);
invariantWTOK(s->checkpoint(s, config), s);
_engine->clearIndividuallyCheckpointedIndexes();
}
@@ -352,7 +352,8 @@ void WiredTigerSessionCache::waitUntilDurable(OperationContext* opCtx,
_waitUntilDurableSession);
LOGV2_DEBUG(22419, 4, "flushed journal");
} else {
- Lock::ResourceLock checkpointLock{opCtx, ResourceId(RESOURCE_MUTEX, "checkpoint"), MODE_X};
+ auto checkpointLock =
+ _engine->getCheckpointLock(opCtx, StorageEngine::CheckpointLock::Mode::kExclusive);
invariantWTOK(_waitUntilDurableSession->checkpoint(_waitUntilDurableSession, nullptr),
_waitUntilDurableSession);
LOGV2_DEBUG(22420, 4, "created checkpoint");