diff options
-rw-r--r-- | jstests/core/currentop_waiting_for_latch.js | 90 | ||||
-rw-r--r-- | jstests/libs/parallelTester.js | 3 | ||||
-rw-r--r-- | src/mongo/db/curop.cpp | 21 | ||||
-rw-r--r-- | src/mongo/db/pipeline/mongo_process_common.cpp | 8 | ||||
-rw-r--r-- | src/mongo/platform/mutex.cpp | 10 | ||||
-rw-r--r-- | src/mongo/platform/mutex.h | 8 | ||||
-rw-r--r-- | src/mongo/util/diagnostic_info.cpp | 104 | ||||
-rw-r--r-- | src/mongo/util/diagnostic_info.h | 23 |
8 files changed, 191 insertions, 76 deletions
diff --git a/jstests/core/currentop_waiting_for_latch.js b/jstests/core/currentop_waiting_for_latch.js index 5f23b23c8e2..90b7e0fd478 100644 --- a/jstests/core/currentop_waiting_for_latch.js +++ b/jstests/core/currentop_waiting_for_latch.js @@ -10,35 +10,69 @@ const adminDB = db.getSiblingDB("admin"); const getCurrentOp = function() { + jsTestLog("Getting $currentOp"); + let result = + adminDB + .aggregate( + [ + { + $currentOp: + {allUsers: true, idleConnections: true, localOps: true, backtrace: true} + }, + ], + {readConcern: {level: "local"}}) + .toArray(); + assert(result); + return result; +}; + +const blockedOpClientName = "DiagnosticCaptureTest"; + +const getClientName = function() { let myUri = adminDB.runCommand({whatsmyuri: 1}).you; - return adminDB - .aggregate( - [ - {$currentOp: {localOps: true, allUsers: false, backtrace: true}}, - {$match: {client: myUri}} - ], - {readConcern: {level: "local"}}) - .toArray()[0]; + return adminDB.aggregate([{$currentOp: {localOps: true}}, {$match: {client: myUri}}]) + .toArray()[0] + .desc; }; -assert.commandWorked(db.adminCommand( - {"configureFailPoint": 'keepDiagnosticCaptureOnFailedLock', "mode": 'alwaysOn'})); -var result = getCurrentOp(); - -assert(result.hasOwnProperty("waitingForLatch")); -assert(result["waitingForLatch"].hasOwnProperty("timestamp")); -assert(result["waitingForLatch"].hasOwnProperty("captureName")); -assert(result["waitingForLatch"].hasOwnProperty("backtrace")); -result["waitingForLatch"]["backtrace"].forEach(function(obj) { - assert(obj.hasOwnProperty("addr")); - assert(typeof obj["addr"] === "string"); - assert(obj.hasOwnProperty("path")); - assert(typeof obj["path"] === "string"); -}); - -assert.commandWorked( - db.adminCommand({"configureFailPoint": 'keepDiagnosticCaptureOnFailedLock', "mode": 'off'})); -result = getCurrentOp(); - -assert(!result.hasOwnProperty("waitingForLatch")); +let clientName = getClientName(); + +try { + assert.commandWorked(db.adminCommand({ + "configureFailPoint": 'currentOpSpawnsThreadWaitingForLatch', + "mode": 'alwaysOn', + "data": { + 'clientName': clientName, + }, + })); + + let result = null; + getCurrentOp().forEach(function(op) { + jsTestLog(tojson(op)); + if (op["desc"] == blockedOpClientName) { + result = op; + } + }); + assert(result); + assert(result.hasOwnProperty("waitingForLatch")); + assert(result["waitingForLatch"].hasOwnProperty("timestamp")); + assert(result["waitingForLatch"].hasOwnProperty("captureName")); + assert(result["waitingForLatch"].hasOwnProperty("backtrace")); + result["waitingForLatch"]["backtrace"].forEach(function(frame) { + assert(frame.hasOwnProperty("addr")); + assert(typeof frame["addr"] === "string"); + assert(frame.hasOwnProperty("path")); + assert(typeof frame["path"] === "string"); + }); +} finally { + assert.commandWorked(db.adminCommand( + {"configureFailPoint": 'currentOpSpawnsThreadWaitingForLatch', "mode": 'off'})); + + getCurrentOp().forEach(function(op) { + jsTestLog(tojson(op)); + if (op["desc"] == blockedOpClientName) { + assert(!op.hasOwnProperty("waitingForLatch")); + } + }); +} })(); diff --git a/jstests/libs/parallelTester.js b/jstests/libs/parallelTester.js index 41ea256713e..489f8846637 100644 --- a/jstests/libs/parallelTester.js +++ b/jstests/libs/parallelTester.js @@ -204,9 +204,6 @@ if (typeof _threadInject != "undefined") { // Destroys and recreates the catalog, which will interfere with other tests. "restart_catalog.js", - // Concurrently using keepDiagnosticCaptureOnFailedLock fails, will be fixed soon. - "currentop_waiting_for_latch.js", - // This test works close to the BSON document limit for entries in the durable catalog, // so running it in parallel with other tests will cause failures. "long_collection_names.js", diff --git a/src/mongo/db/curop.cpp b/src/mongo/db/curop.cpp index 6fcb94b2938..cef1fcd7449 100644 --- a/src/mongo/db/curop.cpp +++ b/src/mongo/db/curop.cpp @@ -48,7 +48,6 @@ #include "mongo/db/prepare_conflict_tracker.h" #include "mongo/db/query/getmore_request.h" #include "mongo/db/query/plan_summary_stats.h" -#include "mongo/platform/mutex.h" #include "mongo/rpc/metadata/client_metadata.h" #include "mongo/rpc/metadata/client_metadata_ismaster.h" #include "mongo/rpc/metadata/impersonated_user_metadata.h" @@ -234,28 +233,12 @@ CurOp* CurOp::get(const OperationContext& opCtx) { return _curopStack(opCtx).top(); } -namespace { - -struct { - Mutex mutex = Mutex("TestMutex"_sd, Seconds(1)); - stdx::unique_lock<Mutex> lock = stdx::unique_lock<Mutex>(mutex, stdx::defer_lock); -} gHangLock; - -} // namespace void CurOp::reportCurrentOpForClient(OperationContext* opCtx, Client* client, bool truncateOps, bool backtraceMode, BSONObjBuilder* infoBuilder) { invariant(client); - if (MONGO_unlikely(keepDiagnosticCaptureOnFailedLock.shouldFail())) { - gHangLock.lock.lock(); - try { - stdx::lock_guard testLock(gHangLock.mutex); - } catch (const DBException& e) { - log() << "Successfully caught " << e; - } - } OperationContext* clientOpCtx = client->getOperationContext(); @@ -336,10 +319,6 @@ void CurOp::reportCurrentOpForClient(OperationContext* opCtx, } } } - - if (MONGO_unlikely(keepDiagnosticCaptureOnFailedLock.shouldFail())) { - gHangLock.lock.unlock(); - } } void CurOp::setGenericCursor_inlock(GenericCursor gc) { diff --git a/src/mongo/db/pipeline/mongo_process_common.cpp b/src/mongo/db/pipeline/mongo_process_common.cpp index 1c3a9166cfa..3d20d646258 100644 --- a/src/mongo/db/pipeline/mongo_process_common.cpp +++ b/src/mongo/db/pipeline/mongo_process_common.cpp @@ -27,6 +27,8 @@ * it in the license file. */ +#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault + #include "mongo/platform/basic.h" #include "mongo/db/pipeline/mongo_process_common.h" @@ -39,8 +41,12 @@ #include "mongo/db/operation_context.h" #include "mongo/db/pipeline/expression_context.h" #include "mongo/db/service_context.h" +#include "mongo/platform/atomic_word.h" +#include "mongo/platform/mutex.h" #include "mongo/s/catalog_cache.h" #include "mongo/s/grid.h" +#include "mongo/util/diagnostic_info.h" +#include "mongo/util/log.h" #include "mongo/util/net/socket_utils.h" namespace mongo { @@ -58,6 +64,8 @@ std::vector<BSONObj> MongoProcessCommon::getCurrentOps( std::vector<BSONObj> ops; + auto blockedOpGuard = DiagnosticInfo::maybeMakeBlockedOpForTest(opCtx->getClient()); + for (ServiceContext::LockedClientsCursor cursor(opCtx->getClient()->getServiceContext()); Client* client = cursor.next();) { invariant(client); diff --git a/src/mongo/platform/mutex.cpp b/src/mongo/platform/mutex.cpp index 47dd212c92c..49ececfd25e 100644 --- a/src/mongo/platform/mutex.cpp +++ b/src/mongo/platform/mutex.cpp @@ -43,17 +43,11 @@ void Mutex::lock() { if (gLockActions) { gLockActions->onContendedLock(_name); } - hasLock = _mutex.try_lock_for(_lockTimeout.toSystemDuration() - - kContendedLockTimeout.toSystemDuration()); - if (gLockActions && !hasLock) { - gLockActions->onFailedLock(); - } - uassert( - ErrorCodes::InternalError, "Unable to take latch, wait time exceeds set timeout", hasLock); + _mutex.lock(); } void Mutex::unlock() { if (gLockActions) { - gLockActions->onUnlock(); + gLockActions->onUnlock(_name); } _mutex.unlock(); } diff --git a/src/mongo/platform/mutex.h b/src/mongo/platform/mutex.h index 4ba2a7b7737..ded61a90783 100644 --- a/src/mongo/platform/mutex.h +++ b/src/mongo/platform/mutex.h @@ -42,20 +42,18 @@ class LockActions { public: virtual ~LockActions() = default; virtual void onContendedLock(const StringData& name) = 0; - virtual void onUnlock() = 0; - virtual void onFailedLock() = 0; + virtual void onUnlock(const StringData& name) = 0; }; class Mutex { public: static constexpr auto kAnonymousMutexStr = "AnonymousMutex"_sd; + static constexpr Milliseconds kContendedLockTimeout = Milliseconds(100); Mutex() : Mutex(kAnonymousMutexStr) {} // Note that StringData is a view type, thus the underlying string for _name must outlive any // given Mutex explicit Mutex(const StringData& name) : _name(name) {} - explicit Mutex(const StringData& name, Seconds lockTimeout) - : _name(name), _lockTimeout(lockTimeout) {} void lock(); void unlock(); @@ -68,8 +66,6 @@ public: private: const StringData _name; - const Seconds _lockTimeout = Seconds(60); - static constexpr Milliseconds kContendedLockTimeout = Milliseconds(100); stdx::timed_mutex _mutex; }; diff --git a/src/mongo/util/diagnostic_info.cpp b/src/mongo/util/diagnostic_info.cpp index a05dcb8a3f6..9681414e797 100644 --- a/src/mongo/util/diagnostic_info.cpp +++ b/src/mongo/util/diagnostic_info.cpp @@ -54,9 +54,79 @@ using namespace fmt::literals; namespace mongo { // Maximum number of stack frames to appear in a backtrace. const unsigned int kMaxBackTraceFrames = 100; -MONGO_FAIL_POINT_DEFINE(keepDiagnosticCaptureOnFailedLock); namespace { +MONGO_FAIL_POINT_DEFINE(currentOpSpawnsThreadWaitingForLatch); + +constexpr auto kBlockedOpMutexName = "BlockedOpForTest"_sd; + +class BlockedOp { +public: + void start(ServiceContext* serviceContext); + void join(); + void setIsContended(bool value); + +private: + Mutex _testMutex{kBlockedOpMutexName}; + + stdx::condition_variable _cv; + stdx::mutex _m; + + struct State { + bool isContended = false; + boost::optional<stdx::thread> thread{boost::none}; + }; + State _state; +} gBlockedOp; + +// This function causes us to make an additional thread with a self-contended lock so that +// $currentOp can observe its DiagnosticInfo. Note that we track each thread that called us so that +// we can join the thread when they are gone. +void BlockedOp::start(ServiceContext* serviceContext) { + stdx::unique_lock<stdx::mutex> lk(_m); + + invariant(!_state.thread); + + _testMutex.lock(); + _state.thread = stdx::thread([this, serviceContext]() mutable { + ThreadClient tc("DiagnosticCaptureTest", serviceContext); + + log() << "Entered currentOpSpawnsThreadWaitingForLatch thread"; + + stdx::lock_guard testLock(_testMutex); + + log() << "Joining currentOpSpawnsThreadWaitingForLatch thread"; + }); + + _cv.wait(lk, [this] { return _state.isContended; }); + log() << "Started thread for currentOpSpawnsThreadWaitingForLatch"; +} + +// This function unlocks testMutex and joins if there are no more callers of BlockedOp::start() +// remaining +void BlockedOp::join() { + auto thread = [&] { + stdx::lock_guard<stdx::mutex> lk(_m); + + invariant(_state.thread); + + _testMutex.unlock(); + + _state.isContended = false; + _cv.notify_one(); + + return std::exchange(_state.thread, boost::none); + }(); + thread->join(); +} + +void BlockedOp::setIsContended(bool value) { + log() << "Setting isContended to " << (value ? "true" : "false"); + stdx::lock_guard lk(_m); + _state.isContended = value; + _cv.notify_one(); +} + const auto gDiagnosticHandle = Client::declareDecoration<DiagnosticInfo::Diagnostic>(); MONGO_INITIALIZER(LockActions)(InitializerContext* context) { @@ -68,15 +138,15 @@ MONGO_INITIALIZER(LockActions)(InitializerContext* context) { Client::getCurrent(), std::make_shared<DiagnosticInfo>(takeDiagnosticInfo(name))); } + + if (currentOpSpawnsThreadWaitingForLatch.shouldFail() && + (name == kBlockedOpMutexName)) { + gBlockedOp.setIsContended(true); + } } - void onUnlock() override { + void onUnlock(const StringData&) override { DiagnosticInfo::Diagnostic::clearDiagnostic(); } - void onFailedLock() override { - if (!MONGO_unlikely(keepDiagnosticCaptureOnFailedLock.shouldFail())) { - DiagnosticInfo::Diagnostic::clearDiagnostic(); - } - } }; std::unique_ptr<LockActions> mutexPointer = std::make_unique<LockActionsSubclass>(); @@ -275,4 +345,24 @@ DiagnosticInfo takeDiagnosticInfo(const StringData& captureName) { captureName, getBacktraceAddresses()); } + +DiagnosticInfo::BlockedOpGuard::~BlockedOpGuard() { + gBlockedOp.join(); +} + +auto DiagnosticInfo::maybeMakeBlockedOpForTest(Client* client) -> std::unique_ptr<BlockedOpGuard> { + std::unique_ptr<BlockedOpGuard> guard; + currentOpSpawnsThreadWaitingForLatch.executeIf( + [&](const BSONObj&) { + gBlockedOp.start(client->getServiceContext()); + guard = std::make_unique<BlockedOpGuard>(); + }, + [&](const BSONObj& data) { + return data.hasField("clientName") && + (data.getStringField("clientName") == client->desc()); + }); + + return guard; +} + } // namespace mongo diff --git a/src/mongo/util/diagnostic_info.h b/src/mongo/util/diagnostic_info.h index 0a4958f35de..a2c3339d3a5 100644 --- a/src/mongo/util/diagnostic_info.h +++ b/src/mongo/util/diagnostic_info.h @@ -30,19 +30,29 @@ #pragma once #include "mongo/base/string_data.h" -#include "mongo/db/service_context.h" -#include "mongo/stdx/mutex.h" +#include "mongo/db/client.h" +#include "mongo/platform/condition_variable.h" +#include "mongo/platform/mutex.h" #include "mongo/util/fail_point_service.h" #include "mongo/util/time_support.h" namespace mongo { -extern FailPoint keepDiagnosticCaptureOnFailedLock; /** * DiagnosticInfo keeps track of diagnostic information such as a developer provided * name, the time when a lock was first acquired, and a partial caller call stack. */ class DiagnosticInfo { public: + /** + * A simple RAII guard to attempt to join a blocked op once it is no longer needed + * + * This type is used in tests in conjunction with maybeMakeBlockedOpForTest + */ + class BlockedOpGuard { + public: + ~BlockedOpGuard(); + }; + struct Diagnostic { static std::shared_ptr<DiagnosticInfo> get(Client*); static void set(Client*, std::shared_ptr<DiagnosticInfo>); @@ -92,6 +102,12 @@ public: std::string toString() const; friend DiagnosticInfo takeDiagnosticInfo(const StringData& captureName); + /** + * This function checks the FailPoint currentOpSpawnsThreadWaitingForLatch and potentially + * launches a blocked operation to populate waitingForLatch for $currentOp. + */ + static std::unique_ptr<BlockedOpGuard> maybeMakeBlockedOpForTest(Client* client); + private: friend bool operator==(const DiagnosticInfo& info1, const DiagnosticInfo& info2); friend bool operator!=(const DiagnosticInfo& info1, const DiagnosticInfo& info2) { @@ -124,4 +140,5 @@ inline std::ostream& operator<<(std::ostream& s, const DiagnosticInfo& info) { * Captures the diagnostic information based on the caller's context. */ DiagnosticInfo takeDiagnosticInfo(const StringData& captureName); + } // namespace mongo |