summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWenbin Zhu <wenbin.zhu@mongodb.com>2021-09-03 23:06:13 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-11-01 20:27:30 +0000
commitd1a9e68d581f8e3162bc790bdb30986b8d030d86 (patch)
tree51e20934ba9c53716fd7daf4ac9926c8dd31d67e
parentaa68c654c00bf87283f34607405d7b72823a92ee (diff)
downloadmongo-d1a9e68d581f8e3162bc790bdb30986b8d030d86.tar.gz
SERVER-59226 Fix deadlock between uninterruptible profiling operation and stepdown thread.
(cherry picked from commit 43479818bd01f27ee25b6e992045529d2ac0185a)
-rw-r--r--src/mongo/db/introspect.cpp28
-rw-r--r--src/mongo/db/operation_context.cpp17
-rw-r--r--src/mongo/db/operation_context.h11
3 files changed, 47 insertions, 9 deletions
diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
index 8bda2543426..78ae238a239 100644
--- a/src/mongo/db/introspect.cpp
+++ b/src/mongo/db/introspect.cpp
@@ -50,9 +50,9 @@
namespace mongo {
-using std::unique_ptr;
using std::endl;
using std::string;
+using std::unique_ptr;
namespace {
@@ -117,20 +117,32 @@ void profile(OperationContext* opCtx, NetworkOp op) {
const string dbName(nsToDatabase(CurOp::get(opCtx)->getNS()));
+ // True if we need to acquire an X lock on the database in order to create the system.profile
+ // collection.
+ bool acquireDbXLock = false;
+
try {
- // Even if the operation we are profiling was interrupted, we still want to output the
- // profiler entry. This lock guard will prevent lock acquisitions from throwing exceptions
- // before we finish writing the entry. However, our maximum lock timeout overrides
- // uninterruptibility.
- boost::optional<UninterruptibleLockGuard> noInterrupt;
+ // Set the opCtx to be only interruptible for replication state changes. This is needed
+ // because for some operations that are already marked as killed due to errors such as
+ // operation time exceeding maxTimeMS, we still want to output the profiler entry. Thus
+ // in these cases we do not interrupt lock acquisition even though the opCtx has already
+ // been killed. In the meantime we need to make sure replication state changes can still
+ // interrupt lock acquisition, otherwise there could be deadlocks when the state change
+ // thread is waiting for the session checked out by this opCtx while holding RSTL lock.
+ // However when maxLockTimeout is set, we want it to be always interruptible.
if (!opCtx->lockState()->hasMaxLockTimeout()) {
- noInterrupt.emplace(opCtx->lockState());
+ opCtx->setIgnoreInterruptsExceptForReplStateChange(true);
}
- bool acquireDbXLock = false;
+ // IX lock acquisitions beyond this block will not be related to writes to system.profile.
+ ON_BLOCK_EXIT([opCtx] { opCtx->setIgnoreInterruptsExceptForReplStateChange(false); });
+
while (true) {
std::unique_ptr<AutoGetDb> autoGetDb;
if (acquireDbXLock) {
+ // We should not attempt to acquire an X lock while opCtx ignores interrupts.
+ opCtx->setIgnoreInterruptsExceptForReplStateChange(false);
+
autoGetDb.reset(new AutoGetDb(opCtx, dbName, MODE_X));
if (autoGetDb->getDb()) {
createProfileCollection(opCtx, autoGetDb->getDb()).transitional_ignore();
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 02a3b1226d5..3775b73a82f 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -196,6 +196,14 @@ bool opShouldFail(Client* client, const BSONObj& failPointInfo) {
} // namespace
Status OperationContext::checkForInterruptNoAssert() {
+ const auto killStatus = getKillStatus();
+
+ if (_ignoreInterruptsExceptForReplStateChange &&
+ killStatus != ErrorCodes::InterruptedDueToReplStateChange &&
+ !_killRequestedForReplStateChange.loadRelaxed()) {
+ return Status::OK();
+ }
+
// TODO: Remove the MONGO_likely(getClient()) once all operation contexts are constructed with
// clients.
if (MONGO_likely(getClient() && getServiceContext()) &&
@@ -215,7 +223,6 @@ Status OperationContext::checkForInterruptNoAssert() {
}
}
- const auto killStatus = getKillStatus();
if (killStatus != ErrorCodes::OK) {
return Status(killStatus, "operation was interrupted");
}
@@ -361,6 +368,14 @@ void OperationContext::markKilled(ErrorCodes::Error killCode) {
});
lkWaitMutex = stdx::unique_lock<stdx::mutex>{*_waitMutex};
}
+
+ // Record that a kill was requested on this operationContext due to replication state change
+ // since it is possible to call markKilled() multiple times but only the first killCode will
+ // be preserved.
+ if (killCode == ErrorCodes::InterruptedDueToReplStateChange) {
+ _killRequestedForReplStateChange.store(true);
+ }
+
_killCode.compareAndSwap(ErrorCodes::OK, killCode);
if (lkWaitMutex && _numKillers == 0) {
invariant(_waitCV);
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 4a9bc116da4..a2eacc1e700 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -432,6 +432,14 @@ public:
*/
Microseconds getRemainingMaxTimeMicros() const;
+ /**
+ * Sets that this operation should ignore interruption except for replication state change. Can
+ * only be called by the thread executing this on behalf of this OperationContext.
+ */
+ void setIgnoreInterruptsExceptForReplStateChange(bool target) {
+ _ignoreInterruptsExceptForReplStateChange = target;
+ }
+
private:
/**
* Returns true if this operation has a deadline and it has passed according to the fast clock
@@ -521,6 +529,9 @@ private:
Timer _elapsedTime;
bool _writesAreReplicated = true;
+
+ bool _ignoreInterruptsExceptForReplStateChange = false;
+ AtomicWord<bool> _killRequestedForReplStateChange{false};
};
namespace repl {