summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSuganthi Mani <suganthi.mani@mongodb.com>2020-03-06 05:51:17 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-03-09 15:03:43 +0000
commit997841bdace7ff9ed5fd5bd0f952ec20880b9d92 (patch)
tree2af832d3b50ce9135c33f48767d8307dab697a5f
parent86d40fda4d156c0a439f1c5edb2b7890a085d337 (diff)
downloadmongo-997841bdace7ff9ed5fd5bd0f952ec20880b9d92.tar.gz
SERVER-46664 runCmdOnPrimaryAndAwaitResponse() should not run DBDirect client command with the rstl lock held.
-rw-r--r--src/mongo/db/index_builds_coordinator_mongod.cpp17
-rw-r--r--src/mongo/db/index_builds_coordinator_mongod.h2
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp43
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.h1
4 files changed, 33 insertions, 30 deletions
diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp
index f6f64573a37..a136f74d685 100644
--- a/src/mongo/db/index_builds_coordinator_mongod.cpp
+++ b/src/mongo/db/index_builds_coordinator_mongod.cpp
@@ -369,7 +369,7 @@ void IndexBuildsCoordinatorMongod::_signalIfCommitQuorumIsSatisfied(
if (onDiskcommitQuorum == replState->commitQuorum.get()) {
if (voteReceived >= requiredQuorumCount) {
LOGV2(3856201,
- "Index build Commit Quorum Satisfied: {indexBuildEntry}",
+ "Index build commit quorum satisfied:",
"indexBuildEntry"_attr = indexBuildEntry);
_sendCommitQuorumSatisfiedSignal(lk, opCtx, replState);
}
@@ -414,15 +414,17 @@ bool IndexBuildsCoordinatorMongod::_signalIfCommitQuorumNotEnabled(
return false;
}
-bool IndexBuildsCoordinatorMongod::_checkVoteCommitIndexCmdSucceeded(const BSONObj& response) {
+bool IndexBuildsCoordinatorMongod::_checkVoteCommitIndexCmdSucceeded(const BSONObj& response,
+ const UUID& indexBuildUUID) {
auto commandStatus = getStatusFromCommandResult(response);
auto wcStatus = getWriteConcernStatusFromCommandResult(response);
if (commandStatus.isOK() && wcStatus.isOK()) {
return true;
}
LOGV2(3856202,
- "'voteCommitIndexBuild' command failed with response : {response}",
- "response"_attr = response);
+ "'voteCommitIndexBuild' command failed.",
+ "indexBuildUUID"_attr = indexBuildUUID,
+ "responseStatus"_attr = response);
return false;
}
@@ -527,11 +529,16 @@ void IndexBuildsCoordinatorMongod::_signalPrimaryForCommitReadiness(
uassertStatusOK(convertToNonFatalStatus(ex.toStatus()));
}
// All other error including network errors should be retried.
+ LOGV2_DEBUG(4666400,
+ 1,
+ "Failed to run 'voteCommitIndexBuild' command.",
+ "indexBuildUUID"_attr = replState->buildUUID,
+ "errorMsg"_attr = ex);
continue;
}
// Command error and write concern error have to be retried.
- if (_checkVoteCommitIndexCmdSucceeded(voteCmdResponse)) {
+ if (_checkVoteCommitIndexCmdSucceeded(voteCmdResponse, replState->buildUUID)) {
break;
}
}
diff --git a/src/mongo/db/index_builds_coordinator_mongod.h b/src/mongo/db/index_builds_coordinator_mongod.h
index e9c2717304f..b7cc0d9458d 100644
--- a/src/mongo/db/index_builds_coordinator_mongod.h
+++ b/src/mongo/db/index_builds_coordinator_mongod.h
@@ -129,7 +129,7 @@ private:
/**
* Process voteCommitIndexBuild command's response.
*/
- bool _checkVoteCommitIndexCmdSucceeded(const BSONObj& response);
+ bool _checkVoteCommitIndexCmdSucceeded(const BSONObj& response, const UUID& indexBuildUUID);
/**
* Signals index builder to commit.
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 0d2a90b0f10..5270397657f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2180,31 +2180,31 @@ BSONObj ReplicationCoordinatorImpl::runCmdOnPrimaryAndAwaitResponse(
const BSONObj& cmdObj,
OnRemoteCmdScheduledFn onRemoteCmdScheduled,
OnRemoteCmdCompleteFn onRemoteCmdComplete) {
- // Sanity check
- invariant(!opCtx->lockState()->isRSTLLocked());
+ // About to make network and DBDirectClient (recursive) calls, so we should not hold any locks.
+ invariant(!opCtx->lockState()->isLocked());
- repl::ReplicationStateTransitionLockGuard rstl(opCtx, MODE_IX);
+ const auto myHostAndPort = getMyHostAndPort();
+ const auto primaryHostAndPort = getCurrentPrimaryHostAndPort();
- if (getMemberState().primary()) {
- if (canAcceptWritesForDatabase(opCtx, dbName)) {
- // Run command using DBDirectClient to avoid tcp connection.
- return _runCmdOnSelfOnAlternativeClient(opCtx, dbName, cmdObj);
- }
- // Node is primary but it's not in a state to accept non-local writes because it might be in
- // the catchup or draining phase. So, try releasing and reacquiring RSTL lock so that we
- // give chance for the node to finish executing signalDrainComplete() and become master.
- uassertStatusOK(
- Status{ErrorCodes::NotMaster, "Node is in primary state but can't accept writes."});
+ if (myHostAndPort.empty()) {
+ // Possibly because either rsconfig is uninitialized or the node got removed from config.
+ uassertStatusOK(Status{ErrorCodes::NodeNotFound, "Address unknown."});
}
- // Node is not primary, so we will run the remote command via AsyncDBClient. To use
- // AsyncDBClient, we will be using repl task executor.
- const auto primary = getCurrentPrimaryHostAndPort();
- if (primary.empty()) {
- uassertStatusOK(Status{ErrorCodes::CommandFailed, "Primary is unknown/down."});
+ if (primaryHostAndPort.empty()) {
+ uassertStatusOK(Status{ErrorCodes::NoConfigMaster, "Primary is unknown/down."});
}
- executor::RemoteCommandRequest request(primary, dbName, cmdObj, nullptr);
+ auto iAmPrimary = (myHostAndPort == primaryHostAndPort) ? true : false;
+
+ if (iAmPrimary) {
+ // Run command using DBDirectClient to avoid tcp connection.
+ return _runCmdOnSelfOnAlternativeClient(opCtx, dbName, cmdObj);
+ }
+
+ // Node is not primary, so we will run the remote command via AsyncDBClient. To use
+ // AsyncDBClient, we will be using repl task executor.
+ executor::RemoteCommandRequest request(primaryHostAndPort, dbName, cmdObj, nullptr);
executor::RemoteCommandResponse cbkResponse(
Status{ErrorCodes::InternalError, "Uninitialized value"});
@@ -2218,11 +2218,6 @@ BSONObj ReplicationCoordinatorImpl::runCmdOnPrimaryAndAwaitResponse(
CallbackHandle cbkHandle = scheduleResult.getValue();
onRemoteCmdScheduled(cbkHandle);
- // Before, we wait for the remote command response, it's important we release the rstl lock to
- // ensure that the state transition can happen during the wait period. Else, it can lead to
- // deadlock. Consider a case, where the remote command waits for majority write concern. But, we
- // are not able to transition our state to secondary (steady state replication).
- rstl.release();
// Wait for the response in an interruptible mode.
_replExecutor->wait(cbkHandle, opCtx);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index cd1c8f3a919..5128fc4025f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1396,6 +1396,7 @@ private:
/**
* Runs the command using DBDirectClient and returns the response received for that command.
+ * Callers of this function should not hold any locks.
*/
BSONObj _runCmdOnSelfOnAlternativeClient(OperationContext* opCtx,
const std::string& dbName,