diff options
-rw-r--r-- | src/mongo/db/index_builds_coordinator_mongod.cpp | 17 | ||||
-rw-r--r-- | src/mongo/db/index_builds_coordinator_mongod.h | 2 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 43 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 1 |
4 files changed, 33 insertions, 30 deletions
diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp index f6f64573a37..a136f74d685 100644 --- a/src/mongo/db/index_builds_coordinator_mongod.cpp +++ b/src/mongo/db/index_builds_coordinator_mongod.cpp @@ -369,7 +369,7 @@ void IndexBuildsCoordinatorMongod::_signalIfCommitQuorumIsSatisfied( if (onDiskcommitQuorum == replState->commitQuorum.get()) { if (voteReceived >= requiredQuorumCount) { LOGV2(3856201, - "Index build Commit Quorum Satisfied: {indexBuildEntry}", + "Index build commit quorum satisfied:", "indexBuildEntry"_attr = indexBuildEntry); _sendCommitQuorumSatisfiedSignal(lk, opCtx, replState); } @@ -414,15 +414,17 @@ bool IndexBuildsCoordinatorMongod::_signalIfCommitQuorumNotEnabled( return false; } -bool IndexBuildsCoordinatorMongod::_checkVoteCommitIndexCmdSucceeded(const BSONObj& response) { +bool IndexBuildsCoordinatorMongod::_checkVoteCommitIndexCmdSucceeded(const BSONObj& response, + const UUID& indexBuildUUID) { auto commandStatus = getStatusFromCommandResult(response); auto wcStatus = getWriteConcernStatusFromCommandResult(response); if (commandStatus.isOK() && wcStatus.isOK()) { return true; } LOGV2(3856202, - "'voteCommitIndexBuild' command failed with response : {response}", - "response"_attr = response); + "'voteCommitIndexBuild' command failed.", + "indexBuildUUID"_attr = indexBuildUUID, + "responseStatus"_attr = response); return false; } @@ -527,11 +529,16 @@ void IndexBuildsCoordinatorMongod::_signalPrimaryForCommitReadiness( uassertStatusOK(convertToNonFatalStatus(ex.toStatus())); } // All other error including network errors should be retried. + LOGV2_DEBUG(4666400, + 1, + "Failed to run 'voteCommitIndexBuild' command.", + "indexBuildUUID"_attr = replState->buildUUID, + "errorMsg"_attr = ex); continue; } // Command error and write concern error have to be retried. - if (_checkVoteCommitIndexCmdSucceeded(voteCmdResponse)) { + if (_checkVoteCommitIndexCmdSucceeded(voteCmdResponse, replState->buildUUID)) { break; } } diff --git a/src/mongo/db/index_builds_coordinator_mongod.h b/src/mongo/db/index_builds_coordinator_mongod.h index e9c2717304f..b7cc0d9458d 100644 --- a/src/mongo/db/index_builds_coordinator_mongod.h +++ b/src/mongo/db/index_builds_coordinator_mongod.h @@ -129,7 +129,7 @@ private: /** * Process voteCommitIndexBuild command's response. */ - bool _checkVoteCommitIndexCmdSucceeded(const BSONObj& response); + bool _checkVoteCommitIndexCmdSucceeded(const BSONObj& response, const UUID& indexBuildUUID); /** * Signals index builder to commit. diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 0d2a90b0f10..5270397657f 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -2180,31 +2180,31 @@ BSONObj ReplicationCoordinatorImpl::runCmdOnPrimaryAndAwaitResponse( const BSONObj& cmdObj, OnRemoteCmdScheduledFn onRemoteCmdScheduled, OnRemoteCmdCompleteFn onRemoteCmdComplete) { - // Sanity check - invariant(!opCtx->lockState()->isRSTLLocked()); + // About to make network and DBDirectClient (recursive) calls, so we should not hold any locks. + invariant(!opCtx->lockState()->isLocked()); - repl::ReplicationStateTransitionLockGuard rstl(opCtx, MODE_IX); + const auto myHostAndPort = getMyHostAndPort(); + const auto primaryHostAndPort = getCurrentPrimaryHostAndPort(); - if (getMemberState().primary()) { - if (canAcceptWritesForDatabase(opCtx, dbName)) { - // Run command using DBDirectClient to avoid tcp connection. - return _runCmdOnSelfOnAlternativeClient(opCtx, dbName, cmdObj); - } - // Node is primary but it's not in a state to accept non-local writes because it might be in - // the catchup or draining phase. So, try releasing and reacquiring RSTL lock so that we - // give chance for the node to finish executing signalDrainComplete() and become master. - uassertStatusOK( - Status{ErrorCodes::NotMaster, "Node is in primary state but can't accept writes."}); + if (myHostAndPort.empty()) { + // Possibly because either rsconfig is uninitialized or the node got removed from config. + uassertStatusOK(Status{ErrorCodes::NodeNotFound, "Address unknown."}); } - // Node is not primary, so we will run the remote command via AsyncDBClient. To use - // AsyncDBClient, we will be using repl task executor. - const auto primary = getCurrentPrimaryHostAndPort(); - if (primary.empty()) { - uassertStatusOK(Status{ErrorCodes::CommandFailed, "Primary is unknown/down."}); + if (primaryHostAndPort.empty()) { + uassertStatusOK(Status{ErrorCodes::NoConfigMaster, "Primary is unknown/down."}); } - executor::RemoteCommandRequest request(primary, dbName, cmdObj, nullptr); + auto iAmPrimary = (myHostAndPort == primaryHostAndPort) ? true : false; + + if (iAmPrimary) { + // Run command using DBDirectClient to avoid tcp connection. + return _runCmdOnSelfOnAlternativeClient(opCtx, dbName, cmdObj); + } + + // Node is not primary, so we will run the remote command via AsyncDBClient. To use + // AsyncDBClient, we will be using repl task executor. + executor::RemoteCommandRequest request(primaryHostAndPort, dbName, cmdObj, nullptr); executor::RemoteCommandResponse cbkResponse( Status{ErrorCodes::InternalError, "Uninitialized value"}); @@ -2218,11 +2218,6 @@ BSONObj ReplicationCoordinatorImpl::runCmdOnPrimaryAndAwaitResponse( CallbackHandle cbkHandle = scheduleResult.getValue(); onRemoteCmdScheduled(cbkHandle); - // Before, we wait for the remote command response, it's important we release the rstl lock to - // ensure that the state transition can happen during the wait period. Else, it can lead to - // deadlock. Consider a case, where the remote command waits for majority write concern. But, we - // are not able to transition our state to secondary (steady state replication). - rstl.release(); // Wait for the response in an interruptible mode. _replExecutor->wait(cbkHandle, opCtx); diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index cd1c8f3a919..5128fc4025f 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1396,6 +1396,7 @@ private: /** * Runs the command using DBDirectClient and returns the response received for that command. + * Callers of this function should not hold any locks. */ BSONObj _runCmdOnSelfOnAlternativeClient(OperationContext* opCtx, const std::string& dbName, |