diff options
-rw-r--r-- | src/mongo/base/error_codes.err | 12 | ||||
-rw-r--r-- | src/mongo/db/dbdirectclient.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/lasterror.cpp | 11 | ||||
-rw-r--r-- | src/mongo/db/lasterror.h | 14 | ||||
-rw-r--r-- | src/mongo/db/service_entry_point_mongod.cpp | 16 | ||||
-rw-r--r-- | src/mongo/s/client/shard_remote.cpp | 4 | ||||
-rw-r--r-- | src/mongo/s/service_entry_point_mongos.cpp | 2 | ||||
-rw-r--r-- | src/mongo/util/net/op_msg_integration_test.cpp | 101 |
8 files changed, 153 insertions, 19 deletions
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err index 1f518b2b6df..dde84b170bc 100644 --- a/src/mongo/base/error_codes.err +++ b/src/mongo/base/error_codes.err @@ -250,7 +250,17 @@ error_class("Interruption", ["Interrupted", "InterruptedAtShutdown", "InterruptedDueToReplStateChange", "ExceededTimeLimit"]) -error_class("NotMasterError", ["NotMaster", "NotMasterNoSlaveOk"]) + +# isNotMasterError() includes all codes that indicate that the node that received the request was +# not master at some point during command processing, regardless of whether some write may have +# happened. If you care about whether a write could have happened, check for individual codes. +error_class("NotMasterError", [ + "NotMaster", + "NotMasterNoSlaveOk", + "NotMasterOrSecondary", + "InterruptedDueToReplStateChange", + "PrimarySteppedDown", + ]) error_class("StaleShardingError", ["RecvStaleConfig", "SendStaleConfig", "StaleShardVersion", "StaleEpoch"]) error_class("WriteConcernError", ["WriteConcernFailed", diff --git a/src/mongo/db/dbdirectclient.cpp b/src/mongo/db/dbdirectclient.cpp index d955777eb0a..b6df5ab4dd3 100644 --- a/src/mongo/db/dbdirectclient.cpp +++ b/src/mongo/db/dbdirectclient.cpp @@ -123,6 +123,10 @@ QueryOptions DBDirectClient::_lookupAvailableOptions() { namespace { DbResponse loopbackBuildResponse(OperationContext* const opCtx, Message& toSend) { + DirectClientScope directClientScope(opCtx); + LastError::get(opCtx->getClient()).startDirectClientRequest(); + CurOp curOp(opCtx); + toSend.header().setId(nextMessageId()); toSend.header().setResponseToMsgId(0); return opCtx->getServiceContext()->getServiceEntryPoint()->handleRequest(opCtx, toSend); @@ -130,10 +134,6 @@ DbResponse loopbackBuildResponse(OperationContext* const opCtx, Message& toSend) } // namespace bool DBDirectClient::call(Message& toSend, Message& response, bool assertOk, string* actualServer) { - DirectClientScope directClientScope(_opCtx); - LastError::get(_opCtx->getClient()).startRequest(); - - CurOp curOp(_opCtx); auto dbResponse = loopbackBuildResponse(_opCtx, toSend); invariant(!dbResponse.response.empty()); response = std::move(dbResponse.response); @@ -142,10 +142,6 @@ bool DBDirectClient::call(Message& toSend, Message& response, bool assertOk, str } void DBDirectClient::say(Message& toSend, bool isRetry, string* actualServer) { - DirectClientScope directClientScope(_opCtx); - LastError::get(_opCtx->getClient()).startRequest(); - - CurOp curOp(_opCtx); auto dbResponse = loopbackBuildResponse(_opCtx, toSend); invariant(dbResponse.response.empty()); } diff --git a/src/mongo/db/lasterror.cpp b/src/mongo/db/lasterror.cpp index 370c38f9f6f..2521fa99466 100644 --- a/src/mongo/db/lasterror.cpp +++ b/src/mongo/db/lasterror.cpp @@ -52,6 +52,9 @@ void LastError::setLastError(int code, std::string msg) { reset(true); _code = code; _msg = std::move(msg); + + if (ErrorCodes::isNotMasterError(ErrorCodes::fromInt(_code))) + _hadNotMasterError = true; } void LastError::recordUpdate(bool updateObjects, long long nObjects, BSONObj upsertedId) { @@ -107,7 +110,13 @@ void LastError::disable() { _nPrev--; // caller is a command that shouldn't count as an operation } -void LastError::startRequest() { +void LastError::startTopLevelRequest() { + _disabled = false; + ++_nPrev; + _hadNotMasterError = false; +} + +void LastError::startDirectClientRequest() { _disabled = false; ++_nPrev; } diff --git a/src/mongo/db/lasterror.h b/src/mongo/db/lasterror.h index 27ce978d2b6..ee450089595 100644 --- a/src/mongo/db/lasterror.h +++ b/src/mongo/db/lasterror.h @@ -50,9 +50,14 @@ public: void reset(bool valid = false); /** - * when db receives a message/request, call this + * when db receives a top level message/request, call this */ - void startRequest(); + void startTopLevelRequest(); + + /** + * when DBDirectClient receives a message/request, call this + */ + void startDirectClientRequest(); /** * Disables error recording for the current operation. @@ -86,6 +91,10 @@ public: return _nPrev; } + bool hadNotMasterError() const { + return _hadNotMasterError; + } + class Disabled { public: explicit Disabled(LastError* le) : _le(le), _prev(le->_disabled) { @@ -115,6 +124,7 @@ private: int _nPrev = 1; bool _valid = false; bool _disabled = false; + bool _hadNotMasterError = false; }; } // namespace mongo diff --git a/src/mongo/db/service_entry_point_mongod.cpp b/src/mongo/db/service_entry_point_mongod.cpp index b0a07648ff6..6c24eab6d3a 100644 --- a/src/mongo/db/service_entry_point_mongod.cpp +++ b/src/mongo/db/service_entry_point_mongod.cpp @@ -518,6 +518,10 @@ bool runCommandImpl(OperationContext* opCtx, return result; } +// When active, we won't check if we are master in command dispatch. Activate this if you want to +// test failing during command execution. +MONGO_FP_DECLARE(skipCheckingForNotMasterInCommandDispatch); + /** * Executes a command after stripping metadata, performing authorization checks, * handling audit impersonation, and (potentially) setting maintenance mode. This method @@ -595,7 +599,9 @@ void execCommandDatabase(OperationContext* opCtx, repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext()); const bool iAmPrimary = replCoord->canAcceptWritesForDatabase_UNSAFE(opCtx, dbname); - if (!opCtx->getClient()->isInDirectClient()) { + if (!opCtx->getClient()->isInDirectClient() && + !MONGO_FAIL_POINT(skipCheckingForNotMasterInCommandDispatch)) { + bool commandCanRunOnSecondary = command->slaveOk(); bool commandIsOverriddenToRunOnSecondary = @@ -816,7 +822,11 @@ DbResponse runCommands(OperationContext* opCtx, const Message& message) { }(); if (OpMsg::isFlagSet(message, OpMsg::kMoreToCome)) { - // TODO SERVER-28510 throw to close connection if we failed with a not master error. + // Close the connection to get client to go through server selection again. + uassert(ErrorCodes::NotMaster, + "Not-master error during fire-and-forget command processing", + !LastError::get(opCtx->getClient()).hadNotMasterError()); + return {}; // Don't reply. } @@ -1029,7 +1039,7 @@ DbResponse ServiceEntryPointMongod::handleRequest(OperationContext* opCtx, const if (c.isInDirectClient()) { invariant(!opCtx->lockState()->inAWriteUnitOfWork()); } else { - LastError::get(c).startRequest(); + LastError::get(c).startTopLevelRequest(); AuthorizationSession::get(c)->startRequest(opCtx); // We should not be holding any locks at this point diff --git a/src/mongo/s/client/shard_remote.cpp b/src/mongo/s/client/shard_remote.cpp index 4617ef23f50..1bc9b6e5c97 100644 --- a/src/mongo/s/client/shard_remote.cpp +++ b/src/mongo/s/client/shard_remote.cpp @@ -120,9 +120,7 @@ void ShardRemote::updateReplSetMonitor(const HostAndPort& remoteHost, if (remoteCommandStatus.isOK()) return; - if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) || - (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange) || - (remoteCommandStatus == ErrorCodes::PrimarySteppedDown)) { + if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) { _targeter->markHostNotMaster(remoteHost, remoteCommandStatus); } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) { _targeter->markHostUnreachable(remoteHost, remoteCommandStatus); diff --git a/src/mongo/s/service_entry_point_mongos.cpp b/src/mongo/s/service_entry_point_mongos.cpp index dc1f8312b97..709485758b0 100644 --- a/src/mongo/s/service_entry_point_mongos.cpp +++ b/src/mongo/s/service_entry_point_mongos.cpp @@ -81,7 +81,7 @@ DbResponse ServiceEntryPointMongos::handleRequest(OperationContext* opCtx, const ClusterLastErrorInfo::get(client) = std::make_shared<ClusterLastErrorInfo>(); } ClusterLastErrorInfo::get(client)->newRequest(); - LastError::get(client).startRequest(); + LastError::get(client).startTopLevelRequest(); AuthorizationSession::get(opCtx->getClient())->startRequest(opCtx); DbMessage dbm(message); diff --git a/src/mongo/util/net/op_msg_integration_test.cpp b/src/mongo/util/net/op_msg_integration_test.cpp index 3d83243c1a4..3942c5dcf76 100644 --- a/src/mongo/util/net/op_msg_integration_test.cpp +++ b/src/mongo/util/net/op_msg_integration_test.cpp @@ -33,6 +33,8 @@ #include "mongo/unittest/integration_test.h" #include "mongo/unittest/unittest.h" #include "mongo/util/net/op_msg.h" +#include "mongo/util/scopeguard.h" + namespace mongo { @@ -64,4 +66,103 @@ TEST(OpMsg, UnknownOptionalFlagIsIgnored) { conn->parseCommandReplyMessage(conn->getServerAddress(), reply)->getCommandReply())); } +TEST(OpMsg, FireAndForgetInsertWorks) { + std::string errMsg; + auto conn = std::unique_ptr<DBClientBase>( + unittest::getFixtureConnectionString().connect("integration_test", errMsg)); + uassert(ErrorCodes::SocketException, errMsg, conn); + + conn->dropCollection("test.collection"); + + conn->runFireAndForgetCommand(OpMsgRequest::fromDBAndBody("test", fromjson(R"({ + insert: "collection", + writeConcern: {w: 0}, + documents: [ + {a: 1} + ] + })"))); + + ASSERT_EQ(conn->count("test.collection"), 1u); +} + +TEST(OpMsg, CloseConnectionOnFireAndForgetNotMasterError) { + const auto connStr = unittest::getFixtureConnectionString(); + + // This test only works against a replica set. + if (connStr.type() != ConnectionString::SET) { + return; + } + + bool foundSecondary = false; + for (auto host : connStr.getServers()) { + DBClientConnection conn; + uassertStatusOK(conn.connect(host, "integration_test")); + bool isMaster; + ASSERT(conn.isMaster(isMaster)); + if (isMaster) + continue; + foundSecondary = true; + + auto request = OpMsgRequest::fromDBAndBody("test", fromjson(R"({ + insert: "collection", + writeConcern: {w: 0}, + documents: [ + {a: 1} + ] + })")).serialize(); + + // Round-trip command fails with NotMaster error. Note that this failure is in command + // dispatch which ignores w:0. + Message reply; + ASSERT(conn.call(request, reply, /*assertOK*/ true, nullptr)); + ASSERT_EQ( + getStatusFromCommandResult( + conn.parseCommandReplyMessage(conn.getServerAddress(), reply)->getCommandReply()), + ErrorCodes::NotMaster); + + // Fire-and-forget closes connection when it sees that error. Note that this is using call() + // rather than say() so that we get an error back when the connection is closed. Normally + // using call() if kMoreToCome set results in blocking forever. + OpMsg::setFlag(&request, OpMsg::kMoreToCome); + ASSERT(!conn.call(request, reply, /*assertOK*/ false, nullptr)); + + uassertStatusOK(conn.connect(host, "integration_test")); // Reconnect. + + // Disable eager checking of master to simulate a stepdown occurring after the check. This + // should respect w:0. + BSONObj output; + ASSERT(conn.runCommand("admin", + fromjson(R"({ + configureFailPoint: 'skipCheckingForNotMasterInCommandDispatch', + mode: 'alwaysOn' + })"), + output)) + << output; + ON_BLOCK_EXIT([&] { + ASSERT(conn.runCommand("admin", + fromjson(R"({ + configureFailPoint: + 'skipCheckingForNotMasterInCommandDispatch', + mode: 'off' + })"), + output)) + << output; + }); + + + // Round-trip command claims to succeed due to w:0. + OpMsg::replaceFlags(&request, 0); + ASSERT(conn.call(request, reply, /*assertOK*/ true, nullptr)); + ASSERT_OK(getStatusFromCommandResult( + conn.parseCommandReplyMessage(conn.getServerAddress(), reply)->getCommandReply())); + + // Fire-and-forget should still close connection. + OpMsg::setFlag(&request, OpMsg::kMoreToCome); + ASSERT(!conn.call(request, reply, /*assertOK*/ false, nullptr)); + + break; + } + ASSERT(foundSecondary); +} + } // namespace mongo |