summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMathias Stearn <mathias@10gen.com>2017-07-24 15:59:24 -0400
committerMathias Stearn <mathias@10gen.com>2017-08-17 22:13:09 -0400
commit583127818f1ead21b67a57eb117b9678232e5472 (patch)
treed64e7927045a9b33d4e248acb48433903e3bb340
parent112982eeaddf92cbc14be655061200e23069250a (diff)
downloadmongo-583127818f1ead21b67a57eb117b9678232e5472.tar.gz
SERVER-28510 Close incoming mongod connection on NotMaster error when handling fire-and-forget command
-rw-r--r--src/mongo/base/error_codes.err12
-rw-r--r--src/mongo/db/dbdirectclient.cpp12
-rw-r--r--src/mongo/db/lasterror.cpp11
-rw-r--r--src/mongo/db/lasterror.h14
-rw-r--r--src/mongo/db/service_entry_point_mongod.cpp16
-rw-r--r--src/mongo/s/client/shard_remote.cpp4
-rw-r--r--src/mongo/s/service_entry_point_mongos.cpp2
-rw-r--r--src/mongo/util/net/op_msg_integration_test.cpp101
8 files changed, 153 insertions, 19 deletions
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 1f518b2b6df..dde84b170bc 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -250,7 +250,17 @@ error_class("Interruption", ["Interrupted",
"InterruptedAtShutdown",
"InterruptedDueToReplStateChange",
"ExceededTimeLimit"])
-error_class("NotMasterError", ["NotMaster", "NotMasterNoSlaveOk"])
+
+# isNotMasterError() includes all codes that indicate that the node that received the request was
+# not master at some point during command processing, regardless of whether some write may have
+# happened. If you care about whether a write could have happened, check for individual codes.
+error_class("NotMasterError", [
+ "NotMaster",
+ "NotMasterNoSlaveOk",
+ "NotMasterOrSecondary",
+ "InterruptedDueToReplStateChange",
+ "PrimarySteppedDown",
+ ])
error_class("StaleShardingError",
["RecvStaleConfig", "SendStaleConfig", "StaleShardVersion", "StaleEpoch"])
error_class("WriteConcernError", ["WriteConcernFailed",
diff --git a/src/mongo/db/dbdirectclient.cpp b/src/mongo/db/dbdirectclient.cpp
index d955777eb0a..b6df5ab4dd3 100644
--- a/src/mongo/db/dbdirectclient.cpp
+++ b/src/mongo/db/dbdirectclient.cpp
@@ -123,6 +123,10 @@ QueryOptions DBDirectClient::_lookupAvailableOptions() {
namespace {
DbResponse loopbackBuildResponse(OperationContext* const opCtx, Message& toSend) {
+ DirectClientScope directClientScope(opCtx);
+ LastError::get(opCtx->getClient()).startDirectClientRequest();
+ CurOp curOp(opCtx);
+
toSend.header().setId(nextMessageId());
toSend.header().setResponseToMsgId(0);
return opCtx->getServiceContext()->getServiceEntryPoint()->handleRequest(opCtx, toSend);
@@ -130,10 +134,6 @@ DbResponse loopbackBuildResponse(OperationContext* const opCtx, Message& toSend)
} // namespace
bool DBDirectClient::call(Message& toSend, Message& response, bool assertOk, string* actualServer) {
- DirectClientScope directClientScope(_opCtx);
- LastError::get(_opCtx->getClient()).startRequest();
-
- CurOp curOp(_opCtx);
auto dbResponse = loopbackBuildResponse(_opCtx, toSend);
invariant(!dbResponse.response.empty());
response = std::move(dbResponse.response);
@@ -142,10 +142,6 @@ bool DBDirectClient::call(Message& toSend, Message& response, bool assertOk, str
}
void DBDirectClient::say(Message& toSend, bool isRetry, string* actualServer) {
- DirectClientScope directClientScope(_opCtx);
- LastError::get(_opCtx->getClient()).startRequest();
-
- CurOp curOp(_opCtx);
auto dbResponse = loopbackBuildResponse(_opCtx, toSend);
invariant(dbResponse.response.empty());
}
diff --git a/src/mongo/db/lasterror.cpp b/src/mongo/db/lasterror.cpp
index 370c38f9f6f..2521fa99466 100644
--- a/src/mongo/db/lasterror.cpp
+++ b/src/mongo/db/lasterror.cpp
@@ -52,6 +52,9 @@ void LastError::setLastError(int code, std::string msg) {
reset(true);
_code = code;
_msg = std::move(msg);
+
+ if (ErrorCodes::isNotMasterError(ErrorCodes::fromInt(_code)))
+ _hadNotMasterError = true;
}
void LastError::recordUpdate(bool updateObjects, long long nObjects, BSONObj upsertedId) {
@@ -107,7 +110,13 @@ void LastError::disable() {
_nPrev--; // caller is a command that shouldn't count as an operation
}
-void LastError::startRequest() {
+void LastError::startTopLevelRequest() {
+ _disabled = false;
+ ++_nPrev;
+ _hadNotMasterError = false;
+}
+
+void LastError::startDirectClientRequest() {
_disabled = false;
++_nPrev;
}
diff --git a/src/mongo/db/lasterror.h b/src/mongo/db/lasterror.h
index 27ce978d2b6..ee450089595 100644
--- a/src/mongo/db/lasterror.h
+++ b/src/mongo/db/lasterror.h
@@ -50,9 +50,14 @@ public:
void reset(bool valid = false);
/**
- * when db receives a message/request, call this
+ * when db receives a top level message/request, call this
*/
- void startRequest();
+ void startTopLevelRequest();
+
+ /**
+ * when DBDirectClient receives a message/request, call this
+ */
+ void startDirectClientRequest();
/**
* Disables error recording for the current operation.
@@ -86,6 +91,10 @@ public:
return _nPrev;
}
+ bool hadNotMasterError() const {
+ return _hadNotMasterError;
+ }
+
class Disabled {
public:
explicit Disabled(LastError* le) : _le(le), _prev(le->_disabled) {
@@ -115,6 +124,7 @@ private:
int _nPrev = 1;
bool _valid = false;
bool _disabled = false;
+ bool _hadNotMasterError = false;
};
} // namespace mongo
diff --git a/src/mongo/db/service_entry_point_mongod.cpp b/src/mongo/db/service_entry_point_mongod.cpp
index b0a07648ff6..6c24eab6d3a 100644
--- a/src/mongo/db/service_entry_point_mongod.cpp
+++ b/src/mongo/db/service_entry_point_mongod.cpp
@@ -518,6 +518,10 @@ bool runCommandImpl(OperationContext* opCtx,
return result;
}
+// When active, we won't check if we are master in command dispatch. Activate this if you want to
+// test failing during command execution.
+MONGO_FP_DECLARE(skipCheckingForNotMasterInCommandDispatch);
+
/**
* Executes a command after stripping metadata, performing authorization checks,
* handling audit impersonation, and (potentially) setting maintenance mode. This method
@@ -595,7 +599,9 @@ void execCommandDatabase(OperationContext* opCtx,
repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
const bool iAmPrimary = replCoord->canAcceptWritesForDatabase_UNSAFE(opCtx, dbname);
- if (!opCtx->getClient()->isInDirectClient()) {
+ if (!opCtx->getClient()->isInDirectClient() &&
+ !MONGO_FAIL_POINT(skipCheckingForNotMasterInCommandDispatch)) {
+
bool commandCanRunOnSecondary = command->slaveOk();
bool commandIsOverriddenToRunOnSecondary =
@@ -816,7 +822,11 @@ DbResponse runCommands(OperationContext* opCtx, const Message& message) {
}();
if (OpMsg::isFlagSet(message, OpMsg::kMoreToCome)) {
- // TODO SERVER-28510 throw to close connection if we failed with a not master error.
+ // Close the connection to get client to go through server selection again.
+ uassert(ErrorCodes::NotMaster,
+ "Not-master error during fire-and-forget command processing",
+ !LastError::get(opCtx->getClient()).hadNotMasterError());
+
return {}; // Don't reply.
}
@@ -1029,7 +1039,7 @@ DbResponse ServiceEntryPointMongod::handleRequest(OperationContext* opCtx, const
if (c.isInDirectClient()) {
invariant(!opCtx->lockState()->inAWriteUnitOfWork());
} else {
- LastError::get(c).startRequest();
+ LastError::get(c).startTopLevelRequest();
AuthorizationSession::get(c)->startRequest(opCtx);
// We should not be holding any locks at this point
diff --git a/src/mongo/s/client/shard_remote.cpp b/src/mongo/s/client/shard_remote.cpp
index 4617ef23f50..1bc9b6e5c97 100644
--- a/src/mongo/s/client/shard_remote.cpp
+++ b/src/mongo/s/client/shard_remote.cpp
@@ -120,9 +120,7 @@ void ShardRemote::updateReplSetMonitor(const HostAndPort& remoteHost,
if (remoteCommandStatus.isOK())
return;
- if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
- (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange) ||
- (remoteCommandStatus == ErrorCodes::PrimarySteppedDown)) {
+ if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
_targeter->markHostNotMaster(remoteHost, remoteCommandStatus);
} else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
_targeter->markHostUnreachable(remoteHost, remoteCommandStatus);
diff --git a/src/mongo/s/service_entry_point_mongos.cpp b/src/mongo/s/service_entry_point_mongos.cpp
index dc1f8312b97..709485758b0 100644
--- a/src/mongo/s/service_entry_point_mongos.cpp
+++ b/src/mongo/s/service_entry_point_mongos.cpp
@@ -81,7 +81,7 @@ DbResponse ServiceEntryPointMongos::handleRequest(OperationContext* opCtx, const
ClusterLastErrorInfo::get(client) = std::make_shared<ClusterLastErrorInfo>();
}
ClusterLastErrorInfo::get(client)->newRequest();
- LastError::get(client).startRequest();
+ LastError::get(client).startTopLevelRequest();
AuthorizationSession::get(opCtx->getClient())->startRequest(opCtx);
DbMessage dbm(message);
diff --git a/src/mongo/util/net/op_msg_integration_test.cpp b/src/mongo/util/net/op_msg_integration_test.cpp
index 3d83243c1a4..3942c5dcf76 100644
--- a/src/mongo/util/net/op_msg_integration_test.cpp
+++ b/src/mongo/util/net/op_msg_integration_test.cpp
@@ -33,6 +33,8 @@
#include "mongo/unittest/integration_test.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/net/op_msg.h"
+#include "mongo/util/scopeguard.h"
+
namespace mongo {
@@ -64,4 +66,103 @@ TEST(OpMsg, UnknownOptionalFlagIsIgnored) {
conn->parseCommandReplyMessage(conn->getServerAddress(), reply)->getCommandReply()));
}
+TEST(OpMsg, FireAndForgetInsertWorks) {
+ std::string errMsg;
+ auto conn = std::unique_ptr<DBClientBase>(
+ unittest::getFixtureConnectionString().connect("integration_test", errMsg));
+ uassert(ErrorCodes::SocketException, errMsg, conn);
+
+ conn->dropCollection("test.collection");
+
+ conn->runFireAndForgetCommand(OpMsgRequest::fromDBAndBody("test", fromjson(R"({
+ insert: "collection",
+ writeConcern: {w: 0},
+ documents: [
+ {a: 1}
+ ]
+ })")));
+
+ ASSERT_EQ(conn->count("test.collection"), 1u);
+}
+
+TEST(OpMsg, CloseConnectionOnFireAndForgetNotMasterError) {
+ const auto connStr = unittest::getFixtureConnectionString();
+
+ // This test only works against a replica set.
+ if (connStr.type() != ConnectionString::SET) {
+ return;
+ }
+
+ bool foundSecondary = false;
+ for (auto host : connStr.getServers()) {
+ DBClientConnection conn;
+ uassertStatusOK(conn.connect(host, "integration_test"));
+ bool isMaster;
+ ASSERT(conn.isMaster(isMaster));
+ if (isMaster)
+ continue;
+ foundSecondary = true;
+
+ auto request = OpMsgRequest::fromDBAndBody("test", fromjson(R"({
+ insert: "collection",
+ writeConcern: {w: 0},
+ documents: [
+ {a: 1}
+ ]
+ })")).serialize();
+
+ // Round-trip command fails with NotMaster error. Note that this failure is in command
+ // dispatch which ignores w:0.
+ Message reply;
+ ASSERT(conn.call(request, reply, /*assertOK*/ true, nullptr));
+ ASSERT_EQ(
+ getStatusFromCommandResult(
+ conn.parseCommandReplyMessage(conn.getServerAddress(), reply)->getCommandReply()),
+ ErrorCodes::NotMaster);
+
+ // Fire-and-forget closes connection when it sees that error. Note that this is using call()
+ // rather than say() so that we get an error back when the connection is closed. Normally
+ // using call() if kMoreToCome set results in blocking forever.
+ OpMsg::setFlag(&request, OpMsg::kMoreToCome);
+ ASSERT(!conn.call(request, reply, /*assertOK*/ false, nullptr));
+
+ uassertStatusOK(conn.connect(host, "integration_test")); // Reconnect.
+
+ // Disable eager checking of master to simulate a stepdown occurring after the check. This
+ // should respect w:0.
+ BSONObj output;
+ ASSERT(conn.runCommand("admin",
+ fromjson(R"({
+ configureFailPoint: 'skipCheckingForNotMasterInCommandDispatch',
+ mode: 'alwaysOn'
+ })"),
+ output))
+ << output;
+ ON_BLOCK_EXIT([&] {
+ ASSERT(conn.runCommand("admin",
+ fromjson(R"({
+ configureFailPoint:
+ 'skipCheckingForNotMasterInCommandDispatch',
+ mode: 'off'
+ })"),
+ output))
+ << output;
+ });
+
+
+ // Round-trip command claims to succeed due to w:0.
+ OpMsg::replaceFlags(&request, 0);
+ ASSERT(conn.call(request, reply, /*assertOK*/ true, nullptr));
+ ASSERT_OK(getStatusFromCommandResult(
+ conn.parseCommandReplyMessage(conn.getServerAddress(), reply)->getCommandReply()));
+
+ // Fire-and-forget should still close connection.
+ OpMsg::setFlag(&request, OpMsg::kMoreToCome);
+ ASSERT(!conn.call(request, reply, /*assertOK*/ false, nullptr));
+
+ break;
+ }
+ ASSERT(foundSecondary);
+}
+
} // namespace mongo