diff options
author | jannaerin <golden.janna@gmail.com> | 2020-04-16 11:59:42 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-20 16:40:41 +0000 |
commit | ae194cbab3b84a145d2e5b585c4dd0a261830675 (patch) | |
tree | bf243d81744f7539cfc3d96ca32711db718d3805 | |
parent | e84cf820af0e835b822f6599794ce8f4568c56b8 (diff) | |
download | mongo-ae194cbab3b84a145d2e5b585c4dd0a261830675.tar.gz |
SERVER-47045 Add tests to check that the RSM behaves correctly when contacting a mongod fails for various reasons
-rw-r--r-- | jstests/sharding/awaitable_isMaster_primary_failures.js | 67 | ||||
-rw-r--r-- | src/mongo/transport/service_state_machine.cpp | 9 |
2 files changed, 74 insertions, 2 deletions
diff --git a/jstests/sharding/awaitable_isMaster_primary_failures.js b/jstests/sharding/awaitable_isMaster_primary_failures.js new file mode 100644 index 00000000000..00175bfd9b7 --- /dev/null +++ b/jstests/sharding/awaitable_isMaster_primary_failures.js @@ -0,0 +1,67 @@ +/** + * Test to assert that the RSM behaves correctly when contacting the primary node fails in various + * ways. + */ + +// Checking UUID consistency and orphans involves talking to a shard node, which in this test is +// shutdown +TestData.skipCheckingUUIDsConsistentAcrossCluster = true; +TestData.skipCheckOrphans = true; + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); + +let st = new ShardingTest({shards: {rs0: {nodes: 1}}}); +let mongos = st.s; +let rsPrimary = st.rs0.getPrimary(); + +// Make sure mongos knows who the primary is +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true}); + +// Turn on the waitInIsMaster failpoint. This will cause the primary node to cease sending isMaster +// responses and the RSM should mark the node as down +jsTestLog("Turning on waitInIsMaster failpoint. Node should stop sending isMaster responses."); +const isMasterFailpoint = configureFailPoint(rsPrimary, "waitInIsMaster"); +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: false, ismaster: false}); +isMasterFailpoint.off(); + +// Wait for mongos to find out the node is still primary +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true}); + +// Force the primary node to fail all isMaster requests. The RSM should mark the node as down. +jsTestLog("Turning on failCommand failpoint. Node should fail all isMaster responses."); +const failCmdFailpoint = configureFailPoint( + rsPrimary, + "failCommand", + {errorCode: ErrorCodes.CommandFailed, failCommands: ["isMaster"], failInternalCommands: true}); +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: false, ismaster: false}); +failCmdFailpoint.off(); + +// Wait for mongos to find out the node is still primary +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true}); + +// Force the primary node to end the isMaster stream by not setting the 'moreToCome' bit on the +// response. The RSM should not mark the server as down or unknown and should continue monitoring +// the node. +jsTestLog( + "Turning on doNotSetMoreToCome failpoint. Node should return successful isMaster responses."); +const moreToComeFailpoint = configureFailPoint(rsPrimary, "doNotSetMoreToCome"); +// Wait for maxAwaitTimeMS to guarantee that mongos has received at least one isMaster response from +// the primary without the moreToCome bit set. +sleep(10000); +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true}); +moreToComeFailpoint.off(); + +// Wait for mongos to find out the node is still primary +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true}); + +// Shutdown the primary node. The RSM should mark the node as down. 
+jsTestLog("Shutting down primary node."); +st.rs0.stop(0); +awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: false}); + +st.stop(); +}()); diff --git a/src/mongo/transport/service_state_machine.cpp b/src/mongo/transport/service_state_machine.cpp index 6f24f61213b..2f4e77ac0fc 100644 --- a/src/mongo/transport/service_state_machine.cpp +++ b/src/mongo/transport/service_state_machine.cpp @@ -53,11 +53,13 @@ #include "mongo/util/concurrency/thread_name.h" #include "mongo/util/debug_util.h" #include "mongo/util/exit.h" +#include "mongo/util/fail_point.h" #include "mongo/util/net/socket_exception.h" #include "mongo/util/quick_exit.h" namespace mongo { namespace { +MONGO_FAIL_POINT_DEFINE(doNotSetMoreToCome); /** * Creates and returns a legacy exhaust message, if exhaust is allowed. The returned message is to * be used as the subsequent 'synthetic' exhaust request. Returns an empty message if exhaust is not @@ -146,8 +148,11 @@ Message makeExhaustMessage(Message requestMsg, DbResponse* dbresponse) { } OpMsg::removeChecksum(&dbresponse->response); - // Indicate that the response is part of an exhaust stream. Re-checksum if needed. - OpMsg::setFlag(&dbresponse->response, OpMsg::kMoreToCome); + // Indicate that the response is part of an exhaust stream (unless the 'doNotSetMoreToCome' + // failpoint is set). Re-checksum if needed. + if (!MONGO_unlikely(doNotSetMoreToCome.shouldFail())) { + OpMsg::setFlag(&dbresponse->response, OpMsg::kMoreToCome); + } if (checksumPresent) { OpMsg::appendChecksum(&dbresponse->response); } |