summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjannaerin <golden.janna@gmail.com>2020-04-16 11:59:42 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-04-20 16:40:41 +0000
commitae194cbab3b84a145d2e5b585c4dd0a261830675 (patch)
treebf243d81744f7539cfc3d96ca32711db718d3805
parente84cf820af0e835b822f6599794ce8f4568c56b8 (diff)
downloadmongo-ae194cbab3b84a145d2e5b585c4dd0a261830675.tar.gz
SERVER-47045 Add tests to check that the RSM behaves correctly when contacting a mongod fails for various reasons
-rw-r--r--jstests/sharding/awaitable_isMaster_primary_failures.js67
-rw-r--r--src/mongo/transport/service_state_machine.cpp9
2 files changed, 74 insertions, 2 deletions
diff --git a/jstests/sharding/awaitable_isMaster_primary_failures.js b/jstests/sharding/awaitable_isMaster_primary_failures.js
new file mode 100644
index 00000000000..00175bfd9b7
--- /dev/null
+++ b/jstests/sharding/awaitable_isMaster_primary_failures.js
@@ -0,0 +1,67 @@
+/**
+ * Test to assert that the RSM behaves correctly when contacting the primary node fails in various
+ * ways.
+ */
+
+// Checking UUID consistency and orphans involves talking to a shard node, which in this test is
+// shut down.
+TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
+TestData.skipCheckOrphans = true;
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+
+let st = new ShardingTest({shards: {rs0: {nodes: 1}}});
+let mongos = st.s;
+let rsPrimary = st.rs0.getPrimary();
+
+// Make sure mongos knows who the primary is before any failure is injected.
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true});
+
+// Turn on the waitInIsMaster failpoint. This will cause the primary node to cease sending isMaster
+// responses, and the RSM should mark the node as down.
+jsTestLog("Turning on waitInIsMaster failpoint. Node should stop sending isMaster responses.");
+const isMasterFailpoint = configureFailPoint(rsPrimary, "waitInIsMaster");
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: false, ismaster: false});
+isMasterFailpoint.off();
+
+// Wait for mongos to find out the node is still primary.
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true});
+
+// Force the primary node to fail all isMaster requests. The RSM should mark the node as down.
+jsTestLog("Turning on failCommand failpoint. Node should fail all isMaster responses.");
+const failCmdFailpoint = configureFailPoint(
+ rsPrimary,
+ "failCommand",
+ {errorCode: ErrorCodes.CommandFailed, failCommands: ["isMaster"], failInternalCommands: true});
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: false, ismaster: false});
+failCmdFailpoint.off();
+
+// Wait for mongos to find out the node is still primary.
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true});
+
+// Force the primary node to end the isMaster stream by not setting the 'moreToCome' bit on the
+// response. The RSM should not mark the server as down or unknown and should continue monitoring
+// the node.
+jsTestLog(
+ "Turning on doNotSetMoreToCome failpoint. Node should return successful isMaster responses.");
+const moreToComeFailpoint = configureFailPoint(rsPrimary, "doNotSetMoreToCome");
+// Wait for maxAwaitTimeMS to guarantee that mongos has received at least one isMaster response from
+// the primary without the moreToCome bit set.
+sleep(10000);
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true});
+moreToComeFailpoint.off();
+
+// Wait for mongos to find out the node is still primary.
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: true, ismaster: true});
+
+// Shut down the primary node. The RSM should mark the node as down.
+jsTestLog("Shutting down primary node.");
+st.rs0.stop(0);
+awaitRSClientHosts(mongos, {host: rsPrimary.name}, {ok: false});
+
+st.stop();
+}());
diff --git a/src/mongo/transport/service_state_machine.cpp b/src/mongo/transport/service_state_machine.cpp
index 6f24f61213b..2f4e77ac0fc 100644
--- a/src/mongo/transport/service_state_machine.cpp
+++ b/src/mongo/transport/service_state_machine.cpp
@@ -53,11 +53,13 @@
#include "mongo/util/concurrency/thread_name.h"
#include "mongo/util/debug_util.h"
#include "mongo/util/exit.h"
+#include "mongo/util/fail_point.h"
#include "mongo/util/net/socket_exception.h"
#include "mongo/util/quick_exit.h"
namespace mongo {
namespace {
+MONGO_FAIL_POINT_DEFINE(doNotSetMoreToCome);
/**
* Creates and returns a legacy exhaust message, if exhaust is allowed. The returned message is to
* be used as the subsequent 'synthetic' exhaust request. Returns an empty message if exhaust is not
@@ -146,8 +148,11 @@ Message makeExhaustMessage(Message requestMsg, DbResponse* dbresponse) {
}
OpMsg::removeChecksum(&dbresponse->response);
- // Indicate that the response is part of an exhaust stream. Re-checksum if needed.
- OpMsg::setFlag(&dbresponse->response, OpMsg::kMoreToCome);
+ // Indicate that the response is part of an exhaust stream (unless the 'doNotSetMoreToCome'
+ // failpoint is set). Re-checksum if needed.
+ if (!MONGO_unlikely(doNotSetMoreToCome.shouldFail())) {
+ OpMsg::setFlag(&dbresponse->response, OpMsg::kMoreToCome);
+ }
if (checksumPresent) {
OpMsg::appendChecksum(&dbresponse->response);
}