author     Andrew Shuvalov <andrew.shuvalov@mongodb.com>     2021-06-01 21:41:48 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-06-01 21:54:15 +0000
commit     b71fa542988c92480da3b2008f8247f40584f151 (patch)
tree       f88d91b40d86caf924ce146bf36e492583e102c2
parent     2e9443c048cf3dd5199fd19f9e532f3f946ffd43 (diff)
download   mongo-b71fa542988c92480da3b2008f8247f40584f151.tar.gz
SERVER-56489: New pass-through test with random hello server-side delays
-rw-r--r--  buildscripts/resmokeconfig/suites/sharding_hello_failures.yml    54
-rw-r--r--  buildscripts/resmokelib/testing/hooks/hello_failures.py          42
-rw-r--r--  etc/evergreen.yml                                                 14
-rw-r--r--  jstests/hooks/run_cleanup_hello_failures.js                       36
-rw-r--r--  jstests/hooks/run_inject_hello_failures.js                       165
-rw-r--r--  jstests/libs/fixture_helpers.js                                    27
-rw-r--r--  src/mongo/client/replica_set_monitor.cpp                           13
-rw-r--r--  src/mongo/db/repl/replication_info.cpp                             53
-rw-r--r--  src/mongo/util/exit.cpp                                             1
9 files changed, 395 insertions(+), 10 deletions(-)
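In outline, the new hook drives the blockOrFailHelloCommand fail point (added in src/mongo/db/repl/replication_info.cpp below) from the shell. A minimal illustrative sketch, not part of the commit; the delay value here is arbitrary, while the fail point name and data arguments are the ones this commit adds:

    // Make this node stall hello responses sent to internal clients.
    const admin = db.getSiblingDB('admin');
    assert.commandWorked(admin.runCommand({
        configureFailPoint: 'blockOrFailHelloCommand',
        mode: 'alwaysOn',
        data: {delay: 60 * 1000, internalClient: 1},  // Delay in milliseconds.
    }));
    // ... run a workload while replica set monitors see hello time out ...
    assert.commandWorked(
        admin.runCommand({configureFailPoint: 'blockOrFailHelloCommand', mode: 'off'}));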
diff --git a/buildscripts/resmokeconfig/suites/sharding_hello_failures.yml b/buildscripts/resmokeconfig/suites/sharding_hello_failures.yml
new file mode 100644
index 00000000000..37c8cf51db4
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/sharding_hello_failures.yml
@@ -0,0 +1,54 @@
+test_kind: fsm_workload_test
+
+selector:
+  roots:
+  - jstests/concurrency/fsm_workloads/explain_update.js
+  - jstests/concurrency/fsm_workloads/update_and_bulk_insert.js
+  - jstests/concurrency/fsm_workloads/update_array.js
+  - jstests/concurrency/fsm_workloads/update_array_noindex.js
+  - jstests/concurrency/fsm_workloads/update_check_index.js
+  - jstests/concurrency/fsm_workloads/update_inc.js
+  - jstests/concurrency/fsm_workloads/update_inc_capped.js
+  - jstests/concurrency/fsm_workloads/update_multifield.js
+  - jstests/concurrency/fsm_workloads/update_multifield_multiupdate.js
+  - jstests/concurrency/fsm_workloads/update_multifield_multiupdate_noindex.js
+  - jstests/concurrency/fsm_workloads/update_multifield_noindex.js
+  - jstests/concurrency/fsm_workloads/update_ordered_bulk_inc.js
+  - jstests/concurrency/fsm_workloads/update_rename.js
+  - jstests/concurrency/fsm_workloads/update_rename_noindex.js
+  - jstests/concurrency/fsm_workloads/update_replace.js
+  - jstests/concurrency/fsm_workloads/update_replace_noindex.js
+  - jstests/concurrency/fsm_workloads/update_simple.js
+  - jstests/concurrency/fsm_workloads/update_simple_eval.js
+  - jstests/concurrency/fsm_workloads/update_simple_eval_nolock.js
+  - jstests/concurrency/fsm_workloads/update_simple_noindex.js
+  - jstests/concurrency/fsm_workloads/update_where.js
+
+executor:
+  hooks:
+  - class: HelloDelays
+  fixture:
+    class: ShardedClusterFixture
+    mongos_options:
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          verbosity: 0
+          command: 1
+          network:
+            verbosity: 1
+            asio: 2
+    mongod_options:
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          verbosity: 0
+          command: 1
+          network:
+            verbosity: 1
+            asio: 2
+    enable_sharding:
+    - test
+    num_rs_nodes_per_shard: 3
+    shard_options:
+      all_nodes_electable: true
diff --git a/buildscripts/resmokelib/testing/hooks/hello_failures.py b/buildscripts/resmokelib/testing/hooks/hello_failures.py
new file mode 100644
index 00000000000..84b1b5886b6
--- /dev/null
+++ b/buildscripts/resmokelib/testing/hooks/hello_failures.py
@@ -0,0 +1,42 @@
+"""Test hook that injects delays into the server-side Hello command handler."""
+
+from __future__ import absolute_import
+
+import os
+
+from buildscripts.resmokelib import errors
+from buildscripts.resmokelib import utils
+from buildscripts.resmokelib.testing.hooks import interface
+from buildscripts.resmokelib.testing.fixtures import replicaset
+from buildscripts.resmokelib.testing.fixtures import shardedcluster
+
+from . import interface
+from . import jsfile
+
+
+class HelloDelays(interface.Hook):
+    """Sets Hello fault injections."""
+
+    def __init__(self, hook_logger, fixture):
+        """Initialize HelloDelays."""
+        description = "Sets Hello fault injections"
+        interface.Hook.__init__(self, hook_logger, fixture, description)
+        self.js_filename = os.path.join("jstests", "hooks", "run_inject_hello_failures.js")
+        self.cleanup_js_filename = os.path.join("jstests", "hooks", "run_cleanup_hello_failures.js")
+        self.shell_options = None
+
+    def before_test(self, test, test_report):
+        """Each test will call this before it executes."""
+        print('before_test hook starts injecting Hello failures')
+        hook_test_case = jsfile.DynamicJSTestCase.create_before_test(
+            self.logger.test_case_logger, test, self, self.js_filename, self.shell_options)
+        hook_test_case.configure(self.fixture)
+        hook_test_case.run_dynamic_test(test_report)
+
+    def after_test(self, test, test_report):
+        """Each test will call this after it executes."""
+        print('Cleanup hook starts removing Hello fail injections')
+        hook_test_case = jsfile.DynamicJSTestCase.create_after_test(
+            self.logger.test_case_logger, test, self, self.cleanup_js_filename, self.shell_options)
+        hook_test_case.configure(self.fixture)
+        hook_test_case.run_dynamic_test(test_report)
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 90186d00ba3..d7b05561562 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -4775,6 +4775,16 @@ tasks:
     fallback_num_sub_suites: 4

 - <<: *task_template
+  name: sharding_hello_failures
+  depends_on:
+  - name: jsCore
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=sharding_hello_failures --storageEngine=wiredTiger
+
+- <<: *task_template
   name: parallel
   depends_on:
   - name: jsCore
@@ -7366,6 +7376,7 @@ buildvariants:
   - name: sharding_auth_gen
   - name: sharding_auth_audit_gen
   - name: sharding_ese_gen
+  - name: sharding_hello_failures
   - name: slow1_gen
   - name: serial_run
   - name: sharded_collections_jscore_passthrough
@@ -7469,6 +7480,7 @@
   - name: sharding_jscore_passthrough
   - name: sharding_jscore_op_query_passthrough
   - name: sharding_jscore_passthrough_wire_ops_gen
+  - name: sharding_hello_failures
   - name: ssl_gen
   - name: sslSpecial_gen
   - name: tool
@@ -8394,6 +8406,7 @@
   - name: parallel_gen
   - name: parallel_compatibility
   - name: read_concern_linearizable_passthrough
+  - name: sharding_hello_failures
     distros:
     - rhel62-large
   - name: read_concern_majority_passthrough_gen
@@ -9022,6 +9035,7 @@
   - name: sasl
   - name: sharding_auth_gen
   - name: sharding_auth_audit_gen
+  - name: sharding_hello_failures
   - name: snmp
   - name: ssl_gen
   - name: sslSpecial_gen
diff --git a/jstests/hooks/run_cleanup_hello_failures.js b/jstests/hooks/run_cleanup_hello_failures.js
new file mode 100644
index 00000000000..2807f3bce92
--- /dev/null
+++ b/jstests/hooks/run_cleanup_hello_failures.js
@@ -0,0 +1,36 @@
+'use strict';
+
+function cleanupHelloFailInjection(connection) {
+    jsTestLog(`Cleanup Hello fail injection in ${connection}`);
+    let adminDB = connection.getDB('admin');
+    assert.commandWorked(
+        adminDB.runCommand({configureFailPoint: 'blockOrFailHelloCommand', mode: "off"}));
+    const res = assert.commandWorked(
+        adminDB.runCommand({getParameter: 1, "failpoint.blockOrFailHelloCommand": 1}));
+    assert.eq(res["failpoint.blockOrFailHelloCommand"].mode, 0);
+}
+
+function doFailInjectionCleanup(db) {
+    let connectionsToPrimaries = FixtureHelpers.getPrimaries(db);
+
+    for (let connection of connectionsToPrimaries.concat(FixtureHelpers.getSecondaries(db))) {
+        cleanupHelloFailInjection(connection);
+    }
+}
+
+(function() {
+    load('jstests/libs/discover_topology.js');  // For Topology and DiscoverTopology.
+    load('jstests/libs/fixture_helpers.js');    // For FixtureHelpers.
+
+    assert.eq(typeof db, 'object', 'Invalid `db` object, is the shell connected to a mongod?');
+    const cmdLineOpts = db.adminCommand('getCmdLineOpts');
+    const topology = DiscoverTopology.findConnectedNodes(db.getMongo());
+    jsTestLog(`Run Hello test suite cleanup in ${JSON.stringify(topology)},
+        invoked with ${JSON.stringify(cmdLineOpts)},
+        topology type ${topology.type}`);
+
+    if (topology.type === Topology.kShardedCluster) {
+        doFailInjectionCleanup(db);
+    }
+    jsTestLog(`Hello cleanup hook completed`);
+})();
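As an aside, the primaries-only half of this cleanup could also be written with the pre-existing FixtureHelpers.runCommandOnEachPrimary helper; a sketch, assuming the shell is connected to a mongos, though the hook above deliberately visits the secondaries as well:

    load('jstests/libs/fixture_helpers.js');
    // Disable the fail point on the primary of every shard in one call.
    FixtureHelpers.runCommandOnEachPrimary({
        db: db.getSiblingDB('admin'),
        cmdObj: {configureFailPoint: 'blockOrFailHelloCommand', mode: 'off'},
    });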
diff --git a/jstests/hooks/run_inject_hello_failures.js b/jstests/hooks/run_inject_hello_failures.js
new file mode 100644
index 00000000000..93c1b97f32a
--- /dev/null
+++ b/jstests/hooks/run_inject_hello_failures.js
@@ -0,0 +1,165 @@
+'use strict';
+
+// Interval between test loops.
+const kTestLoopPeriodMs = 20 * 1000;
+
+// Sleep injected into the Hello response on the server side; long enough to
+// outlast any test, i.e. effectively indefinite.
+const kInjectedHelloDelayMs = 6000 * 1000;
+
+// How many times the fail injection - stepdown cycle is repeated.
+const kTestLoops = 1;
+
+// The refresh timeout will be reduced to this interval.
+const kRefreshTimeoutSec = 1;
+
+// The connection could be to a 'mongos' or a 'mongod'.
+function getAdminDB(connection) {
+    let adminDB;
+    if (typeof connection.getDB === 'function') {
+        adminDB = connection.getDB('admin');
+    } else {
+        assert(typeof connection.getSiblingDB === 'function',
+               `Cannot get Admin DB from ${tojson(connection)}`);
+        adminDB = connection.getSiblingDB('admin');
+    }
+    return adminDB;
+}
+
+function stepDown(connection) {
+    jsTestLog(`Force stepDown on ${connection}`);
+    const adminDB = getAdminDB(connection);
+    let res;
+    let error;
+    try {
+        res = adminDB.runCommand({replSetStepDown: 10, force: true, secondaryCatchUpPeriodSecs: 8});
+        error = res;
+    } catch (e) {
+        error = e;
+        jsTestLog(`A step down error is usually benign: ${error}`);
+    }
+    if (error && (error.code == undefined || error.code == ErrorCodes.HostUnreachable)) {
+        jsTestLog(`Transient error ${error}`);
+        return;
+    }
+    assert.commandWorked(res);
+    jsTestLog(`Forced step down on ${connection}, result ${res}`);
+}
+
+function stepUp(connection) {
+    const adminDB = getAdminDB(connection);
+    assert.soonNoExcept(() => {
+        const res = adminDB.runCommand({replSetStepUp: 1});
+        if (!res.ok) {
+            jsTestLog(`Failed to step up with ${res}`);
+        }
+        return res.ok;
+    }, "Failed to step up");
+    jsTestLog(`Forced step up on ${connection}`);
+}
+
+// The default interval of 30 sec between RSM refresh cycles is too long for
+// this test.
+function injectReduceRefreshPeriod(connection) {
+    jsTestLog(`Reduce refresh interval for ${connection}`);
+    const adminDB = getAdminDB(connection);
+    assert.commandWorked(adminDB.runCommand({
+        configureFailPoint: "modifyReplicaSetMonitorDefaultRefreshPeriod",
+        mode: "alwaysOn",
+        data: {period: kRefreshTimeoutSec},
+    }));
+    const res = adminDB.runCommand(
+        {getParameter: 1, "failpoint.modifyReplicaSetMonitorDefaultRefreshPeriod": 1});
+    assert.commandWorked(res);
+    assert.eq(res["failpoint.modifyReplicaSetMonitorDefaultRefreshPeriod"].mode, 1);
+}
+
+function injectHelloFail(connection) {
+    jsTestLog(`Inject Hello fail into connection ${connection}`);
+    const adminDB = getAdminDB(connection);
+    assert.commandWorked(adminDB.runCommand({
+        configureFailPoint: 'blockOrFailHelloCommand',
+        mode: "alwaysOn",
+        data: {delay: kInjectedHelloDelayMs, internalClient: 1}
+    }));
+    const res = adminDB.runCommand({getParameter: 1, "failpoint.blockOrFailHelloCommand": 1});
+    assert.commandWorked(res);
+    assert.eq(res["failpoint.blockOrFailHelloCommand"].mode, 1);
+}
+
+function freeze(connection) {
+    const adminDB = getAdminDB(connection);
+    assert.commandWorked(adminDB.runCommand({replSetFreeze: 20}));
+}
+
+function getConfigServer(connection) {
+    const adminDB = getAdminDB(connection);
+    const res = assert.commandWorked(adminDB.runCommand({serverStatus: 1}))
+                    .sharding.configsvrConnectionString;
+    const rx = /.*\/(.*)/g;
+    const arr = rx.exec(res);
+    jsTestLog(`Config server: ${arr[1]} extracted from ${tojson(res)}`);
+    return new Mongo(arr[1]);
+}
+
+function doFailInjectionLoop(db) {
+    for (let counter = 0; counter < kTestLoops; ++counter) {
+        let connectionsToPrimaries;
+        let connectionsToSecondaries = [];
+        let configServer;
+        try {
+            connectionsToPrimaries = FixtureHelpers.getPrimaries(db);
+            let allReplicaSets = FixtureHelpers.getAllReplicas(db);
+            for (let replicaSet of allReplicaSets) {
+                connectionsToSecondaries.push(replicaSet.getSecondaries());
+            }
+            configServer = getConfigServer(db);
+        } catch (e) {
+            jsTestLog(`Cannot fetch primaries or secondaries: ${e}`);
+            sleep(kTestLoopPeriodMs);
+            continue;
+        }
+        // This reduces the refresh timeout on the mongos and the config server as well.
+        injectReduceRefreshPeriod(db);
+        injectReduceRefreshPeriod(configServer);
+        for (let connection of connectionsToPrimaries.concat(FixtureHelpers.getSecondaries(db))) {
+            injectReduceRefreshPeriod(connection);
+        }
+        // The tests usually have a 10-20 sec timeout on operations, while the default
+        // refresh period is 30 sec. After reducing the refresh timeout we need to wait
+        // for the previously scheduled refreshes to come due before injecting the Hello
+        // delay failure.
+        sleep(25 * 1000);
+        for (let connection of connectionsToPrimaries) {
+            injectHelloFail(connection);
+        }
+        for (let connection of connectionsToPrimaries) {
+            stepDown(connection);
+            freeze(connection);
+        }
+        for (let arrayOfSecondaries of connectionsToSecondaries) {
+            for (let connection of arrayOfSecondaries) {
+                stepUp(connection);
+                break;  // For each replica set pick one secondary.
+            }
+        }
+        sleep(kTestLoopPeriodMs);
+    }
+}
+
+(function() {
+    load('jstests/libs/discover_topology.js');  // For Topology and DiscoverTopology.
+    load('jstests/libs/fixture_helpers.js');    // For FixtureHelpers.
+
+    assert.eq(typeof db, 'object', 'Invalid `db` object, is the shell connected to a mongod?');
+    const cmdLineOpts = db.adminCommand('getCmdLineOpts');
+    const topology = DiscoverTopology.findConnectedNodes(db.getMongo());
+    jsTestLog(`Run Hello fail injection in ${JSON.stringify(topology)},
+        invoked with ${JSON.stringify(cmdLineOpts)},
+        topology type ${topology.type}`);
+
+    if (topology.type === Topology.kShardedCluster) {
+        doFailInjectionLoop(db);
+    }
+    jsTestLog(`Hello fail hook completed`);
+})();
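The hook passes internalClient: 1 so that only hello checks from cluster-internal clients (such as the ReplicaSetMonitor) stall, while the workload's own connections keep working. The fail point handler added below also accepts the complementary notInternalClient filter; a hypothetical sketch, not used by this commit, that would instead stall only external clients such as the shell:

    assert.commandWorked(db.adminCommand({
        configureFailPoint: 'blockOrFailHelloCommand',
        mode: 'alwaysOn',
        data: {delay: 5 * 1000, notInternalClient: 1},  // Arbitrary 5 second delay.
    }));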
diff --git a/jstests/libs/fixture_helpers.js b/jstests/libs/fixture_helpers.js
index a1b0bbbb968..972e5c77726 100644
--- a/jstests/libs/fixture_helpers.js
+++ b/jstests/libs/fixture_helpers.js
@@ -19,7 +19,7 @@ var FixtureHelpers = (function() {
      * Returns an array of connections to each data-bearing replica set in the fixture (not
      * including the config servers).
      */
-    function _getAllReplicas(db) {
+    function getAllReplicas(db) {
         let replicas = [];
         if (isMongos(db)) {
             const shardObjs = db.getSiblingDB("config").shards.find().sort({_id: 1});
@@ -36,7 +36,7 @@ var FixtureHelpers = (function() {
      * Asserts if the fixture is a standalone or if the shards are standalones.
      */
     function awaitReplication(db) {
-        _getAllReplicas(db).forEach((replSet) => replSet.awaitReplication());
+        getAllReplicas(db).forEach((replSet) => replSet.awaitReplication());
     }

@@ -47,7 +47,7 @@ var FixtureHelpers = (function() {
      * Asserts if the fixture is a standalone or if the shards are standalones.
      */
     function awaitLastOpCommitted(db) {
-        _getAllReplicas(db).forEach((replSet) => replSet.awaitLastOpCommitted());
+        getAllReplicas(db).forEach((replSet) => replSet.awaitLastOpCommitted());
     }

@@ -87,7 +87,7 @@ var FixtureHelpers = (function() {
      * replica set. Asserts if the fixture is a standalone or if the shards are standalones.
      */
     function runCommandOnEachPrimary({db, cmdObj}) {
-        return _getAllReplicas(db).map(
+        return getAllReplicas(db).map(
             (replSet) =>
                 assert.commandWorked(replSet.getPrimary().getDB(db.getName()).runCommand(cmdObj)));
     }
@@ -113,6 +113,22 @@ var FixtureHelpers = (function() {
     }

     /**
+     * Returns an array of connections to each primary in the cluster.
+     */
+    function getPrimaries(db) {
+        return getAllReplicas(db).map((replSet) => replSet.getPrimary());
+    }
+
+    /**
+     * Returns an array of connections to the secondaries in the cluster.
+     */
+    function getSecondaries(db) {
+        return getAllReplicas(db).reduce((array, replSet) => {
+            return array.concat(replSet.getSecondaries());
+        }, []);
+    }
+
+    /**
      * Returns true if we have a replica set.
      */
     function isReplSet(db) {
@@ -128,6 +144,9 @@ var FixtureHelpers = (function() {
         awaitReplication: awaitReplication,
         awaitLastOpCommitted: awaitLastOpCommitted,
         runCommandOnEachPrimary: runCommandOnEachPrimary,
+        getAllReplicas: getAllReplicas,
+        getPrimaries: getPrimaries,
+        getSecondaries: getSecondaries,
         getPrimaryForNodeHostingDatabase: getPrimaryForNodeHostingDatabase,
         isReplSet: isReplSet,
     };
diff --git a/src/mongo/client/replica_set_monitor.cpp b/src/mongo/client/replica_set_monitor.cpp
index 7d01c32b2c5..339ea0316f1 100644
--- a/src/mongo/client/replica_set_monitor.cpp
+++ b/src/mongo/client/replica_set_monitor.cpp
@@ -69,9 +69,8 @@ using std::vector;
 // Failpoint for disabling AsyncConfigChangeHook calls on updated RS nodes.
 MONGO_FAIL_POINT_DEFINE(failAsyncConfigChangeHook);

-// Failpoint for changing the default refresh period
+// Failpoint for changing the default refresh period.
 MONGO_FAIL_POINT_DEFINE(modifyReplicaSetMonitorDefaultRefreshPeriod);
-
 // Failpoint for changing the default socket timeout for Hello command.
 MONGO_FAIL_POINT_DEFINE(modifyReplicaSetMonitorHelloTimeout);

@@ -91,8 +90,6 @@ using executor::TaskExecutor;
 using CallbackArgs = TaskExecutor::CallbackArgs;
 using CallbackHandle = TaskExecutor::CallbackHandle;

-const double socketTimeoutSecs = 5;
-
 // Intentionally chosen to compare worse than all known latencies.
 const int64_t unknownLatency = numeric_limits<int64_t>::max();

@@ -293,7 +290,13 @@ void ReplicaSetMonitor::_scheduleRefresh(Date_t when) {
 void ReplicaSetMonitor::_doScheduledRefresh(const CallbackHandle& currentHandle) {
     startOrContinueRefresh().refreshAll();

-    // And now we set up the next one
+    // And now we set up the next one.
+    const auto defaultRefreshPeriod = getRefreshPeriod();
+    if (_state->refreshPeriod != defaultRefreshPeriod) {
+        _state->refreshPeriod =
+            defaultRefreshPeriod;  // Path executed in tests with fail injection.
+        log() << "Changed refresh period for " << _state->name;
+    }
     _scheduleRefresh(_executor->now() + _state->refreshPeriod);
 }
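The _doScheduledRefresh() change above is what lets the hook shrink the refresh period of monitors that already exist: the period is re-read from getRefreshPeriod() on every cycle, so flipping the fail point takes effect on the next scheduled refresh rather than only for newly built monitors. From the shell that looks like the following (illustrative; the period value is the one the hook uses):

    assert.commandWorked(db.adminCommand({
        configureFailPoint: 'modifyReplicaSetMonitorDefaultRefreshPeriod',
        mode: 'alwaysOn',
        data: {period: 1},  // Refresh every second instead of every 30 seconds.
    }));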
diff --git a/src/mongo/db/repl/replication_info.cpp b/src/mongo/db/repl/replication_info.cpp
index 745a33659e5..5f858ff8b77 100644
--- a/src/mongo/db/repl/replication_info.cpp
+++ b/src/mongo/db/repl/replication_info.cpp
@@ -26,7 +26,7 @@
  * exception statement from all source files in the program, then also delete
  * it in the license file.
  */
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kFTDC
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication

 #include "mongo/platform/basic.h"

@@ -59,7 +59,9 @@
 #include "mongo/executor/network_interface.h"
 #include "mongo/rpc/metadata/client_metadata.h"
 #include "mongo/rpc/metadata/client_metadata_ismaster.h"
+#include "mongo/util/exit.h"
 #include "mongo/util/fail_point_service.h"
+#include "mongo/util/log.h"
 #include "mongo/util/map_util.h"

 namespace mongo {
@@ -74,6 +76,7 @@ namespace repl {
 namespace {

 MONGO_FAIL_POINT_DEFINE(impersonateFullyUpgradedFutureVersion);
+MONGO_FAIL_POINT_DEFINE(blockOrFailHelloCommand);

 constexpr auto kHelloString = "hello"_sd;
 constexpr auto kCamelCaseIsMasterString = "isMaster"_sd;
@@ -218,6 +221,46 @@ public:
     }
 } oplogInfoServerStatus;

+// Fail point for the Hello command. Returns a sleep duration if one is needed. Supported arguments:
+//   internalClient: enabled only for internal clients
+//   notInternalClient: enabled only for non-internal clients
+//   delay: the sleep duration in milliseconds
+//   uassert: an integer error code to pass to uasserted and throw
+boost::optional<Milliseconds> handleHelloFailPoint(const BSONObj& args, const BSONObj& cmdObj) {
+    if (args.hasElement("internalClient") && !cmdObj.hasElement("internalClient")) {
+        log() << "Fail point Hello is disabled for external client";
+        return boost::none;  // Filtered out non-internal client.
+    }
+    if (args.hasElement("notInternalClient") && cmdObj.hasElement("internalClient")) {
+        log() << "Fail point Hello is disabled for internal client";
+        return boost::none;  // Filtered out internal client.
+    }
+    if (args.hasElement("delay")) {
+        auto millisToSleep = args["delay"].numberInt();
+        log() << "Fail point delays Hello response by " << millisToSleep << " ms";
+        return Milliseconds(millisToSleep);
+    }
+    if (args.hasElement("uassert")) {
+        log() << "Fail point fails Hello response";
+        uasserted(args["uassert"].numberInt(), "Fail point");
+    }
+    return boost::none;
+}
+
+// The sleep is implemented outside the fail point handler itself so that the fail
+// point's state is not held blocked while sleeping.
+void sleepForDurationOrUntilShutdown(Milliseconds sleep) {
+    while (sleep > Milliseconds(0) && !globalInShutdownDeprecated()) {
+        auto nextSleep = std::min(sleep, Milliseconds(1000));
+        try {
+            sleepmillis(nextSleep.count());
+            sleep -= nextSleep;
+        } catch (...) {
+            break;
+        }
+    }
+}
+
 class CmdHello : public BasicCommand {
 public:
     CmdHello() : CmdHello(kHelloString, {}) {}
@@ -265,6 +308,14 @@ public:
             clientMetadataIsMasterState.setSeenIsMaster();
         }

+        boost::optional<Milliseconds> sleepTimeout;
+        MONGO_FAIL_POINT_BLOCK(blockOrFailHelloCommand, customArgs) {
+            sleepTimeout = handleHelloFailPoint(customArgs.getData(), cmdObj);
+        }
+        if (MONGO_unlikely(sleepTimeout)) {
+            sleepForDurationOrUntilShutdown(*sleepTimeout);
+        }
+
         BSONElement element = cmdObj[kMetadataDocumentName];
         if (!element.eoo()) {
             if (seenIsMaster) {
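Besides delaying, the new fail point can make hello fail outright through its uassert argument, which none of the hooks in this commit exercise. A hedged sketch of that arm (the error code here is arbitrary):

    assert.commandWorked(db.adminCommand({
        configureFailPoint: 'blockOrFailHelloCommand',
        mode: 'alwaysOn',
        data: {uassert: 6000100},  // Arbitrary error code for uasserted().
    }));
    // A client's hello now fails with the chosen code.
    assert.commandFailedWithCode(db.adminCommand({hello: 1}), 6000100);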
+ } + if (args.hasElement("delay")) { + auto millisToSleep = args["delay"].numberInt(); + log() << "Fail point delays Hello response by " << millisToSleep << " ms"; + return Milliseconds(millisToSleep); + } + if (args.hasElement("uassert")) { + log() << "Fail point fails Hello response"; + uasserted(args["uassert"].numberInt(), "Fail point"); + } + return boost::none; +} + +// Sleep implementation outside the fail point handler itself to avoid the problem that +// processing a fail point will block its state. +void sleepForDurationOrUntilShutdown(Milliseconds sleep) { + while (sleep > Milliseconds(0) && !globalInShutdownDeprecated()) { + auto nextSleep = std::min(sleep, Milliseconds(1000)); + try { + sleepmillis(nextSleep.count()); + sleep -= nextSleep; + } catch (...) { + break; + } + } +} + class CmdHello : public BasicCommand { public: CmdHello() : CmdHello(kHelloString, {}) {} @@ -265,6 +308,14 @@ public: clientMetadataIsMasterState.setSeenIsMaster(); } + boost::optional<Milliseconds> sleepTimeout; + MONGO_FAIL_POINT_BLOCK(blockOrFailHelloCommand, customArgs) { + sleepTimeout = handleHelloFailPoint(customArgs.getData(), cmdObj); + } + if (MONGO_unlikely(sleepTimeout)) { + sleepForDurationOrUntilShutdown(*sleepTimeout); + } + BSONElement element = cmdObj[kMetadataDocumentName]; if (!element.eoo()) { if (seenIsMaster) { diff --git a/src/mongo/util/exit.cpp b/src/mongo/util/exit.cpp index fc874c0eea4..b4ff5990b55 100644 --- a/src/mongo/util/exit.cpp +++ b/src/mongo/util/exit.cpp @@ -78,6 +78,7 @@ MONGO_COMPILER_NORETURN void logAndQuickExit_inlock() { } void setShutdownFlag() { + log() << "Shutdown started"; shutdownFlag.fetchAndAdd(1); } |