diff options
author | Lingzhi Deng <lingzhi.deng@mongodb.com> | 2020-04-14 22:46:07 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-06 15:17:46 +0000 |
commit | ff38a192f588a01718f0ca259c147707819bd4cb (patch) | |
tree | 452656b2b657e5e9671b98090fb8d73db4bdd03c | |
parent | cd7deb59fb70d7c93f7b026f07e40e576ebfcecd (diff) | |
download | mongo-ff38a192f588a01718f0ca259c147707819bd4cb.tar.gz |
SERVER-47190: Shutdown command with force:true should ignore all stepdown errors
(cherry picked from commit 1563ac389bdcb08aadeb31705b2cc123742b739b)
(cherry picked from commit f60e93dd7ff8c7c9876d54e9c3c66b141eb32f03)
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 2 | ||||
-rw-r--r-- | jstests/replsets/force_shutdown_primary.js | 69 | ||||
-rw-r--r-- | src/mongo/db/commands/shutdown.h | 3 | ||||
-rw-r--r-- | src/mongo/db/commands/shutdown_d.cpp | 38 | ||||
-rw-r--r-- | src/mongo/db/db.cpp | 27 |
5 files changed, 111 insertions, 28 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index e47013f3459..3bbee5b299f 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -55,6 +55,8 @@ replica_sets_multiversion: test_file: jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js - ticket: SERVER-45143 test_file: jstests/replsets/reconfig_uses_default_protocolVersion.js +- ticket: SERVER-47190 + test_file: jstests/replsets/force_shutdown_primary.js sharding_multiversion: - ticket: SERVER-38691 diff --git a/jstests/replsets/force_shutdown_primary.js b/jstests/replsets/force_shutdown_primary.js new file mode 100644 index 00000000000..9e578c8efd2 --- /dev/null +++ b/jstests/replsets/force_shutdown_primary.js @@ -0,0 +1,69 @@ +/** + * Test that the shutdown command called on a primary node with {force: true} succeeds even if + * stepDown fails. + * + * 1. Initiate a 3-node replica set. + * 2. Block replication to secondaries. + * 3. Write to primary. + * 4. Try to shut down primary with {force: true}. + * 5. Kill the shutdown command while the shutdown command is waiting to stepDown. + * 6. Test that the primary node still shuts down. + * + * @tags: [requires_fcv_44] + */ +(function() { +"use strict"; + +load("jstests/libs/write_concern_util.js"); // for stopReplicationOnSecondaries. +const replTest = new ReplSetTest({nodes: 3}); +replTest.startSet(); +replTest.initiateWithHighElectionTimeout(); + +const primary = replTest.getPrimary(); +const testDB = primary.getDB("test"); +assert.commandWorked(testDB.foo.insert({x: 1}, {writeConcern: {w: 3}})); + +jsTestLog("Blocking replication to secondaries."); +stopReplicationOnSecondaries(replTest); + +jsTestLog("Executing write to primary."); +assert.commandWorked(testDB.foo.insert({x: 2})); + +jsTestLog("Shutting down primary in a parallel shell"); +const shutdownShell = startParallelShell(function() { + db.adminCommand({shutdown: 1, timeoutSecs: 60, force: true}); +}, primary.port); + +let shutdownOpID = -1; +let res = {}; +jsTestLog("Looking for shutdown in currentOp() output"); +assert.soon(function() { + res = primary.getDB('admin').currentOp(true); + for (const index in res.inprog) { + const entry = res.inprog[index]; + if (entry["command"] && entry["command"]["shutdown"] === 1) { + shutdownOpID = entry.opid; + return true; + } + } + return false; +}, "No shutdown command found: " + tojson(res)); + +jsTestLog("Killing shutdown command on primary."); +primary.getDB('admin').killOp(shutdownOpID); + +jsTestLog("Verifying primary shut down and cannot be connected to."); +const exitCode = shutdownShell({checkExitSuccess: false}); +assert.neq(0, exitCode, "expected shutdown to close the shell's connection"); +assert.soonNoExcept(function() { + // The parallel shell exits while shutdown is in progress, and if this happens early enough, + // the primary can still accept connections despite successfully starting to shutdown. + // So, retry connecting until connections cannot be established and an error is thrown. + assert.throws(function() { + new Mongo(primary.host); + }); + return true; +}, "expected primary node to shut down and not be connectable"); + +replTest.stopSet(); +})(); diff --git a/src/mongo/db/commands/shutdown.h b/src/mongo/db/commands/shutdown.h index c8686afc99e..e3725619f96 100644 --- a/src/mongo/db/commands/shutdown.h +++ b/src/mongo/db/commands/shutdown.h @@ -35,6 +35,9 @@ #include "mongo/db/jsobj.h" namespace mongo { +Status stepDownForShutdown(OperationContext* opCtx, + const Milliseconds& waitTime, + bool forceShutdown) noexcept; class CmdShutdown : public BasicCommand { public: diff --git a/src/mongo/db/commands/shutdown_d.cpp b/src/mongo/db/commands/shutdown_d.cpp index b6d8154c999..5bf404f682e 100644 --- a/src/mongo/db/commands/shutdown_d.cpp +++ b/src/mongo/db/commands/shutdown_d.cpp @@ -35,8 +35,37 @@ #include "mongo/db/commands/shutdown.h" #include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/s/transaction_coordinator_service.h" +#include "mongo/util/log.h" namespace mongo { + +Status stepDownForShutdown(OperationContext* opCtx, + const Milliseconds& waitTime, + bool forceShutdown) noexcept { + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + // If this is a single node replica set, then we don't have to wait + // for any secondaries. Ignore stepdown. + if (replCoord->getConfig().getNumMembers() != 1) { + try { + replCoord->stepDown(opCtx, false /* force */, waitTime, Seconds(120)); + } catch (const ExceptionFor<ErrorCodes::NotMaster>&) { + // Ignore not master errors. + } catch (const DBException& e) { + if (!forceShutdown) { + return e.toStatus(); + } + // Ignore stepDown errors on force shutdown. + log() << "Error stepping down during force shutdown " << e.toStatus(); + } + + // Even if the ReplicationCoordinator failed to step down, ensure we still shut down the + // TransactionCoordinatorService (see SERVER-45009) + TransactionCoordinatorService::get(opCtx)->onStepDown(); + } + return Status::OK(); +} + namespace { class CmdShutdownMongoD : public CmdShutdown { @@ -61,14 +90,7 @@ public: timeoutSecs = cmdObj["timeoutSecs"].numberLong(); } - try { - repl::ReplicationCoordinator::get(opCtx)->stepDown( - opCtx, force, Seconds(timeoutSecs), Seconds(120)); - } catch (const DBException& e) { - if (e.code() != ErrorCodes::NotMaster) { // ignore not master - throw; - } - } + uassertStatusOK(stepDownForShutdown(opCtx, Seconds(timeoutSecs), force)); // Never returns shutdownHelper(cmdObj); diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp index 4d20f50a16f..4cbafa15ee9 100644 --- a/src/mongo/db/db.cpp +++ b/src/mongo/db/db.cpp @@ -62,6 +62,7 @@ #include "mongo/db/clientcursor.h" #include "mongo/db/commands/feature_compatibility_version.h" #include "mongo/db/commands/feature_compatibility_version_gen.h" +#include "mongo/db/commands/shutdown.h" #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/concurrency/flow_control_ticketholder.h" #include "mongo/db/concurrency/lock_state.h" @@ -928,26 +929,12 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) { opCtx = uniqueOpCtx.get(); } - // If this is a single node replica set, then we don't have to wait - // for any secondaries. Ignore stepdown. - if (repl::ReplicationCoordinator::get(serviceContext)->getConfig().getNumMembers() != 1) { - try { - // For faster tests, we allow a short wait time with setParameter. - auto waitTime = repl::waitForStepDownOnNonCommandShutdown.load() - ? Milliseconds(Seconds(10)) - : Milliseconds(100); - replCoord->stepDown(opCtx, false /* force */, waitTime, Seconds(120)); - } catch (const ExceptionFor<ErrorCodes::NotMaster>&) { - // ignore not master errors - } catch (const DBException& e) { - log() << "Failed to stepDown in non-command initiated shutdown path " - << e.toString(); - } - - // Even if the replCoordinator failed to step down, ensure we still shut down the - // TransactionCoordinatorService (see SERVER-45009) - TransactionCoordinatorService::get(serviceContext)->onStepDown(); - } + // For faster tests, we allow a short wait time with setParameter. + auto waitTime = repl::waitForStepDownOnNonCommandShutdown.load() ? Milliseconds(Seconds(10)) + : Milliseconds(100); + const auto forceShutdown = true; + // stepDown should never return an error during force shutdown. + invariant(stepDownForShutdown(opCtx, waitTime, forceShutdown).isOK()); } WaitForMajorityService::get(serviceContext).shutDown(); |