summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLingzhi Deng <lingzhi.deng@mongodb.com>2020-04-14 22:46:07 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-05-06 15:17:46 +0000
commitff38a192f588a01718f0ca259c147707819bd4cb (patch)
tree452656b2b657e5e9671b98090fb8d73db4bdd03c
parentcd7deb59fb70d7c93f7b026f07e40e576ebfcecd (diff)
downloadmongo-ff38a192f588a01718f0ca259c147707819bd4cb.tar.gz
SERVER-47190: Shutdown command with force:true should ignore all stepdown errors
(cherry picked from commit 1563ac389bdcb08aadeb31705b2cc123742b739b) (cherry picked from commit f60e93dd7ff8c7c9876d54e9c3c66b141eb32f03)
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/force_shutdown_primary.js69
-rw-r--r--src/mongo/db/commands/shutdown.h3
-rw-r--r--src/mongo/db/commands/shutdown_d.cpp38
-rw-r--r--src/mongo/db/db.cpp27
5 files changed, 111 insertions, 28 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index e47013f3459..3bbee5b299f 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -55,6 +55,8 @@ replica_sets_multiversion:
test_file: jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
- ticket: SERVER-45143
test_file: jstests/replsets/reconfig_uses_default_protocolVersion.js
+- ticket: SERVER-47190
+ test_file: jstests/replsets/force_shutdown_primary.js
sharding_multiversion:
- ticket: SERVER-38691
diff --git a/jstests/replsets/force_shutdown_primary.js b/jstests/replsets/force_shutdown_primary.js
new file mode 100644
index 00000000000..9e578c8efd2
--- /dev/null
+++ b/jstests/replsets/force_shutdown_primary.js
@@ -0,0 +1,69 @@
+/**
+ * Test that the shutdown command called on a primary node with {force: true} succeeds even if
+ * stepDown fails.
+ *
+ * 1. Initiate a 3-node replica set.
+ * 2. Block replication to secondaries.
+ * 3. Write to primary.
+ * 4. Try to shut down primary with {force: true}.
+ * 5. Kill the shutdown command while the shutdown command is waiting to stepDown.
+ * 6. Test that the primary node still shuts down.
+ *
+ * @tags: [requires_fcv_44]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/write_concern_util.js"); // for stopReplicationOnSecondaries.
+const replTest = new ReplSetTest({nodes: 3});
+replTest.startSet();
+replTest.initiateWithHighElectionTimeout();
+
+const primary = replTest.getPrimary();
+const testDB = primary.getDB("test");
+assert.commandWorked(testDB.foo.insert({x: 1}, {writeConcern: {w: 3}}));
+
+jsTestLog("Blocking replication to secondaries.");
+stopReplicationOnSecondaries(replTest);
+
+jsTestLog("Executing write to primary.");
+assert.commandWorked(testDB.foo.insert({x: 2}));
+
+jsTestLog("Shutting down primary in a parallel shell");
+const shutdownShell = startParallelShell(function() {
+ db.adminCommand({shutdown: 1, timeoutSecs: 60, force: true});
+}, primary.port);
+
+let shutdownOpID = -1;
+let res = {};
+jsTestLog("Looking for shutdown in currentOp() output");
+assert.soon(function() {
+ res = primary.getDB('admin').currentOp(true);
+ for (const index in res.inprog) {
+ const entry = res.inprog[index];
+ if (entry["command"] && entry["command"]["shutdown"] === 1) {
+ shutdownOpID = entry.opid;
+ return true;
+ }
+ }
+ return false;
+}, "No shutdown command found: " + tojson(res));
+
+jsTestLog("Killing shutdown command on primary.");
+primary.getDB('admin').killOp(shutdownOpID);
+
+jsTestLog("Verifying primary shut down and cannot be connected to.");
+const exitCode = shutdownShell({checkExitSuccess: false});
+assert.neq(0, exitCode, "expected shutdown to close the shell's connection");
+assert.soonNoExcept(function() {
+ // The parallel shell exits while shutdown is in progress, and if this happens early enough,
+ // the primary can still accept connections despite successfully starting to shutdown.
+ // So, retry connecting until connections cannot be established and an error is thrown.
+ assert.throws(function() {
+ new Mongo(primary.host);
+ });
+ return true;
+}, "expected primary node to shut down and not be connectable");
+
+replTest.stopSet();
+})();
diff --git a/src/mongo/db/commands/shutdown.h b/src/mongo/db/commands/shutdown.h
index c8686afc99e..e3725619f96 100644
--- a/src/mongo/db/commands/shutdown.h
+++ b/src/mongo/db/commands/shutdown.h
@@ -35,6 +35,9 @@
#include "mongo/db/jsobj.h"
namespace mongo {
+Status stepDownForShutdown(OperationContext* opCtx,
+ const Milliseconds& waitTime,
+ bool forceShutdown) noexcept;
class CmdShutdown : public BasicCommand {
public:
diff --git a/src/mongo/db/commands/shutdown_d.cpp b/src/mongo/db/commands/shutdown_d.cpp
index b6d8154c999..5bf404f682e 100644
--- a/src/mongo/db/commands/shutdown_d.cpp
+++ b/src/mongo/db/commands/shutdown_d.cpp
@@ -35,8 +35,37 @@
#include "mongo/db/commands/shutdown.h"
#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/db/s/transaction_coordinator_service.h"
+#include "mongo/util/log.h"
namespace mongo {
+
+Status stepDownForShutdown(OperationContext* opCtx,
+ const Milliseconds& waitTime,
+ bool forceShutdown) noexcept {
+ auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+ // If this is a single node replica set, then we don't have to wait
+ // for any secondaries. Ignore stepdown.
+ if (replCoord->getConfig().getNumMembers() != 1) {
+ try {
+ replCoord->stepDown(opCtx, false /* force */, waitTime, Seconds(120));
+ } catch (const ExceptionFor<ErrorCodes::NotMaster>&) {
+ // Ignore not master errors.
+ } catch (const DBException& e) {
+ if (!forceShutdown) {
+ return e.toStatus();
+ }
+ // Ignore stepDown errors on force shutdown.
+ log() << "Error stepping down during force shutdown " << e.toStatus();
+ }
+
+ // Even if the ReplicationCoordinator failed to step down, ensure we still shut down the
+ // TransactionCoordinatorService (see SERVER-45009)
+ TransactionCoordinatorService::get(opCtx)->onStepDown();
+ }
+ return Status::OK();
+}
+
namespace {
class CmdShutdownMongoD : public CmdShutdown {
@@ -61,14 +90,7 @@ public:
timeoutSecs = cmdObj["timeoutSecs"].numberLong();
}
- try {
- repl::ReplicationCoordinator::get(opCtx)->stepDown(
- opCtx, force, Seconds(timeoutSecs), Seconds(120));
- } catch (const DBException& e) {
- if (e.code() != ErrorCodes::NotMaster) { // ignore not master
- throw;
- }
- }
+ uassertStatusOK(stepDownForShutdown(opCtx, Seconds(timeoutSecs), force));
// Never returns
shutdownHelper(cmdObj);
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index 4d20f50a16f..4cbafa15ee9 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -62,6 +62,7 @@
#include "mongo/db/clientcursor.h"
#include "mongo/db/commands/feature_compatibility_version.h"
#include "mongo/db/commands/feature_compatibility_version_gen.h"
+#include "mongo/db/commands/shutdown.h"
#include "mongo/db/concurrency/d_concurrency.h"
#include "mongo/db/concurrency/flow_control_ticketholder.h"
#include "mongo/db/concurrency/lock_state.h"
@@ -928,26 +929,12 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) {
opCtx = uniqueOpCtx.get();
}
- // If this is a single node replica set, then we don't have to wait
- // for any secondaries. Ignore stepdown.
- if (repl::ReplicationCoordinator::get(serviceContext)->getConfig().getNumMembers() != 1) {
- try {
- // For faster tests, we allow a short wait time with setParameter.
- auto waitTime = repl::waitForStepDownOnNonCommandShutdown.load()
- ? Milliseconds(Seconds(10))
- : Milliseconds(100);
- replCoord->stepDown(opCtx, false /* force */, waitTime, Seconds(120));
- } catch (const ExceptionFor<ErrorCodes::NotMaster>&) {
- // ignore not master errors
- } catch (const DBException& e) {
- log() << "Failed to stepDown in non-command initiated shutdown path "
- << e.toString();
- }
-
- // Even if the replCoordinator failed to step down, ensure we still shut down the
- // TransactionCoordinatorService (see SERVER-45009)
- TransactionCoordinatorService::get(serviceContext)->onStepDown();
- }
+ // For faster tests, we allow a short wait time with setParameter.
+ auto waitTime = repl::waitForStepDownOnNonCommandShutdown.load() ? Milliseconds(Seconds(10))
+ : Milliseconds(100);
+ const auto forceShutdown = true;
+ // stepDown should never return an error during force shutdown.
+ invariant(stepDownForShutdown(opCtx, waitTime, forceShutdown).isOK());
}
WaitForMajorityService::get(serviceContext).shutDown();