diff options
author | Daniel Morilha <daniel.morilha@mongodb.com> | 2022-04-11 13:50:49 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-04-11 19:28:35 +0000 |
commit | 6d4b13ac2e199b5b9a34986e4d317fdfc695b3d4 (patch) | |
tree | 7a34d4b29d69d570d9d776db94d9c1d7339a7492 /src | |
parent | 3ffd0984dd72536d9943caf4db852d12458c72ae (diff) | |
download | mongo-6d4b13ac2e199b5b9a34986e4d317fdfc695b3d4.tar.gz |
SERVER-63198 Prevent shutdown command from hanging
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/base/error_codes.yml | 2 | ||||
-rw-r--r-- | src/mongo/db/commands/shutdown.cpp | 54 | ||||
-rw-r--r-- | src/mongo/db/commands/shutdown.h | 28 | ||||
-rw-r--r-- | src/mongo/db/commands/shutdown_d.cpp | 6 | ||||
-rw-r--r-- | src/mongo/s/commands/cluster_shutdown_cmd.cpp | 2 | ||||
-rw-r--r-- | src/mongo/s/commands/strategy.cpp | 2 |
6 files changed, 60 insertions, 34 deletions
diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml index 23fbd301d72..45e5aff44a0 100644 --- a/src/mongo/base/error_codes.yml +++ b/src/mongo/base/error_codes.yml @@ -484,6 +484,8 @@ error_codes: - {code: 371, name: UserWritesBlocked} + - {code: 372, name: CloseConnectionForShutdownCommand, categories: [CloseConnectionError,InternalOnly]} + # Error codes 4000-8999 are reserved. # Non-sequential error codes for compatibility only) diff --git a/src/mongo/db/commands/shutdown.cpp b/src/mongo/db/commands/shutdown.cpp index 230f268fe7e..2eec6acf288 100644 --- a/src/mongo/db/commands/shutdown.cpp +++ b/src/mongo/db/commands/shutdown.cpp @@ -31,7 +31,10 @@ #include "mongo/logv2/log.h" +#include "mongo/base/error_codes.h" #include "mongo/db/commands/shutdown.h" +#include "mongo/stdx/thread.h" +#include "mongo/util/assert_util.h" #include "mongo/util/fail_point.h" namespace mongo { @@ -41,11 +44,10 @@ namespace shutdown_detail { MONGO_FAIL_POINT_DEFINE(crashOnShutdown); int* volatile illegalAddress; // NOLINT - used for fail point only -void finishShutdown(bool force, long long timeoutSecs, Milliseconds quiesceTime) { - ShutdownTaskArgs shutdownArgs; - shutdownArgs.isUserInitiated = true; - shutdownArgs.quiesceTime = quiesceTime; - +void finishShutdown(OperationContext* opCtx, + bool force, + Milliseconds timeout, + Milliseconds quiesceTime) { crashOnShutdown.execute([&](const BSONObj& data) { if (data["how"].str() == "fault") { ++*illegalAddress; @@ -57,22 +59,38 @@ void finishShutdown(bool force, long long timeoutSecs, Milliseconds quiesceTime) LOGV2(4695400, "Terminating via shutdown command", "force"_attr = force, - "timeoutSecs"_attr = timeoutSecs); + "timeout"_attr = timeout); -#if defined(_WIN32) - // Signal the ServiceMain thread to shutdown. - if (ntservice::shouldStartService()) { - shutdownNoTerminate(shutdownArgs); + // Only allow the first shutdown command to spawn a new thread and execute the shutdown. + // Late arrivers will skip and wait until operations are killed. + static StaticImmortal<AtomicWord<bool>> shutdownAlreadyInProgress{false}; + if (!shutdownAlreadyInProgress->swap(true)) { + stdx::thread([quiesceTime] { + ShutdownTaskArgs shutdownArgs; + shutdownArgs.isUserInitiated = true; + shutdownArgs.quiesceTime = quiesceTime; - // Client expects us to abruptly close the socket as part of exiting - // so this function is not allowed to return. - // The ServiceMain thread will quit for us so just sleep until it does. - while (true) - sleepsecs(60); // Loop forever - return; - } +#if defined(_WIN32) + // Signal the ServiceMain thread to shutdown. + if (ntservice::shouldStartService()) { + shutdownNoTerminate(shutdownArgs); + return; + } #endif - shutdown(EXIT_CLEAN, shutdownArgs); // this never returns + shutdown(EXIT_CLEAN, shutdownArgs); // this never returns + }) + .detach(); + } + + // Client expects the shutdown command to abruptly close the socket as part of exiting. + // This function is not allowed to return until the server interrupts its operation. + // The following requires the shutdown task to kill all the operations after the server + // stops accepting incoming connections. + while (opCtx->checkForInterruptNoAssert().isOK()) + sleepsecs(1); + + iasserted({ErrorCodes::CloseConnectionForShutdownCommand, + "Closing the connection running the shutdown command"}); } } // namespace shutdown_detail diff --git a/src/mongo/db/commands/shutdown.h b/src/mongo/db/commands/shutdown.h index 88422e9e32f..757f14b432c 100644 --- a/src/mongo/db/commands/shutdown.h +++ b/src/mongo/db/commands/shutdown.h @@ -46,10 +46,13 @@ Status stepDownForShutdown(OperationContext* opCtx, namespace shutdown_detail { /** - * Completes the shutdown. 'timeoutSecs' is the total time permitted for shutdown-related timeouts. + * Completes the shutdown. 'timeout' is the total time permitted for shutdown-related timeouts. * 'quiesceTime' is the remaining time allowed for quiescing. */ -void finishShutdown(bool force, long long timeoutSecs, Milliseconds quiesceTime); +void finishShutdown(OperationContext* opCtx, + bool force, + Milliseconds timeout, + Milliseconds quiesceTime); } // namespace shutdown_detail template <typename Derived> @@ -64,18 +67,21 @@ public: void typedRun(OperationContext* opCtx) { auto force = Base::request().getForce(); - auto timeoutSecs = Base::request().getTimeoutSecs(); - auto shutdownStartTime = opCtx->getServiceContext()->getPreciseClockSource()->now(); + Seconds timeout{Base::request().getTimeoutSecs()}; + + auto getCurrentTime = [&] { + return opCtx->getServiceContext()->getPreciseClockSource()->now(); + }; + + auto shutdownStartTime = getCurrentTime(); // Commands derived from CmdShutdown should define their own // `beginShutdown` methods. - Derived::beginShutdown(opCtx, force, timeoutSecs); - Milliseconds quiesceTime = - std::max(Milliseconds::zero(), - Milliseconds(Seconds(timeoutSecs)) - - (opCtx->getServiceContext()->getPreciseClockSource()->now() - - shutdownStartTime)); - shutdown_detail::finishShutdown(force, timeoutSecs, quiesceTime); + Derived::beginShutdown(opCtx, force, timeout); + + auto quiesceTime = + std::max(shutdownStartTime + timeout - getCurrentTime(), Milliseconds{0}); + shutdown_detail::finishShutdown(opCtx, force, timeout, quiesceTime); } private: diff --git a/src/mongo/db/commands/shutdown_d.cpp b/src/mongo/db/commands/shutdown_d.cpp index dad66bfcfa7..d1d06cf6adb 100644 --- a/src/mongo/db/commands/shutdown_d.cpp +++ b/src/mongo/db/commands/shutdown_d.cpp @@ -94,12 +94,12 @@ public: "node is the primary of a replica set, waits up to 'timeoutSecs' for an electable " "node to be caught up before stepping down. If 'force' is false and no electable " "node was able to catch up, does not shut down. If the node is in state SECONDARY " - "after the attempted stepdown, any remaining time in 'timeoutSecs' is used for " + "after the attempted stepdown, any remaining time in 'timeout' is used for " "quiesce mode, where the database continues to allow operations to run, but directs " "clients to route new operations to other replica set members."; } - static void beginShutdown(OperationContext* opCtx, bool force, long long timeoutSecs) { + static void beginShutdown(OperationContext* opCtx, bool force, Milliseconds timeout) { // This code may race with a new index build starting up. We may get 0 active index builds // from the IndexBuildsCoordinator shutdown to proceed, but there is nothing to prevent a // new index build from starting after that check. @@ -113,7 +113,7 @@ public: numIndexBuilds == 0U); } - uassertStatusOK(stepDownForShutdown(opCtx, Seconds(timeoutSecs), force)); + uassertStatusOK(stepDownForShutdown(opCtx, timeout, force)); } } cmdShutdownMongoD; diff --git a/src/mongo/s/commands/cluster_shutdown_cmd.cpp b/src/mongo/s/commands/cluster_shutdown_cmd.cpp index c1fbe6be8a8..0b5633d390f 100644 --- a/src/mongo/s/commands/cluster_shutdown_cmd.cpp +++ b/src/mongo/s/commands/cluster_shutdown_cmd.cpp @@ -44,7 +44,7 @@ public: "run, but directs clients to route new operations to other mongos nodes."; } - static void beginShutdown(OperationContext* opCtx, bool force, long long timeoutSecs) {} + static void beginShutdown(OperationContext* opCtx, bool force, Milliseconds timeout) {} } clusterShutdownCmd; diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp index edefcf546fe..c3ade949332 100644 --- a/src/mongo/s/commands/strategy.cpp +++ b/src/mongo/s/commands/strategy.cpp @@ -1221,7 +1221,7 @@ Future<void> ClientCommand::_execute() { } Future<void> ClientCommand::_handleException(Status status) { - if (_propagateException) { + if (status == ErrorCodes::CloseConnectionForShutdownCommand || _propagateException) { return status; } |