summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Daniel Morilha <daniel.morilha@mongodb.com> 2022-04-11 13:50:49 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-04-11 19:28:35 +0000
commit: 6d4b13ac2e199b5b9a34986e4d317fdfc695b3d4 (patch)
tree: 7a34d4b29d69d570d9d776db94d9c1d7339a7492 /src
parent: 3ffd0984dd72536d9943caf4db852d12458c72ae (diff)
download: mongo-6d4b13ac2e199b5b9a34986e4d317fdfc695b3d4.tar.gz
SERVER-63198 Prevent shutdown command from hanging
Diffstat (limited to 'src')
-rw-r--r-- src/mongo/base/error_codes.yml | 2
-rw-r--r-- src/mongo/db/commands/shutdown.cpp | 54
-rw-r--r-- src/mongo/db/commands/shutdown.h | 28
-rw-r--r-- src/mongo/db/commands/shutdown_d.cpp | 6
-rw-r--r-- src/mongo/s/commands/cluster_shutdown_cmd.cpp | 2
-rw-r--r-- src/mongo/s/commands/strategy.cpp | 2
6 files changed, 60 insertions, 34 deletions
diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml
index 23fbd301d72..45e5aff44a0 100644
--- a/src/mongo/base/error_codes.yml
+++ b/src/mongo/base/error_codes.yml
@@ -484,6 +484,8 @@ error_codes:
- {code: 371, name: UserWritesBlocked}
+ - {code: 372, name: CloseConnectionForShutdownCommand, categories: [CloseConnectionError,InternalOnly]}
+
# Error codes 4000-8999 are reserved.
# Non-sequential error codes for compatibility only)
diff --git a/src/mongo/db/commands/shutdown.cpp b/src/mongo/db/commands/shutdown.cpp
index 230f268fe7e..2eec6acf288 100644
--- a/src/mongo/db/commands/shutdown.cpp
+++ b/src/mongo/db/commands/shutdown.cpp
@@ -31,7 +31,10 @@
#include "mongo/logv2/log.h"
+#include "mongo/base/error_codes.h"
#include "mongo/db/commands/shutdown.h"
+#include "mongo/stdx/thread.h"
+#include "mongo/util/assert_util.h"
#include "mongo/util/fail_point.h"
namespace mongo {
@@ -41,11 +44,10 @@ namespace shutdown_detail {
MONGO_FAIL_POINT_DEFINE(crashOnShutdown);
int* volatile illegalAddress; // NOLINT - used for fail point only
-void finishShutdown(bool force, long long timeoutSecs, Milliseconds quiesceTime) {
- ShutdownTaskArgs shutdownArgs;
- shutdownArgs.isUserInitiated = true;
- shutdownArgs.quiesceTime = quiesceTime;
-
+void finishShutdown(OperationContext* opCtx,
+ bool force,
+ Milliseconds timeout,
+ Milliseconds quiesceTime) {
crashOnShutdown.execute([&](const BSONObj& data) {
if (data["how"].str() == "fault") {
++*illegalAddress;
@@ -57,22 +59,38 @@ void finishShutdown(bool force, long long timeoutSecs, Milliseconds quiesceTime)
LOGV2(4695400,
"Terminating via shutdown command",
"force"_attr = force,
- "timeoutSecs"_attr = timeoutSecs);
+ "timeout"_attr = timeout);
-#if defined(_WIN32)
- // Signal the ServiceMain thread to shutdown.
- if (ntservice::shouldStartService()) {
- shutdownNoTerminate(shutdownArgs);
+ // Only allow the first shutdown command to spawn a new thread and execute the shutdown.
+ // Late arrivers will skip and wait until operations are killed.
+ static StaticImmortal<AtomicWord<bool>> shutdownAlreadyInProgress{false};
+ if (!shutdownAlreadyInProgress->swap(true)) {
+ stdx::thread([quiesceTime] {
+ ShutdownTaskArgs shutdownArgs;
+ shutdownArgs.isUserInitiated = true;
+ shutdownArgs.quiesceTime = quiesceTime;
- // Client expects us to abruptly close the socket as part of exiting
- // so this function is not allowed to return.
- // The ServiceMain thread will quit for us so just sleep until it does.
- while (true)
- sleepsecs(60); // Loop forever
- return;
- }
+#if defined(_WIN32)
+ // Signal the ServiceMain thread to shutdown.
+ if (ntservice::shouldStartService()) {
+ shutdownNoTerminate(shutdownArgs);
+ return;
+ }
#endif
- shutdown(EXIT_CLEAN, shutdownArgs); // this never returns
+ shutdown(EXIT_CLEAN, shutdownArgs); // this never returns
+ })
+ .detach();
+ }
+
+ // Client expects the shutdown command to abruptly close the socket as part of exiting.
+ // This function is not allowed to return until the server interrupts its operation.
+ // The following requires the shutdown task to kill all the operations after the server
+ // stops accepting incoming connections.
+ while (opCtx->checkForInterruptNoAssert().isOK())
+ sleepsecs(1);
+
+ iasserted({ErrorCodes::CloseConnectionForShutdownCommand,
+ "Closing the connection running the shutdown command"});
}
} // namespace shutdown_detail
diff --git a/src/mongo/db/commands/shutdown.h b/src/mongo/db/commands/shutdown.h
index 88422e9e32f..757f14b432c 100644
--- a/src/mongo/db/commands/shutdown.h
+++ b/src/mongo/db/commands/shutdown.h
@@ -46,10 +46,13 @@ Status stepDownForShutdown(OperationContext* opCtx,
namespace shutdown_detail {
/**
- * Completes the shutdown. 'timeoutSecs' is the total time permitted for shutdown-related timeouts.
+ * Completes the shutdown. 'timeout' is the total time permitted for shutdown-related timeouts.
* 'quiesceTime' is the remaining time allowed for quiescing.
*/
-void finishShutdown(bool force, long long timeoutSecs, Milliseconds quiesceTime);
+void finishShutdown(OperationContext* opCtx,
+ bool force,
+ Milliseconds timeout,
+ Milliseconds quiesceTime);
} // namespace shutdown_detail
template <typename Derived>
@@ -64,18 +67,21 @@ public:
void typedRun(OperationContext* opCtx) {
auto force = Base::request().getForce();
- auto timeoutSecs = Base::request().getTimeoutSecs();
- auto shutdownStartTime = opCtx->getServiceContext()->getPreciseClockSource()->now();
+ Seconds timeout{Base::request().getTimeoutSecs()};
+
+ auto getCurrentTime = [&] {
+ return opCtx->getServiceContext()->getPreciseClockSource()->now();
+ };
+
+ auto shutdownStartTime = getCurrentTime();
// Commands derived from CmdShutdown should define their own
// `beginShutdown` methods.
- Derived::beginShutdown(opCtx, force, timeoutSecs);
- Milliseconds quiesceTime =
- std::max(Milliseconds::zero(),
- Milliseconds(Seconds(timeoutSecs)) -
- (opCtx->getServiceContext()->getPreciseClockSource()->now() -
- shutdownStartTime));
- shutdown_detail::finishShutdown(force, timeoutSecs, quiesceTime);
+ Derived::beginShutdown(opCtx, force, timeout);
+
+ auto quiesceTime =
+ std::max(shutdownStartTime + timeout - getCurrentTime(), Milliseconds{0});
+ shutdown_detail::finishShutdown(opCtx, force, timeout, quiesceTime);
}
private:
diff --git a/src/mongo/db/commands/shutdown_d.cpp b/src/mongo/db/commands/shutdown_d.cpp
index dad66bfcfa7..d1d06cf6adb 100644
--- a/src/mongo/db/commands/shutdown_d.cpp
+++ b/src/mongo/db/commands/shutdown_d.cpp
@@ -94,12 +94,12 @@ public:
"node is the primary of a replica set, waits up to 'timeoutSecs' for an electable "
"node to be caught up before stepping down. If 'force' is false and no electable "
"node was able to catch up, does not shut down. If the node is in state SECONDARY "
- "after the attempted stepdown, any remaining time in 'timeoutSecs' is used for "
+ "after the attempted stepdown, any remaining time in 'timeout' is used for "
"quiesce mode, where the database continues to allow operations to run, but directs "
"clients to route new operations to other replica set members.";
}
- static void beginShutdown(OperationContext* opCtx, bool force, long long timeoutSecs) {
+ static void beginShutdown(OperationContext* opCtx, bool force, Milliseconds timeout) {
// This code may race with a new index build starting up. We may get 0 active index builds
// from the IndexBuildsCoordinator shutdown to proceed, but there is nothing to prevent a
// new index build from starting after that check.
@@ -113,7 +113,7 @@ public:
numIndexBuilds == 0U);
}
- uassertStatusOK(stepDownForShutdown(opCtx, Seconds(timeoutSecs), force));
+ uassertStatusOK(stepDownForShutdown(opCtx, timeout, force));
}
} cmdShutdownMongoD;
diff --git a/src/mongo/s/commands/cluster_shutdown_cmd.cpp b/src/mongo/s/commands/cluster_shutdown_cmd.cpp
index c1fbe6be8a8..0b5633d390f 100644
--- a/src/mongo/s/commands/cluster_shutdown_cmd.cpp
+++ b/src/mongo/s/commands/cluster_shutdown_cmd.cpp
@@ -44,7 +44,7 @@ public:
"run, but directs clients to route new operations to other mongos nodes.";
}
- static void beginShutdown(OperationContext* opCtx, bool force, long long timeoutSecs) {}
+ static void beginShutdown(OperationContext* opCtx, bool force, Milliseconds timeout) {}
} clusterShutdownCmd;
diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp
index edefcf546fe..c3ade949332 100644
--- a/src/mongo/s/commands/strategy.cpp
+++ b/src/mongo/s/commands/strategy.cpp
@@ -1221,7 +1221,7 @@ Future<void> ClientCommand::_execute() {
}
Future<void> ClientCommand::_handleException(Status status) {
- if (_propagateException) {
+ if (status == ErrorCodes::CloseConnectionForShutdownCommand || _propagateException) {
return status;
}