summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorNick Thomas <nick@gitlab.com>2019-02-28 23:25:37 +0000
committerStan Hu <stanhu@gmail.com>2019-03-04 09:06:41 -0800
commitf0c52df5e540e825be0babd04cc557f3f40cf1c6 (patch)
tree0d39c1df112c4ac71938c6051d65dc4751c9d526 /lib
parent6b507626ed3499c1796706b507c30b8f46c706b7 (diff)
downloadgitlab-ce-f0c52df5e540e825be0babd04cc557f3f40cf1c6.tar.gz
sidekiq: terminate child processes at shutdown
Sidekiq jobs frequently spawn long-lived child processes to do work. In some circumstances, these can be reparented to init when sidekiq is terminated, leading to duplication of work and strange concurrency problems. This commit changes sidekiq so that, if run as a process group leader, it will forward `INT` and `TERM` signals to the whole process group. If the memory killer is active, it will also use the process group when resorting to `kill -9` to shut down. These changes mean that a naive `kill <pid-of-sidekiq>` will now do the right thing, killing any child processes spawned by sidekiq, as long as the process supervisor placed it in its own process group. If sidekiq isn't a process group leader, this new code is skipped.
Diffstat (limited to 'lib')
-rw-r--r--lib/gitlab/sidekiq_middleware/memory_killer.rb17
-rw-r--r--lib/gitlab/sidekiq_signals.rb42
2 files changed, 57 insertions, 2 deletions
diff --git a/lib/gitlab/sidekiq_middleware/memory_killer.rb b/lib/gitlab/sidekiq_middleware/memory_killer.rb
index 47333d257eb..ed2c7ee9a2d 100644
--- a/lib/gitlab/sidekiq_middleware/memory_killer.rb
+++ b/lib/gitlab/sidekiq_middleware/memory_killer.rb
@@ -36,11 +36,13 @@ module Gitlab
# Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish.
# Then, tell Sidekiq to gracefully shut down by giving jobs a few more
# moments to finish, killing and requeuing them if they didn't, and
- # then terminating itself.
+ # then terminating itself. Sidekiq will replicate the TERM to all its
+ # children if it can.
wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down')
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't.
- wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
+ # Kill the whole pgroup, so we can be sure no children are left behind
+ wait_and_signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
end
end
@@ -53,6 +55,17 @@ module Gitlab
output.to_i
end
+ # If this sidekiq process is pgroup leader, signal to the whole pgroup
+ def wait_and_signal_pgroup(time, signal, explanation)
+ return wait_and_signal(time, signal, explanation) unless Process.getpgrp == pid
+
+ Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
+ sleep(time)
+
+ Sidekiq.logger.warn "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
+ Process.kill(signal, "-#{pid}")
+ end
+
def wait_and_signal(time, signal, explanation)
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
sleep(time)
diff --git a/lib/gitlab/sidekiq_signals.rb b/lib/gitlab/sidekiq_signals.rb
new file mode 100644
index 00000000000..b704ee9a0a9
--- /dev/null
+++ b/lib/gitlab/sidekiq_signals.rb
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+module Gitlab
+ # As a process group leader, we can ensure that children of sidekiq are killed
+ # at the same time as sidekiq itself, to stop long-lived children from being
+ # reparented to init and "escaping". To do this, we override the default
+ # handlers used by sidekiq for INT and TERM signals
+ module SidekiqSignals
+ REPLACE_SIGNALS = %w[INT TERM].freeze
+
+ SIDEKIQ_CHANGED_MESSAGE =
+ "Intercepting signal handlers: #{REPLACE_SIGNALS.join(", ")} failed. " \
+ "Sidekiq should have registered them, but appears not to have done so."
+
+ def self.install!(sidekiq_handlers)
+ # This only works if we're process group leader
+ return unless Process.getpgrp == Process.pid
+
+ raise SIDEKIQ_CHANGED_MESSAGE unless
+ REPLACE_SIGNALS == sidekiq_handlers.keys & REPLACE_SIGNALS
+
+ REPLACE_SIGNALS.each do |signal|
+ old_handler = sidekiq_handlers[signal]
+ sidekiq_handlers[signal] = ->(cli) do
+ blindly_signal_pgroup!(signal)
+ old_handler.call(cli)
+ end
+ end
+ end
+
+ # The process group leader can forward INT and TERM signals to the whole
+ # group. However, the forwarded signal is *also* received by the leader,
+ # which could lead to an infinite loop. We can avoid this by temporarily
+ # ignoring the forwarded signal. This may cause us to miss some repeated
+ # signals from outside the process group, but that isn't fatal.
+ def self.blindly_signal_pgroup!(signal)
+ old_trap = trap(signal, 'IGNORE')
+ Process.kill(signal, "-#{Process.getpgrp}")
+ trap(signal, old_trap)
+ end
+ end
+end