diff options
Diffstat (limited to 'lib/gitlab/memory')
-rw-r--r-- | lib/gitlab/memory/reporter.rb | 4 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog.rb | 44 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/configuration.rb | 2 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/configurator.rb | 12 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/handlers/null_handler.rb | 24 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/handlers/puma_handler.rb | 23 | ||||
-rw-r--r-- | lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb | 63 |
7 files changed, 124 insertions, 48 deletions
diff --git a/lib/gitlab/memory/reporter.rb b/lib/gitlab/memory/reporter.rb index 5effafc9f5b..db0fd24983b 100644 --- a/lib/gitlab/memory/reporter.rb +++ b/lib/gitlab/memory/reporter.rb @@ -69,14 +69,14 @@ module Gitlab report_file = file_name(report) tmp_file_path = File.join(tmp_dir, report_file) - write_heap_dump_file(report, tmp_file_path) + write_compressed_file(report, tmp_file_path) File.join(@reports_path, report_file).tap do |report_file_path| FileUtils.mv(tmp_file_path, report_file_path) end end - def write_heap_dump_file(report, path) + def write_compressed_file(report, path) io_r, io_w = IO.pipe err_r, err_w = IO.pipe pid = nil diff --git a/lib/gitlab/memory/watchdog.rb b/lib/gitlab/memory/watchdog.rb index c94dbed1d46..cc335c00e26 100644 --- a/lib/gitlab/memory/watchdog.rb +++ b/lib/gitlab/memory/watchdog.rb @@ -6,47 +6,6 @@ module Gitlab # into a handler when the Ruby process violates defined limits # for an extended period of time. class Watchdog - # This handler does nothing. It returns `false` to indicate to the - # caller that the situation has not been dealt with so it will - # receive calls repeatedly if fragmentation remains high. - # - # This is useful for "dress rehearsals" in production since it allows - # us to observe how frequently the handler is invoked before taking action. - class NullHandler - include Singleton - - def call - # NOP - false - end - end - - # This handler sends SIGTERM and considers the situation handled. - class TermProcessHandler - def initialize(pid = $$) - @pid = pid - end - - def call - Process.kill(:TERM, @pid) - true - end - end - - # This handler invokes Puma's graceful termination handler, which takes - # into account a configurable grace period during which a process may - # remain unresponsive to a SIGTERM. - class PumaHandler - def initialize(puma_options = ::Puma.cli_config.options) - @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options) - end - - def call - @worker.term - true - end - end - def initialize @configuration = Configuration.new @alive = true @@ -73,6 +32,7 @@ module Gitlab def stop stop_working(reason: 'background task stopped') + handler.stop if handler.respond_to?(:stop) end private @@ -111,7 +71,7 @@ module Gitlab def handler # This allows us to keep the watchdog running but turn it into "friendly mode" where # all that happens is we collect logs and Prometheus events for fragmentation violations. - return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops) + return Handlers::NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops) configuration.handler end diff --git a/lib/gitlab/memory/watchdog/configuration.rb b/lib/gitlab/memory/watchdog/configuration.rb index 5c459220be8..6ab199bf816 100644 --- a/lib/gitlab/memory/watchdog/configuration.rb +++ b/lib/gitlab/memory/watchdog/configuration.rb @@ -48,7 +48,7 @@ module Gitlab end def handler - @handler ||= NullHandler.instance + @handler ||= Handlers::NullHandler.instance end def event_reporter diff --git a/lib/gitlab/memory/watchdog/configurator.rb b/lib/gitlab/memory/watchdog/configurator.rb index 04c04cbde02..4a6640ba901 100644 --- a/lib/gitlab/memory/watchdog/configurator.rb +++ b/lib/gitlab/memory/watchdog/configurator.rb @@ -12,12 +12,12 @@ module Gitlab DEFAULT_MAX_HEAP_FRAG = 0.5 DEFAULT_MAX_MEM_GROWTH = 3.0 # grace_time / sleep_interval = max_strikes allowed for Sidekiq process to violate defined limits. - DEFAULT_SIDEKIQ_GRACE_TIME_S = 300 + DEFAULT_SIDEKIQ_GRACE_TIME_S = 900 class << self def configure_for_puma ->(config) do - config.handler = Gitlab::Memory::Watchdog::PumaHandler.new + config.handler = Gitlab::Memory::Watchdog::Handlers::PumaHandler.new config.sleep_time_seconds = ENV.fetch('GITLAB_MEMWD_SLEEP_TIME_SEC', DEFAULT_SLEEP_INTERVAL_S).to_i config.monitors(&configure_monitors_for_puma) end @@ -25,7 +25,13 @@ module Gitlab def configure_for_sidekiq ->(config) do - config.handler = Gitlab::Memory::Watchdog::TermProcessHandler.new + # Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit + shutdown_timeout_seconds = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i + + config.handler = Gitlab::Memory::Watchdog::Handlers::SidekiqHandler.new( + shutdown_timeout_seconds, + sidekiq_sleep_time + ) config.sleep_time_seconds = sidekiq_sleep_time config.monitors(&configure_monitors_for_sidekiq) config.event_reporter = SidekiqEventReporter.new diff --git a/lib/gitlab/memory/watchdog/handlers/null_handler.rb b/lib/gitlab/memory/watchdog/handlers/null_handler.rb new file mode 100644 index 00000000000..127001003ce --- /dev/null +++ b/lib/gitlab/memory/watchdog/handlers/null_handler.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +module Gitlab + module Memory + class Watchdog + module Handlers + # This handler does nothing. It returns `false` to indicate to the + # caller that the situation has not been dealt with so it will + # receive calls repeatedly if fragmentation remains high. + # + # This is useful for "dress rehearsals" in production since it allows + # us to observe how frequently the handler is invoked before taking action. + class NullHandler + include Singleton + + def call + # NOP + false + end + end + end + end + end +end diff --git a/lib/gitlab/memory/watchdog/handlers/puma_handler.rb b/lib/gitlab/memory/watchdog/handlers/puma_handler.rb new file mode 100644 index 00000000000..fffd91733c8 --- /dev/null +++ b/lib/gitlab/memory/watchdog/handlers/puma_handler.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +module Gitlab + module Memory + class Watchdog + module Handlers + # This handler invokes Puma's graceful termination handler, which takes + # into account a configurable grace period during which a process may + # remain unresponsive to a SIGTERM. + class PumaHandler + def initialize(puma_options = ::Puma.cli_config.options) + @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options) + end + + def call + @worker.term + true + end + end + end + end + end +end diff --git a/lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb b/lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb new file mode 100644 index 00000000000..47ed608c576 --- /dev/null +++ b/lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true + +module Gitlab + module Memory + class Watchdog + module Handlers + class SidekiqHandler + def initialize(shutdown_timeout_seconds, sleep_time_seconds) + @shutdown_timeout_seconds = shutdown_timeout_seconds + @sleep_time_seconds = sleep_time_seconds + @alive = true + end + + def call + # Tell Sidekiq to stop fetching new jobs + # We first SIGNAL and then wait given time + send_signal(:TSTP, $$, 'stop fetching new jobs', @shutdown_timeout_seconds) + return true unless @alive + + # Tell sidekiq to restart itself + # Keep extra safe to wait `Sidekiq[:timeout] + 2` seconds before SIGKILL + send_signal(:TERM, $$, 'gracefully shut down', Sidekiq[:timeout] + 2) + return true unless @alive + + # Ideally we should never reach this condition + # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't + # If process is group leader, kill the whole pgroup, so we can be sure no children are left behind + send_signal(:KILL, Process.getpgrp == $$ ? 0 : $$, 'hard shut down') + + true + end + + def stop + @alive = false + end + + private + + def send_signal(signal, pid, explanation, wait_time = nil) + Sidekiq.logger.warn( + pid: pid, + worker_id: ::Prometheus::PidProvider.worker_id, + memwd_handler_class: self.class.to_s, + memwd_signal: signal, + memwd_explanation: explanation, + memwd_wait_time: wait_time, + message: "Sending signal and waiting" + ) + + ProcessManagement.signal(pid, signal) + + return unless wait_time + + deadline = Gitlab::Metrics::System.monotonic_time + wait_time + + # Sleep until timeout reached + sleep(@sleep_time_seconds) while @alive && Gitlab::Metrics::System.monotonic_time < deadline + end + end + end + end + end +end |