summaryrefslogtreecommitdiff
path: root/lib/gitlab/memory
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/memory')
-rw-r--r--lib/gitlab/memory/reporter.rb4
-rw-r--r--lib/gitlab/memory/watchdog.rb44
-rw-r--r--lib/gitlab/memory/watchdog/configuration.rb2
-rw-r--r--lib/gitlab/memory/watchdog/configurator.rb12
-rw-r--r--lib/gitlab/memory/watchdog/handlers/null_handler.rb24
-rw-r--r--lib/gitlab/memory/watchdog/handlers/puma_handler.rb23
-rw-r--r--lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb63
7 files changed, 124 insertions, 48 deletions
diff --git a/lib/gitlab/memory/reporter.rb b/lib/gitlab/memory/reporter.rb
index 5effafc9f5b..db0fd24983b 100644
--- a/lib/gitlab/memory/reporter.rb
+++ b/lib/gitlab/memory/reporter.rb
@@ -69,14 +69,14 @@ module Gitlab
report_file = file_name(report)
tmp_file_path = File.join(tmp_dir, report_file)
- write_heap_dump_file(report, tmp_file_path)
+ write_compressed_file(report, tmp_file_path)
File.join(@reports_path, report_file).tap do |report_file_path|
FileUtils.mv(tmp_file_path, report_file_path)
end
end
- def write_heap_dump_file(report, path)
+ def write_compressed_file(report, path)
io_r, io_w = IO.pipe
err_r, err_w = IO.pipe
pid = nil
diff --git a/lib/gitlab/memory/watchdog.rb b/lib/gitlab/memory/watchdog.rb
index c94dbed1d46..cc335c00e26 100644
--- a/lib/gitlab/memory/watchdog.rb
+++ b/lib/gitlab/memory/watchdog.rb
@@ -6,47 +6,6 @@ module Gitlab
# into a handler when the Ruby process violates defined limits
# for an extended period of time.
class Watchdog
- # This handler does nothing. It returns `false` to indicate to the
- # caller that the situation has not been dealt with so it will
- # receive calls repeatedly if fragmentation remains high.
- #
- # This is useful for "dress rehearsals" in production since it allows
- # us to observe how frequently the handler is invoked before taking action.
- class NullHandler
- include Singleton
-
- def call
- # NOP
- false
- end
- end
-
- # This handler sends SIGTERM and considers the situation handled.
- class TermProcessHandler
- def initialize(pid = $$)
- @pid = pid
- end
-
- def call
- Process.kill(:TERM, @pid)
- true
- end
- end
-
- # This handler invokes Puma's graceful termination handler, which takes
- # into account a configurable grace period during which a process may
- # remain unresponsive to a SIGTERM.
- class PumaHandler
- def initialize(puma_options = ::Puma.cli_config.options)
- @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
- end
-
- def call
- @worker.term
- true
- end
- end
-
def initialize
@configuration = Configuration.new
@alive = true
@@ -73,6 +32,7 @@ module Gitlab
def stop
stop_working(reason: 'background task stopped')
+ handler.stop if handler.respond_to?(:stop)
end
private
@@ -111,7 +71,7 @@ module Gitlab
def handler
# This allows us to keep the watchdog running but turn it into "friendly mode" where
# all that happens is we collect logs and Prometheus events for fragmentation violations.
- return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)
+ return Handlers::NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)
configuration.handler
end
diff --git a/lib/gitlab/memory/watchdog/configuration.rb b/lib/gitlab/memory/watchdog/configuration.rb
index 5c459220be8..6ab199bf816 100644
--- a/lib/gitlab/memory/watchdog/configuration.rb
+++ b/lib/gitlab/memory/watchdog/configuration.rb
@@ -48,7 +48,7 @@ module Gitlab
end
def handler
- @handler ||= NullHandler.instance
+ @handler ||= Handlers::NullHandler.instance
end
def event_reporter
diff --git a/lib/gitlab/memory/watchdog/configurator.rb b/lib/gitlab/memory/watchdog/configurator.rb
index 04c04cbde02..4a6640ba901 100644
--- a/lib/gitlab/memory/watchdog/configurator.rb
+++ b/lib/gitlab/memory/watchdog/configurator.rb
@@ -12,12 +12,12 @@ module Gitlab
DEFAULT_MAX_HEAP_FRAG = 0.5
DEFAULT_MAX_MEM_GROWTH = 3.0
# grace_time / sleep_interval = max_strikes allowed for Sidekiq process to violate defined limits.
- DEFAULT_SIDEKIQ_GRACE_TIME_S = 300
+ DEFAULT_SIDEKIQ_GRACE_TIME_S = 900
class << self
def configure_for_puma
->(config) do
- config.handler = Gitlab::Memory::Watchdog::PumaHandler.new
+ config.handler = Gitlab::Memory::Watchdog::Handlers::PumaHandler.new
config.sleep_time_seconds = ENV.fetch('GITLAB_MEMWD_SLEEP_TIME_SEC', DEFAULT_SLEEP_INTERVAL_S).to_i
config.monitors(&configure_monitors_for_puma)
end
@@ -25,7 +25,13 @@ module Gitlab
def configure_for_sidekiq
->(config) do
- config.handler = Gitlab::Memory::Watchdog::TermProcessHandler.new
+ # Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit
+ shutdown_timeout_seconds = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i
+
+ config.handler = Gitlab::Memory::Watchdog::Handlers::SidekiqHandler.new(
+ shutdown_timeout_seconds,
+ sidekiq_sleep_time
+ )
config.sleep_time_seconds = sidekiq_sleep_time
config.monitors(&configure_monitors_for_sidekiq)
config.event_reporter = SidekiqEventReporter.new
diff --git a/lib/gitlab/memory/watchdog/handlers/null_handler.rb b/lib/gitlab/memory/watchdog/handlers/null_handler.rb
new file mode 100644
index 00000000000..127001003ce
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/handlers/null_handler.rb
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Memory
+ class Watchdog
+ module Handlers
+ # This handler does nothing. It returns `false` to indicate to the
+ # caller that the situation has not been dealt with so it will
+ # receive calls repeatedly if fragmentation remains high.
+ #
+ # This is useful for "dress rehearsals" in production since it allows
+ # us to observe how frequently the handler is invoked before taking action.
+ class NullHandler
+ include Singleton
+
+ def call
+ # NOP
+ false
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/memory/watchdog/handlers/puma_handler.rb b/lib/gitlab/memory/watchdog/handlers/puma_handler.rb
new file mode 100644
index 00000000000..fffd91733c8
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/handlers/puma_handler.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Memory
+ class Watchdog
+ module Handlers
+ # This handler invokes Puma's graceful termination handler, which takes
+ # into account a configurable grace period during which a process may
+ # remain unresponsive to a SIGTERM.
+ class PumaHandler
+ def initialize(puma_options = ::Puma.cli_config.options)
+ @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
+ end
+
+ def call
+ @worker.term
+ true
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb b/lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb
new file mode 100644
index 00000000000..47ed608c576
--- /dev/null
+++ b/lib/gitlab/memory/watchdog/handlers/sidekiq_handler.rb
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Memory
+ class Watchdog
+ module Handlers
+ class SidekiqHandler
+ def initialize(shutdown_timeout_seconds, sleep_time_seconds)
+ @shutdown_timeout_seconds = shutdown_timeout_seconds
+ @sleep_time_seconds = sleep_time_seconds
+ @alive = true
+ end
+
+ def call
+ # Tell Sidekiq to stop fetching new jobs
+ # We first send the signal and then wait the given time
+ send_signal(:TSTP, $$, 'stop fetching new jobs', @shutdown_timeout_seconds)
+ return true unless @alive
+
+ # Tell Sidekiq to restart itself
+ # To be extra safe, wait `Sidekiq[:timeout] + 2` seconds before sending SIGKILL
+ send_signal(:TERM, $$, 'gracefully shut down', Sidekiq[:timeout] + 2)
+ return true unless @alive
+
+ # Ideally we should never reach this condition
+ # Wait for Sidekiq to shut down gracefully, and kill it if it doesn't
+ # If process is group leader, kill the whole pgroup, so we can be sure no children are left behind
+ send_signal(:KILL, Process.getpgrp == $$ ? 0 : $$, 'hard shut down')
+
+ true
+ end
+
+ def stop
+ @alive = false
+ end
+
+ private
+
+ def send_signal(signal, pid, explanation, wait_time = nil)
+ Sidekiq.logger.warn(
+ pid: pid,
+ worker_id: ::Prometheus::PidProvider.worker_id,
+ memwd_handler_class: self.class.to_s,
+ memwd_signal: signal,
+ memwd_explanation: explanation,
+ memwd_wait_time: wait_time,
+ message: "Sending signal and waiting"
+ )
+
+ ProcessManagement.signal(pid, signal)
+
+ return unless wait_time
+
+ deadline = Gitlab::Metrics::System.monotonic_time + wait_time
+
+ # Sleep until timeout reached
+ sleep(@sleep_time_seconds) while @alive && Gitlab::Metrics::System.monotonic_time < deadline
+ end
+ end
+ end
+ end
+ end
+end