summary refs log tree commit diff
path: root/lib/gitlab/memory/watchdog.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/memory/watchdog.rb')
-rw-r--r-- lib/gitlab/memory/watchdog.rb 181
1 files changed, 37 insertions, 144 deletions
diff --git a/lib/gitlab/memory/watchdog.rb b/lib/gitlab/memory/watchdog.rb
index 38231fa933b..7007fdfe386 100644
--- a/lib/gitlab/memory/watchdog.rb
+++ b/lib/gitlab/memory/watchdog.rb
@@ -2,25 +2,10 @@
module Gitlab
module Memory
- # A background thread that observes Ruby heap fragmentation and calls
- # into a handler when the Ruby heap has been fragmented for an extended
- # period of time.
- #
- # See Gitlab::Metrics::Memory for how heap fragmentation is defined.
- #
- # To decide whether a given fragmentation level is being exceeded,
- # the watchdog regularly polls the GC. Whenever a violation occurs
- # a strike is issued. If the maximum number of strikes are reached,
- # a handler is invoked to deal with the situation.
- #
- # The duration for which a process may be above a given fragmentation
- # threshold is computed as `max_strikes * sleep_time_seconds`.
+ # A background thread that monitors Ruby memory and calls
+ # into a handler when the Ruby process violates defined limits
+ # for an extended period of time.
class Watchdog
- DEFAULT_SLEEP_TIME_SECONDS = 60 * 5
- DEFAULT_MAX_HEAP_FRAG = 0.5
- DEFAULT_MAX_MEM_GROWTH = 3.0
- DEFAULT_MAX_STRIKES = 5
-
# This handler does nothing. It returns `false` to indicate to the
# caller that the situation has not been dealt with so it will
# receive calls repeatedly if fragmentation remains high.
@@ -62,73 +47,27 @@ module Gitlab
end
end
- # max_heap_fragmentation:
- # The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
- # max_mem_growth:
- # A multiplier for how much excess private memory a worker can map compared to a reference process
- # (itself or the primary in a pre-fork server.)
- # max_strikes:
- # How many times the process is allowed to be above max_heap_fragmentation before
- # a handler is invoked.
- # sleep_time_seconds:
- # Used to control the frequency with which the watchdog will wake up and poll the GC.
- def initialize(
- handler: NullHandler.instance,
- logger: Logger.new($stdout),
- max_heap_fragmentation: ENV['GITLAB_MEMWD_MAX_HEAP_FRAG']&.to_f || DEFAULT_MAX_HEAP_FRAG,
- max_mem_growth: ENV['GITLAB_MEMWD_MAX_MEM_GROWTH']&.to_f || DEFAULT_MAX_MEM_GROWTH,
- max_strikes: ENV['GITLAB_MEMWD_MAX_STRIKES']&.to_i || DEFAULT_MAX_STRIKES,
- sleep_time_seconds: ENV['GITLAB_MEMWD_SLEEP_TIME_SEC']&.to_i || DEFAULT_SLEEP_TIME_SECONDS,
- **options)
- super(**options)
-
- @handler = handler
- @logger = logger
- @sleep_time_seconds = sleep_time_seconds
- @max_strikes = max_strikes
- @stats = {
- heap_frag: {
- max: max_heap_fragmentation,
- strikes: 0
- },
- mem_growth: {
- max: max_mem_growth,
- strikes: 0
- }
- }
-
+ def initialize
+ @configuration = Configuration.new
@alive = true
- init_prometheus_metrics(max_heap_fragmentation)
- end
-
- attr_reader :max_strikes, :sleep_time_seconds
-
- def max_heap_fragmentation
- @stats[:heap_frag][:max]
- end
-
- def max_mem_growth
- @stats[:mem_growth][:max]
+ init_prometheus_metrics
end
- def strikes(stat)
- @stats[stat][:strikes]
+ def configure
+ yield @configuration
end
def call
- @logger.info(log_labels.merge(message: 'started'))
+ logger.info(log_labels.merge(message: 'started'))
while @alive
- sleep(@sleep_time_seconds)
-
- next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
+ sleep(sleep_time_seconds)
- monitor_heap_fragmentation
- monitor_memory_growth
+ monitor if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
end
- @logger.info(log_labels.merge(message: 'stopped'))
+ logger.info(log_labels.merge(message: 'stopped'))
end
def stop
@@ -137,71 +76,24 @@ module Gitlab
private
- def monitor_memory_condition(stat_key)
- return unless @alive
-
- stat = @stats[stat_key]
-
- ok, labels = yield(stat)
+ def monitor
+ @configuration.monitors.call_each do |result|
+ break unless @alive
- if ok
- stat[:strikes] = 0
- else
- stat[:strikes] += 1
- @counter_violations.increment(reason: stat_key.to_s)
- end
+ next unless result.threshold_violated?
- if stat[:strikes] > @max_strikes
- @alive = !memory_limit_exceeded_callback(stat_key, labels)
- stat[:strikes] = 0
- end
- end
+ @counter_violations.increment(reason: result.monitor_name)
- def monitor_heap_fragmentation
- monitor_memory_condition(:heap_frag) do |stat|
- heap_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
- [
- heap_fragmentation <= stat[:max],
- {
- message: 'heap fragmentation limit exceeded',
- memwd_cur_heap_frag: heap_fragmentation,
- memwd_max_heap_frag: stat[:max]
- }
- ]
- end
- end
+ next unless result.strikes_exceeded?
- def monitor_memory_growth
- monitor_memory_condition(:mem_growth) do |stat|
- worker_uss = Gitlab::Metrics::System.memory_usage_uss_pss[:uss]
- reference_uss = reference_mem[:uss]
- memory_limit = stat[:max] * reference_uss
- [
- worker_uss <= memory_limit,
- {
- message: 'memory limit exceeded',
- memwd_uss_bytes: worker_uss,
- memwd_ref_uss_bytes: reference_uss,
- memwd_max_uss_bytes: memory_limit
- }
- ]
+ @alive = !memory_limit_exceeded_callback(result.monitor_name, result.payload)
end
end
- # On pre-fork systems this would be the primary process memory from which workers fork.
- # Otherwise it is the current process' memory.
- #
- # We initialize this lazily because in the initializer the application may not have
- # finished booting yet, which would yield an incorrect baseline.
- def reference_mem
- @reference_mem ||= Gitlab::Metrics::System.memory_usage_uss_pss(pid: Gitlab::Cluster::PRIMARY_PID)
- end
-
- def memory_limit_exceeded_callback(stat_key, handler_labels)
- all_labels = log_labels.merge(handler_labels)
- .merge(memwd_cur_strikes: strikes(stat_key))
- @logger.warn(all_labels)
- @counter_violations_handled.increment(reason: stat_key.to_s)
+ def memory_limit_exceeded_callback(monitor_name, monitor_payload)
+ all_labels = log_labels.merge(monitor_payload)
+ logger.warn(all_labels)
+ @counter_violations_handled.increment(reason: monitor_name)
handler.call
end
@@ -211,7 +103,15 @@ module Gitlab
# all that happens is we collect logs and Prometheus events for fragmentation violations.
return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)
- @handler
+ @configuration.handler
+ end
+
+ def logger
+ @configuration.logger
+ end
+
+ def sleep_time_seconds
+ @configuration.sleep_time_seconds
end
def log_labels
@@ -219,27 +119,20 @@ module Gitlab
pid: $$,
worker_id: worker_id,
memwd_handler_class: handler.class.name,
- memwd_sleep_time_s: @sleep_time_seconds,
- memwd_max_strikes: @max_strikes,
+ memwd_sleep_time_s: sleep_time_seconds,
memwd_rss_bytes: process_rss_bytes
}
end
- def worker_id
- ::Prometheus::PidProvider.worker_id
- end
-
def process_rss_bytes
Gitlab::Metrics::System.memory_usage_rss
end
- def init_prometheus_metrics(max_heap_fragmentation)
- @heap_frag_limit = Gitlab::Metrics.gauge(
- :gitlab_memwd_heap_frag_limit,
- 'The configured limit for how fragmented the Ruby heap is allowed to be'
- )
- @heap_frag_limit.set({}, max_heap_fragmentation)
+ def worker_id
+ ::Prometheus::PidProvider.worker_id
+ end
+ def init_prometheus_metrics
default_labels = { pid: worker_id }
@counter_violations = Gitlab::Metrics.counter(
:gitlab_memwd_violations_total,