path: root/lib/gitlab/memory/watchdog.rb
blob: 7007fdfe386cda3b76b6fad98703989ddd88d06a (plain)
# frozen_string_literal: true

module Gitlab
  module Memory
    # A background thread that monitors Ruby memory and calls
    # into a handler when the Ruby process violates defined limits
    # for an extended period of time.
    class Watchdog
      # This handler does nothing. It returns `false` to indicate to the
      # caller that the situation has not been dealt with so it will
      # receive calls repeatedly if fragmentation remains high.
      #
      # This is useful for "dress rehearsals" in production since it allows
      # us to observe how frequently the handler is invoked before taking action.
      class NullHandler
        include Singleton

        def call
          # NOP
          false
        end
      end

      # This handler sends SIGTERM and considers the situation handled.
      class TermProcessHandler
        def initialize(pid = $$)
          @pid = pid
        end

        def call
          Process.kill(:TERM, @pid)
          true
        end
      end

      # This handler invokes Puma's graceful termination handler, which takes
      # into account a configurable grace period during which a process may
      # remain unresponsive to a SIGTERM.
      class PumaHandler
        def initialize(puma_options = ::Puma.cli_config.options)
          @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
        end

        def call
          @worker.term
          true
        end
      end
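
      # Any object that responds to `call` can serve as a handler; the boolean
      # return value tells the watchdog whether the violation was dealt with.
      # A hypothetical, hedged sketch of a log-only handler (not part of this
      # file or of GitLab):
      #
      #   class LogOnlyHandler
      #     def initialize(logger)
      #       @logger = logger
      #     end
      #
      #     def call
      #       @logger.warn('memory limit exceeded, taking no action')
      #       false # not handled, so the watchdog keeps invoking the handler
      #     end
      #   end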

      def initialize
        @configuration = Configuration.new
        @alive = true

        init_prometheus_metrics
      end

      def configure
        yield @configuration
      end

      def call
        logger.info(log_labels.merge(message: 'started'))

        while @alive
          sleep(sleep_time_seconds)

          monitor if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
        end

        logger.info(log_labels.merge(message: 'stopped'))
      end

      def stop
        @alive = false
      end

      private

      def monitor
        @configuration.monitors.call_each do |result|
          break unless @alive

          next unless result.threshold_violated?

          # Count every threshold violation, even before any strikes accrue.
          @counter_violations.increment(reason: result.monitor_name)

          next unless result.strikes_exceeded?

          # Only act once the monitor has accumulated enough strikes. If the
          # handler reports the situation as handled (returns true), stop the
          # watchdog loop.
          @alive = !memory_limit_exceeded_callback(result.monitor_name, result.payload)
        end
      end

      def memory_limit_exceeded_callback(monitor_name, monitor_payload)
        all_labels = log_labels.merge(monitor_payload)
        logger.warn(all_labels)
        @counter_violations_handled.increment(reason: monitor_name)

        handler.call
      end

      def handler
        # This allows us to keep the watchdog running but turn it into "friendly mode" where
        # all that happens is we collect logs and Prometheus events for fragmentation violations.
        return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)

        @configuration.handler
      end

      def logger
        @configuration.logger
      end

      def sleep_time_seconds
        @configuration.sleep_time_seconds
      end

      def log_labels
        {
          pid: $$,
          worker_id: worker_id,
          memwd_handler_class: handler.class.name,
          memwd_sleep_time_s: sleep_time_seconds,
          memwd_rss_bytes: process_rss_bytes
        }
      end

      def process_rss_bytes
        Gitlab::Metrics::System.memory_usage_rss
      end

      def worker_id
        ::Prometheus::PidProvider.worker_id
      end

      def init_prometheus_metrics
        default_labels = { pid: worker_id }
        @counter_violations = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_total,
          'Total number of times a Ruby process violated a memory threshold',
          default_labels
        )
        @counter_violations_handled = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_handled_total,
          'Total number of times Ruby process memory violations were handled',
          default_labels
        )
      end
    end
  end
end
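
A minimal usage sketch, assuming this runs inside a GitLab process (where Feature, Gitlab::Metrics and Prometheus::PidProvider are available) and assuming the Configuration object exposes plain writers for the attributes read above (handler, logger, sleep_time_seconds); the real Configuration class lives in a separate file and its API may differ:

    watchdog = Gitlab::Memory::Watchdog.new
    watchdog.configure do |config|
      config.logger = Gitlab::AppLogger                                 # assumed writer
      config.sleep_time_seconds = 60                                    # assumed writer
      config.handler = Gitlab::Memory::Watchdog::NullHandler.instance   # observe only
      # monitors would also be registered here; that API is not shown in this file
    end

    # `call` blocks in its sleep/monitor loop, so run it on a dedicated thread
    # and stop it during shutdown; `stop` flips @alive and the loop exits once
    # the current sleep interval finishes.
    thread = Thread.new { watchdog.call }
    at_exit do
      watchdog.stop
      thread.join
    end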