summaryrefslogtreecommitdiff
path: root/lib/gitlab/memory/watchdog.rb
blob: aac70a2f6aa358c20532378415140cb38738f929 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# frozen_string_literal: true

module Gitlab
  module Memory
    # A background thread that monitors Ruby memory and calls
    # into a handler when the Ruby process violates defined limits
    # for an extended period of time.
    class Watchdog
      # This handler does nothing. It returns `false` to indicate to the
      # caller that the situation has not been dealt with, so it will
      # receive calls repeatedly while a monitor keeps exceeding its strikes.
      #
      # This is useful for "dress rehearsals" in production since it allows
      # us to observe how frequently the handler is invoked before taking action.
      class NullHandler
        # A single shared instance is used; obtain it via `NullHandler.instance`.
        include Singleton

        # Takes no action.
        #
        # @return [false] always, so the caller does not treat the
        #   violation as handled
        def call
          # NOP: intentionally no side effects
          false
        end
      end

      # This handler sends SIGTERM and considers the situation handled.
      class TermProcessHandler
        # @param pid [Integer] id of the process to signal; defaults to the
        #   id of the current process
        def initialize(pid = Process.pid)
          @pid = pid
        end

        # Sends SIGTERM to the process given at construction time.
        #
        # @return [true] always, telling the caller the situation was handled
        def call
          Process.kill(:TERM, @pid)

          true
        end
      end

      # This handler invokes Puma's graceful termination handler, which takes
      # into account a configurable grace period during which a process may
      # remain unresponsive to a SIGTERM.
      class PumaHandler
        # Builds a Puma worker handle that points at the current process.
        #
        # @param puma_options [Hash] options handed to the worker handle;
        #   defaults to the options of Puma's CLI configuration
        def initialize(puma_options = ::Puma.cli_config.options)
          # NOTE(review): the two zeros are presumably the worker index and
          # phase expected by WorkerHandle - confirm against the Puma version in use.
          @worker = ::Puma::Cluster::WorkerHandle.new(0, Process.pid, 0, puma_options)
        end

        # Asks the worker handle to terminate the process.
        #
        # @return [true] always, telling the caller the situation was handled
        def call
          @worker.term

          true
        end
      end

      # Creates a watchdog that is marked alive and owns a fresh
      # Configuration; use #configure to adjust it.
      def initialize
        @alive = true
        @configuration = Configuration.new
      end

      ##
      # Configuration for Watchdog, see Gitlab::Memory::Watchdog::Configurator
      # for examples.
      # Yields the configuration object to the given block and returns the
      # block's value. A block is required.
      def configure
        yield(configuration)
      end

      # Runs the watchdog loop in the calling thread.
      #
      # Reports a `started` event, then alternates between sleeping for the
      # configured interval and running the monitors for as long as the
      # watchdog is alive. Once #stop has cleared the alive flag, reports a
      # `stopped` event whose labels include the stop reason when one was given.
      def call
        event_reporter.started(log_labels)

        while @alive
          sleep(sleep_time_seconds)
          monitor
        end

        # `compact` drops :memwd_reason when no reason was supplied.
        event_reporter.stopped(log_labels({ memwd_reason: @reason }).compact)
      end

      # Asks the loop in #call to finish after its current iteration.
      #
      # @param reason [String, nil] optional explanation; #call adds it to
      #   the labels of the `stopped` event when present
      def stop(reason: nil)
        # Record the reason before clearing the flag: #call reads @reason
        # only after its loop has observed @alive as false.
        @reason = reason
        @alive = false
      end

      private

      # The Configuration instance created in #initialize.
      attr_reader :configuration

      # Forward these readers to the configuration object.
      # NOTE(review): `delegate` is presumably ActiveSupport's Module#delegate - confirm.
      delegate :event_reporter, :monitors, :sleep_time_seconds, to: :configuration

      # Runs one pass over the configured monitors.
      #
      # Stops the watchdog when no monitors are configured. Otherwise, for
      # each monitor result: reports a threshold violation if there is one,
      # and additionally triggers the strikes-exceeded handling when the
      # result says its strikes are exceeded. The pass is abandoned as soon
      # as the watchdog is no longer alive.
      def monitor
        if monitors.empty?
          stop(reason: 'monitors are not configured')
          return
        end

        monitors.call_each do |outcome|
          # A handler may have stopped the watchdog earlier in this pass.
          break unless @alive

          if outcome.threshold_violated?
            event_reporter.threshold_violated(outcome.monitor_name)

            strike_exceeded_callback(outcome.monitor_name, outcome.payload) if outcome.strikes_exceeded?
          end
        end
      end

      # Reacts to a monitor whose strikes are exceeded: reports the event,
      # enqueues a heap dump, and invokes the handler. The watchdog is
      # stopped only when the handler returns a truthy value.
      #
      # @param monitor_name [Object] name reported with the event
      # @param monitor_payload [Hash] extra labels merged into the log labels
      def strike_exceeded_callback(monitor_name, monitor_payload)
        event_reporter.strikes_exceeded(monitor_name, log_labels(monitor_payload))

        Gitlab::Memory::Reports::HeapDump.enqueue!

        # A falsy result means "not handled": keep the watchdog running.
        return unless handler.call

        stop(reason: 'successfully handled')
      end

      # Picks the handler to invoke when strikes are exceeded.
      #
      # With the :enforce_memory_watchdog ops flag disabled the watchdog
      # keeps running in a "friendly mode": the NullHandler is used, so
      # violations are still reported but nothing is acted upon.
      def handler
        if Feature.enabled?(:enforce_memory_watchdog, type: :ops)
          configuration.handler
        else
          NullHandler.instance
        end
      end

      # Builds the label hash attached to reported events.
      #
      # @param extra [Hash] additional labels; left unmodified
      # @return [Hash] a new hash of `extra` plus the handler class name and
      #   the sleep interval (these two win over same-named keys in `extra`)
      def log_labels(extra = {})
        fixed_labels = {
          memwd_handler_class: handler.class.name,
          memwd_sleep_time_s: sleep_time_seconds
        }

        extra.merge(fixed_labels)
      end
    end
  end
end