# frozen_string_literal: true
module Gitlab
module Memory
# A background thread that monitors Ruby memory and calls
# into a handler when the Ruby process violates defined limits
# for an extended period of time.
class Watchdog
# A no-op handler. It takes no action and answers `false`, telling the
# caller that nothing has been dealt with, so it will be invoked again and
# again for as long as fragmentation remains high.
#
# Handy for production "dress rehearsals": it lets us observe how often the
# handler would fire before we allow it to actually do anything.
class NullHandler
  include Singleton

  # Deliberately does nothing; always returns `false` ("not handled").
  def call
    false
  end
end
# Handler that delivers SIGTERM to a process and then reports the
# situation as handled.
class TermProcessHandler
  # pid - the process to signal; defaults to the current process.
  def initialize(pid = Process.pid)
    @pid = pid
  end

  # Sends TERM to the configured pid and returns `true` ("handled").
  def call
    Process.kill(:TERM, @pid)

    true
  end
end
# Handler that delegates to Puma's graceful termination logic, which honours
# a configurable grace period during which a process may remain
# unresponsive to a SIGTERM.
class PumaHandler
  # puma_options - options handed to the worker handle; defaults to the
  #                options of the running Puma CLI configuration.
  def initialize(puma_options = ::Puma.cli_config.options)
    # NOTE(review): the positional arguments are presumably worker index,
    # pid and phase - confirm against Puma's WorkerHandle.
    @worker = ::Puma::Cluster::WorkerHandle.new(0, Process.pid, 0, puma_options)
  end

  # Asks the worker handle to terminate and returns `true` ("handled").
  def call
    @worker.term

    true
  end
end
# Builds a watchdog with a new Configuration and flags it as alive, which
# is the condition `call` loops on.
def initialize
  @alive = true
  @configuration = Configuration.new
end
##
# Yields the watchdog's configuration object to the given block so the
# caller can set it up. See Gitlab::Memory::Watchdog::Configurator for
# examples.
def configure
  yield(configuration)
end
# Main loop of the watchdog. Reports the start, then repeatedly sleeps for
# the configured interval and runs one monitoring pass until the watchdog
# is no longer alive, and finally reports the stop together with the
# recorded stop reason (omitted from the labels when it is nil).
def call
  event_reporter.started(log_labels)

  while @alive
    sleep(sleep_time_seconds)
    monitor
  end

  reporter = event_reporter
  stop_labels = log_labels(memwd_reason: @stop_reason).compact
  reporter.stopped(stop_labels)
end
# Public way to ask the watchdog to stop; records a fixed stop reason.
def stop
  reason = 'background task stopped'

  stop_working(reason: reason)
end
# Everything below is internal to the watchdog.
private
# Reader for the Configuration instance assigned in #initialize.
attr_reader :configuration
# Forward these readers to the configuration object.
delegate :event_reporter, :monitors, :sleep_time_seconds, to: :configuration
# Runs a single monitoring pass.
#
# With no monitors configured the watchdog is stopped straight away.
# Otherwise every monitor result is inspected: a violated threshold is
# reported, and exceeded strikes additionally trigger the strike callback.
# The pass is cut short as soon as the watchdog is no longer alive.
def monitor
  if monitors.empty?
    stop_working(reason: 'monitors are not configured')
    return
  end

  monitors.call_each do |outcome|
    break unless @alive

    if outcome.threshold_violated?
      event_reporter.threshold_violated(outcome.monitor_name)

      strike_exceeded_callback(outcome.monitor_name, outcome.payload) if outcome.strikes_exceeded?
    end
  end
end
# Invoked when a monitor has exceeded its allowed strikes. Reports the
# event, enqueues a heap dump, and then calls the handler; the watchdog
# is stopped only when the handler reports the situation as handled.
def strike_exceeded_callback(monitor_name, monitor_payload)
  reporter = event_reporter
  reporter.strikes_exceeded(monitor_name, log_labels(monitor_payload))

  Gitlab::Memory::Reports::HeapDump.enqueue!

  return unless handler.call

  stop_working(reason: 'successfully handled')
end
# Picks the handler to invoke once strikes are exceeded.
#
# While the :enforce_memory_watchdog ops flag is off, the NullHandler is
# used instead of the configured one. This keeps the watchdog running in a
# "friendly mode" in which all that happens is that logs and Prometheus
# events are collected for violations.
def handler
  if Feature.enabled?(:enforce_memory_watchdog, type: :ops)
    configuration.handler
  else
    NullHandler.instance
  end
end
# Builds the label hash attached to reported events: the caller-supplied
# `extra` entries merged with the handler class name and the sleep interval.
# On a key clash the watchdog's own labels win. `extra` is not mutated.
def log_labels(extra = {})
  watchdog_labels = {
    memwd_handler_class: handler.class.name,
    memwd_sleep_time_s: sleep_time_seconds
  }

  extra.merge(watchdog_labels)
end
# Marks the watchdog as stopped and remembers why. Only the first call
# has any effect: once the watchdog is no longer alive, later calls leave
# the recorded reason untouched.
def stop_working(reason:)
  if @alive
    @stop_reason = reason
    @alive = false
  end
end
end
end
end