author     Qingyu Zhao <qzhao@gitlab.com>    2019-08-31 00:05:04 +1000
committer  Qingyu Zhao <qzhao@gitlab.com>    2019-09-17 18:07:30 +1000
commit     d7b03e28a77f40ed51c99c3668981999db27e6d1 (patch)
tree       dc3493327fffd4eeb54db5829e856f5fe44a0284
parent     3c372d87a701b600f42fccbf4707eebd67ff7cac (diff)
download   gitlab-ce-59754-independent-sidekiq-memory-killer.tar.gz
Add Sidekiq daemon memory killer (branch: 59754-independent-sidekiq-memory-killer)
When the daemon memory killer is enabled, the old memory killer is disabled.

The daemon memory killer (Gitlab::SidekiqDaemon::MemoryKiller):

- replaces the existing Gitlab::SidekiqMiddleware::MemoryKiller
- allows Sidekiq memory to balloon reasonably, within a given time limit
- reduces monitoring overhead (when there are many small jobs)
- monitors Sidekiq memory in a timely manner (for long-running jobs)
- respects the worker sidekiq_options key `memory_killer_memory_growth_kb`
- respects the worker sidekiq_options key `memory_killer_max_memory_growth_kb`

Also updates the documentation on the Sidekiq memory killer ENV variables.
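As a concrete illustration of the per-worker options listed above, here is a minimal sketch of how a worker might declare them. The worker name is hypothetical and `ApplicationWorker` is assumed to be the usual GitLab worker mixin; only the two `sidekiq_options` keys come from this commit (they are read by `Gitlab::SidekiqDaemon::MemoryKiller#get_job_options` below).

# Hypothetical worker, shown for illustration only
class SlowImportWorker
  include ApplicationWorker # assumed GitLab worker mixin

  # Let jobs of this worker grow RSS by ~10 KB per second of runtime,
  # capped at 100,000 KB, before counting against the soft limit.
  sidekiq_options memory_killer_memory_growth_kb: 10,
                  memory_killer_max_memory_growth_kb: 100_000

  def perform
    # long-running work goes here
  end
end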
-rw-r--r--  config/initializers/sidekiq.rb                           |  14
-rw-r--r--  doc/administration/operations/sidekiq_memory_killer.md  |  52
-rw-r--r--  doc/development/import_export.md                         |   4
-rw-r--r--  doc/user/gitlab_com/index.md                             |  15
-rw-r--r--  lib/gitlab/sidekiq_daemon/memory_killer.rb               | 222
-rw-r--r--  lib/gitlab/sidekiq_daemon/monitor.rb                     |  50
-rw-r--r--  lib/gitlab/sidekiq_middleware/monitor.rb                 |   2
-rw-r--r--  spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb    | 424
-rw-r--r--  spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb           |  48
-rw-r--r--  spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb      |   2
10 files changed, 783 insertions, 50 deletions
diff --git a/config/initializers/sidekiq.rb b/config/initializers/sidekiq.rb
index 20f31ff6810..e3505579204 100644
--- a/config/initializers/sidekiq.rb
+++ b/config/initializers/sidekiq.rb
@@ -28,16 +28,18 @@ if Rails.env.development?
end
enable_json_logs = Gitlab.config.sidekiq.log_format == 'json'
-enable_sidekiq_monitor = ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero?
+enable_sidekiq_memory_killer = ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS'].to_i.nonzero?
+use_sidekiq_daemon_memory_killer = ENV["SIDEKIQ_DAEMON_MEMORY_KILLER"].to_i.nonzero?
+use_sidekiq_legacy_memory_killer = !use_sidekiq_daemon_memory_killer
Sidekiq.configure_server do |config|
config.redis = queues_config_hash
config.server_middleware do |chain|
- chain.add Gitlab::SidekiqMiddleware::Monitor if enable_sidekiq_monitor
+ chain.add Gitlab::SidekiqMiddleware::Monitor
chain.add Gitlab::SidekiqMiddleware::Metrics if Settings.monitoring.sidekiq_exporter
chain.add Gitlab::SidekiqMiddleware::ArgumentsLogger if ENV['SIDEKIQ_LOG_ARGUMENTS'] && !enable_json_logs
- chain.add Gitlab::SidekiqMiddleware::MemoryKiller if ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS']
+ chain.add Gitlab::SidekiqMiddleware::MemoryKiller if enable_sidekiq_memory_killer && use_sidekiq_legacy_memory_killer
chain.add Gitlab::SidekiqMiddleware::RequestStoreMiddleware unless ENV['SIDEKIQ_REQUEST_STORE'] == '0'
chain.add Gitlab::SidekiqMiddleware::BatchLoader
chain.add Gitlab::SidekiqMiddleware::CorrelationLogger
@@ -60,7 +62,11 @@ Sidekiq.configure_server do |config|
# Sidekiq (e.g. in an initializer).
ActiveRecord::Base.clear_all_connections!
- Gitlab::SidekiqDaemon::Monitor.instance.start if enable_sidekiq_monitor
+ # Start the monitor to track running jobs. By default, job cancellation is not enabled.
+ # To cancel jobs, set `SIDEKIQ_MONITOR_WORKER=1` to enable the notification channel.
+ Gitlab::SidekiqDaemon::Monitor.instance.start
+
+ Gitlab::SidekiqDaemon::MemoryKiller.instance.start if enable_sidekiq_memory_killer && use_sidekiq_daemon_memory_killer
end
if enable_reliable_fetch?
diff --git a/doc/administration/operations/sidekiq_memory_killer.md b/doc/administration/operations/sidekiq_memory_killer.md
index 8eac42f2fe2..eb4dea91ebb 100644
--- a/doc/administration/operations/sidekiq_memory_killer.md
+++ b/doc/administration/operations/sidekiq_memory_killer.md
@@ -26,18 +26,48 @@ run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the
The MemoryKiller is controlled using environment variables.
-- `SIDEKIQ_MEMORY_KILLER_MAX_RSS`: if this variable is set, and its value is
- greater than 0, then after each Sidekiq job, the MemoryKiller will check the
- RSS of the Sidekiq process that executed the job. If the RSS of the Sidekiq
- process (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_MAX_RSS, a
- delayed shutdown is triggered. The default value for Omnibus packages is set
+- `SIDEKIQ_DAEMON_MEMORY_KILLER`: defaults to 0. When set to 1, the MemoryKiller
+ works in daemon mode. Otherwise, the MemoryKiller works in legacy mode.
+
+ In legacy mode, the MemoryKiller checks the Sidekiq process RSS after each job.
+ In daemon mode, the MemoryKiller checks the Sidekiq process RSS every 3 seconds.
+
+- `SIDEKIQ_MEMORY_KILLER_MAX_RSS`: if this variable is set, and its value is greater
+ than 0, the MemoryKiller is enabled. Otherwise the MemoryKiller is disabled.
+
+ In legacy mode, SIDEKIQ_MEMORY_KILLER_MAX_RSS acts as a hard limit. If the Sidekiq
+ process RSS (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_MAX_RSS,
+ a delayed shutdown is triggered.
+ In daemon mode, SIDEKIQ_MEMORY_KILLER_MAX_RSS acts as a soft limit. The RSS
+ of the Sidekiq process is allowed to exceed SIDEKIQ_MEMORY_KILLER_MAX_RSS for
+ a limited time.
+
+ The default value for Omnibus packages is set
[in the omnibus-gitlab
repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb).
+
+- `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS`: only for daemon mode. If the Sidekiq
+ process RSS (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS,
+ a delayed shutdown is triggered.
+
+- `SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME`: only for daemon mode. Defaults to
+ 900 seconds (15 minutes). As long as the RSS of the Sidekiq process stays under
+ SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS, it may exceed SIDEKIQ_MEMORY_KILLER_MAX_RSS
+ for up to this long. After that, a delayed shutdown is triggered.
+
+- `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL`: only for daemon mode. Defaults to
+ 3 seconds. The Sidekiq process RSS is checked at this interval. If it is out
+ of range, a delayed shutdown is triggered.
+
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes). When
- a shutdown is triggered, the Sidekiq process will keep working normally for
+ a shutdown is triggered, the Sidekiq process will keep working for up to
another 15 minutes.
-- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace
- time has expired, the MemoryKiller tells Sidekiq to stop accepting new jobs.
- Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells
- Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must
- restart Sidekiq.
+
+ In legacy mode, the Sidekiq process continues accepting new jobs during the grace time.
+ In daemon mode, the Sidekiq process stops accepting new jobs during the grace time.
+
+- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. After the graceful
+ shutdown period (either the grace time expired or all jobs finished), the MemoryKiller
+ tells Sidekiq to shut down. If the shutdown does not finish within 30 seconds, the
+ MemoryKiller terminates Sidekiq forcefully with SIGKILL, and an external
+ supervision mechanism (e.g. Runit) must restart Sidekiq.
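To make the daemon-mode behaviour described above concrete, here is a minimal, standalone Ruby sketch of the documented check loop. It is not the real implementation (that is `lib/gitlab/sidekiq_daemon/memory_killer.rb`, added later in this commit); the defaults and the `ps`-based RSS lookup mirror what is documented here.

# Standalone sketch of the daemon-mode RSS check, for illustration only.
soft_limit_kb   = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2_000_000).to_i
hard_limit_kb   = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', 256 * 1024 * 1024 * 1024).to_i
balloon_seconds = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME', 900).to_i
check_interval  = ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i

# RSS of the current process in kilobytes, using the same `ps` approach as the daemon
rss_kb = -> { `ps -o rss= -p #{Process.pid}`.to_i }

deadline = Time.now + balloon_seconds
loop do
  rss = rss_kb.call
  break puts("RSS #{rss} KB above the hard limit: delayed shutdown")    if rss > hard_limit_kb
  break puts("RSS #{rss} KB back under the soft limit: keep running")   if rss < soft_limit_kb
  break puts("RSS above the soft limit past the balloon time: restart") if Time.now > deadline

  sleep(check_interval)
end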
diff --git a/doc/development/import_export.md b/doc/development/import_export.md
index 0c343bc22e4..74cdc4c545f 100644
--- a/doc/development/import_export.md
+++ b/doc/development/import_export.md
@@ -31,7 +31,9 @@ Read through the current performance problems using the Import/Export below.
Out of memory (OOM) errors are normally caused by the [Sidekiq Memory Killer](../administration/operations/sidekiq_memory_killer.md):
```bash
-SIDEKIQ_MEMORY_KILLER_MAX_RSS = 2GB in GitLab.com
+SIDEKIQ_MEMORY_KILLER_MAX_RSS = 2000000
+SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS = 3000000
+SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME = 900
```
An import status `started`, and the following sidekiq logs will signal a memory issue:
diff --git a/doc/user/gitlab_com/index.md b/doc/user/gitlab_com/index.md
index 2f2955f5a1c..2683f2612ae 100644
--- a/doc/user/gitlab_com/index.md
+++ b/doc/user/gitlab_com/index.md
@@ -177,11 +177,16 @@ sentry_dsn = "X"
GitLab.com runs [Sidekiq][sidekiq] with arguments `--timeout=4 --concurrency=4`
and the following environment variables:
-| Setting | GitLab.com | Default |
-|-------- |----------- |-------- |
-| `SIDEKIQ_MEMORY_KILLER_MAX_RSS` | `1000000` | `2000000` |
-| `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL` | `SIGKILL` | - |
-| `SIDEKIQ_LOG_ARGUMENTS` | `1` | - |
+| Setting | GitLab.com | Default |
+|-------- |----------- |-------- |
+| `SIDEKIQ_DAEMON_MEMORY_KILLER` | - | - |
+| `SIDEKIQ_MEMORY_KILLER_MAX_RSS` | `16000000` | `2000000` |
+| `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS` | - | - |
+| `SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME` | - | `900` |
+| `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL` | - | `3` |
+| `SIDEKIQ_MEMORY_KILLER_GRACE_TIME` | - | `900` |
+| `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT` | - | `30` |
+| `SIDEKIQ_LOG_ARGUMENTS` | `1` | - |
## Cron jobs
diff --git a/lib/gitlab/sidekiq_daemon/memory_killer.rb b/lib/gitlab/sidekiq_daemon/memory_killer.rb
new file mode 100644
index 00000000000..854f0a079ef
--- /dev/null
+++ b/lib/gitlab/sidekiq_daemon/memory_killer.rb
@@ -0,0 +1,222 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module SidekiqDaemon
+ class MemoryKiller < Daemon
+ include ::Gitlab::Utils::StrongMemoize
+
+ # Today's 64-bit CPUs support at most 256 TB of memory, which is big enough as an upper bound.
+ MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024
+ # RSS below `soft_limit_rss` is considered safe
+ SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i
+ # A Sidekiq process with RSS above `hard_limit_rss` will be restarted right away
+ HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i
+ # RSS in range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS
+ GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME', 15 * 60).to_i
+ # Check RSS every CHECK_INTERVAL_SECONDS, minimum 2 seconds
+ CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max
+ # Give Sidekiq up to 15 minutes of grace time to allow existing jobs to finish after exceeding the limit
+ GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i
+ # Give Sidekiq up to 30 seconds to interrupt all running jobs and shutdown
+ SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i
+
+ def initialize
+ super
+
+ @enabled = true
+ end
+
+ private
+
+ def start_working
+ Sidekiq.logger.info(
+ class: self.class.to_s,
+ action: 'start',
+ pid: pid,
+ message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon'
+ )
+
+ while enabled?
+ begin
+ restart_sidekiq unless rss_within_range?
+ sleep(CHECK_INTERVAL_SECONDS)
+ rescue => e
+ log_exception(e, __method__)
+ rescue Exception => e # rubocop:disable Lint/RescueException
+ log_exception(e, __method__)
+ raise e
+ end
+ end
+ ensure
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ action: 'stop',
+ pid: pid,
+ message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon'
+ )
+ end
+
+ def log_exception(exception, method)
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ pid: pid,
+ message: "Exception from #{method}: #{exception.message}"
+ )
+ end
+
+ def stop_working
+ @enabled = false
+ end
+
+ def enabled?
+ @enabled
+ end
+
+ def restart_sidekiq
+ # Tell Sidekiq to stop fetching new jobs
+ # We first send the signal and then wait for the given time
+ # We also monitor the number of running jobs and allow an early restart when they finish
+ signal_and_wait(GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
+ return unless enabled?
+
+ # Tell Sidekiq to restart itself
+ signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTERM', 'gracefully shut down')
+ return unless enabled?
+
+ # Ideally we should never reach this condition
+ # Wait for Sidekiq to shut down gracefully, and kill it if it didn't
+ # Kill the whole pgroup, so we can be sure no children are left behind
+ # To be extra safe, wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
+ signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
+ end
+
+ def rss_within_range?
+ current_rss = nil
+ deadline = Time.now + GRACE_BALLOON_SECONDS.seconds
+ loop do
+ return true unless enabled?
+
+ current_rss = get_rss
+
+ # RSS above the hard limit triggers a forcible shutdown right away
+ break if current_rss > hard_limit_rss
+
+ # RSS dropped back below the soft limit, so we are safe again
+ return true if current_rss < soft_limit_rss
+
+ # RSS stayed above the soft limit past the deadline, so restart
+ break if Time.now > deadline
+
+ sleep(CHECK_INTERVAL_SECONDS)
+ end
+
+ log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
+
+ false
+ end
+
+ def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ pid: pid,
+ message: 'Sidekiq worker rss out of range',
+ reason: out_of_range_description(current_rss, hard_limit_rss, soft_limit_rss)
+ )
+ end
+
+ def out_of_range_description(rss, hard_limit, soft_limit)
+ if rss > hard_limit
+ "current_rss(#{rss}) > hard_limit_rss(#{hard_limit})"
+ else
+ "current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})"
+ end
+ end
+
+ def get_rss
+ output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}), Rails.root.to_s)
+ return 0 unless status&.zero?
+
+ output.to_i
+ end
+
+ def soft_limit_rss
+ SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
+ end
+
+ def hard_limit_rss
+ HARD_LIMIT_RSS_KB
+ end
+
+ def signal_and_wait(time, signal, explanation)
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ pid: pid,
+ message: "sending Sidekiq worker #{signal} (#{explanation}). Then wait at most #{time} seconds."
+ )
+ Process.kill(signal, pid)
+
+ deadline = Time.now + time
+
+ # We try to finish as early as possible once all jobs have finished,
+ # so we re-check that condition in a loop
+ sleep(CHECK_INTERVAL_SECONDS) while enabled? && any_jobs? && Time.now < deadline
+ end
+
+ def signal_pgroup(time, signal, explanation)
+ if Process.getpgrp == pid
+ pid_or_pgrp_str = 'PGRP'
+ pid_to_signal = 0
+ else
+ pid_or_pgrp_str = 'PID'
+ pid_to_signal = pid
+ end
+
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "waiting #{time} seconds before sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})"
+ )
+ sleep(time)
+
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})"
+ )
+ Process.kill(signal, pid_to_signal)
+ end
+
+ def rss_increase_by_jobs
+ Gitlab::SidekiqDaemon::Monitor.instance.jobs.sum do |job| # rubocop:disable CodeReuse/ActiveRecord
+ rss_increase_by_job(job)
+ end
+ end
+
+ def rss_increase_by_job(job)
+ memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i
+ max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', MAX_MEMORY_KB).to_i
+
+ return 0 if memory_growth_kb.zero?
+
+ time_elapsed = Time.now.to_i - job[:started_at]
+ [memory_growth_kb * time_elapsed, max_memory_growth_kb].min
+ end
+
+ def get_job_options(job, key, default)
+ job[:worker_class].sidekiq_options.fetch(key, default)
+ rescue
+ default
+ end
+
+ def pid
+ Process.pid
+ end
+
+ def any_jobs?
+ Gitlab::SidekiqDaemon::Monitor.instance.jobs.any?
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/sidekiq_daemon/monitor.rb b/lib/gitlab/sidekiq_daemon/monitor.rb
index bbfca130425..2b82adda430 100644
--- a/lib/gitlab/sidekiq_daemon/monitor.rb
+++ b/lib/gitlab/sidekiq_daemon/monitor.rb
@@ -14,19 +14,19 @@ module Gitlab
# that should not be caught by application
CancelledError = Class.new(Exception) # rubocop:disable Lint/InheritException
- attr_reader :jobs_thread
+ attr_reader :jobs
attr_reader :jobs_mutex
def initialize
super
- @jobs_thread = {}
+ @jobs = {}
@jobs_mutex = Mutex.new
end
- def within_job(jid, queue)
+ def within_job(worker_class, jid, queue)
jobs_mutex.synchronize do
- jobs_thread[jid] = Thread.current
+ jobs[jid] = { worker_class: worker_class, thread: Thread.current, started_at: Time.now.to_i }
end
if cancelled?(jid)
@@ -43,7 +43,7 @@ module Gitlab
yield
ensure
jobs_mutex.synchronize do
- jobs_thread.delete(jid)
+ jobs.delete(jid)
end
end
@@ -62,23 +62,27 @@ module Gitlab
private
def start_working
- Sidekiq.logger.info(
- class: self.class.to_s,
- action: 'start',
- message: 'Starting Monitor Daemon'
- )
+ return unless notification_channel_enabled?
- while enabled?
- process_messages
- sleep(RECONNECT_TIME)
- end
+ begin
+ Sidekiq.logger.info(
+ class: self.class.to_s,
+ action: 'start',
+ message: 'Starting Monitor Daemon'
+ )
- ensure
- Sidekiq.logger.warn(
- class: self.class.to_s,
- action: 'stop',
- message: 'Stopping Monitor Daemon'
- )
+ while enabled?
+ process_messages
+ sleep(RECONNECT_TIME)
+ end
+
+ ensure
+ Sidekiq.logger.warn(
+ class: self.class.to_s,
+ action: 'stop',
+ message: 'Stopping Monitor Daemon'
+ )
+ end
end
def stop_working
@@ -156,7 +160,7 @@ module Gitlab
# This is why it passes thread in block,
# to ensure that we do process this thread
def find_thread_unsafe(jid)
- jobs_thread[jid]
+ jobs.dig(jid, :thread)
end
def find_thread_with_lock(jid)
@@ -179,6 +183,10 @@ module Gitlab
def self.cancel_job_key(jid)
"sidekiq:cancel:#{jid}"
end
+
+ def notification_channel_enabled?
+ ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero?
+ end
end
end
end
diff --git a/lib/gitlab/sidekiq_middleware/monitor.rb b/lib/gitlab/sidekiq_middleware/monitor.rb
index 00965bf5506..ed825dbfd60 100644
--- a/lib/gitlab/sidekiq_middleware/monitor.rb
+++ b/lib/gitlab/sidekiq_middleware/monitor.rb
@@ -4,7 +4,7 @@ module Gitlab
module SidekiqMiddleware
class Monitor
def call(worker, job, queue)
- Gitlab::SidekiqDaemon::Monitor.instance.within_job(job['jid'], queue) do
+ Gitlab::SidekiqDaemon::Monitor.instance.within_job(worker.class, job['jid'], queue) do
yield
end
rescue Gitlab::SidekiqDaemon::Monitor::CancelledError
diff --git a/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb b/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb
new file mode 100644
index 00000000000..6280b9bd66e
--- /dev/null
+++ b/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb
@@ -0,0 +1,424 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Gitlab::SidekiqDaemon::MemoryKiller do
+ let(:memory_killer) { described_class.new }
+ let(:pid) { 12345 }
+
+ before do
+ allow(memory_killer).to receive(:pid).and_return(pid)
+ allow(Sidekiq.logger).to receive(:info)
+ allow(Sidekiq.logger).to receive(:warn)
+ end
+
+ describe '#start_working' do
+ subject { memory_killer.send(:start_working) }
+
+ before do
+ # let enabled? return 3 times: true, true, false
+ allow(memory_killer).to receive(:enabled?).and_return(true, true, false)
+ end
+
+ context 'when structured logging is used' do
+ it 'logs start message once' do
+ expect(Sidekiq.logger).to receive(:info).once
+ .with(
+ class: described_class.to_s,
+ action: 'start',
+ pid: pid,
+ message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon')
+
+ subject
+ end
+
+ it 'logs StandardError message twice' do
+ expect(Sidekiq.logger).to receive(:warn).twice
+ .with(
+ class: described_class.to_s,
+ pid: pid,
+ message: "Exception from start_working: My Exception")
+
+ expect(memory_killer).to receive(:rss_within_range?).twice.and_raise(StandardError, 'My Exception')
+
+ expect { subject }.not_to raise_exception
+ end
+
+ it 'logs exception message once, raises the exception, and logs stop message' do
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ pid: pid,
+ message: "Exception from start_working: My Exception")
+
+ expect(memory_killer).to receive(:rss_within_range?).once.and_raise(Exception, 'My Exception')
+
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ action: 'stop',
+ pid: pid,
+ message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon')
+
+ expect { subject }.to raise_exception
+ end
+
+ it 'logs stop message once' do
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ action: 'stop',
+ pid: pid,
+ message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon')
+
+ subject
+ end
+ end
+
+ it 'invokes rss_within_range? twice' do
+ expect(memory_killer).to receive(:rss_within_range?).twice
+
+ subject
+ end
+
+ it 'does not invoke restart_sidekiq when rss is within range' do
+ expect(memory_killer).to receive(:rss_within_range?).twice.and_return(true)
+
+ expect(memory_killer).not_to receive(:restart_sidekiq)
+
+ subject
+ end
+
+ it 'invokes restart_sidekiq when rss is not within range' do
+ expect(memory_killer).to receive(:rss_within_range?).at_least(:once).and_return(false)
+
+ expect(memory_killer).to receive(:restart_sidekiq).at_least(:once)
+
+ subject
+ end
+ end
+
+ describe '#stop_working' do
+ subject { memory_killer.send(:stop_working) }
+
+ it 'changes enabled? to false' do
+ expect(memory_killer.send(:enabled?)).to be true
+ subject
+ expect(memory_killer.send(:enabled?)).to be false
+ end
+ end
+
+ describe '#rss_within_range?' do
+ let(:graceful_shutdown_timeout_seconds) { 6 }
+ let(:shutdown_timeout_seconds) { 7 }
+ let(:check_interval_seconds) { 2 }
+ let(:grace_balloon_seconds) { 5 }
+
+ subject { memory_killer.send(:rss_within_range?) }
+
+ before do
+ stub_const("#{described_class}::GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS", graceful_shutdown_timeout_seconds)
+ stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds)
+ stub_const("#{described_class}::CHECK_INTERVAL_SECONDS", check_interval_seconds)
+ stub_const("#{described_class}::GRACE_BALLOON_SECONDS", grace_balloon_seconds)
+ allow(Process).to receive(:getpgrp).and_return(pid)
+ allow(Sidekiq).to receive(:options).and_return(timeout: 9)
+ end
+
+ it 'returns true when everything is within limits' do
+ expect(memory_killer).to receive(:get_rss).and_return(100)
+ expect(memory_killer).to receive(:soft_limit_rss).and_return(200)
+ expect(memory_killer).to receive(:hard_limit_rss).and_return(300)
+
+ expect(Time).to receive(:now).and_call_original
+ expect(memory_killer).not_to receive(:log_rss_out_of_range)
+
+ expect(subject).to be true
+ end
+
+ it 'returns false when rss exceeds hard_limit_rss' do
+ expect(memory_killer).to receive(:get_rss).and_return(400)
+ expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200)
+ expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300)
+
+ expect(Time).to receive(:now).and_call_original
+
+ expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200)
+
+ expect(subject).to be false
+ end
+
+ it 'returns false when rss exceeds hard_limit_rss after a while' do
+ expect(memory_killer).to receive(:get_rss).and_return(250, 400)
+ expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200)
+ expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300)
+
+ expect(Time).to receive(:now).twice.and_call_original
+ expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
+
+ expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200)
+
+ expect(subject).to be false
+ end
+
+ it 'returns true when rss drops below soft_limit_rss within GRACE_BALLOON_SECONDS' do
+ expect(memory_killer).to receive(:get_rss).and_return(250, 100)
+ expect(memory_killer).to receive(:soft_limit_rss).and_return(200, 200)
+ expect(memory_killer).to receive(:hard_limit_rss).and_return(300, 300)
+
+ expect(Time).to receive(:now).twice.and_call_original
+ expect(memory_killer).to receive(:sleep).with(check_interval_seconds)
+
+ expect(memory_killer).not_to receive(:log_rss_out_of_range)
+
+ expect(subject).to be true
+ end
+
+ it 'returns false when rss exceeds soft_limit_rss for longer than GRACE_BALLOON_SECONDS' do
+ expect(memory_killer).to receive(:get_rss).exactly(4).times.and_return(250)
+ expect(memory_killer).to receive(:soft_limit_rss).exactly(5).times.and_return(200)
+ expect(memory_killer).to receive(:hard_limit_rss).exactly(5).times.and_return(300)
+
+ expect(Time).to receive(:now).exactly(5).times.and_call_original
+ expect(memory_killer).to receive(:sleep).exactly(3).times.with(check_interval_seconds).and_call_original
+
+ expect(memory_killer).to receive(:log_rss_out_of_range).with(250, 300, 200)
+
+ expect(subject).to be false
+ end
+ end
+
+ describe '#restart_sidekiq' do
+ let(:graceful_shutdown_timeout_seconds) { 6 }
+ let(:shutdown_timeout_seconds) { 7 }
+
+ subject { memory_killer.send(:restart_sidekiq) }
+
+ before do
+ stub_const("#{described_class}::GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS", graceful_shutdown_timeout_seconds)
+ stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds)
+ allow(Sidekiq).to receive(:options).and_return(timeout: 9)
+ end
+
+ it 'sends the signals in order' do
+ expect(memory_killer).to receive(:signal_and_wait).with(graceful_shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs').ordered
+ expect(memory_killer).to receive(:signal_and_wait).with(shutdown_timeout_seconds, 'SIGTERM', 'gracefully shut down').ordered
+ expect(memory_killer).to receive(:signal_pgroup).with(11, 'SIGKILL', 'die').ordered
+
+ subject
+ end
+ end
+
+ describe '#signal_and_wait' do
+ let(:time) { 7 }
+ let(:signal) { 'my-signal' }
+ let(:explanation) { 'my-explanation' }
+ let(:check_interval_seconds) { 2 }
+
+ subject { memory_killer.send(:signal_and_wait, time, signal, explanation) }
+
+ before do
+ stub_const("#{described_class}::CHECK_INTERVAL_SECONDS", check_interval_seconds)
+ end
+
+ it 'sends the signal and returns when all jobs have finished' do
+ expect(Process).to receive(:kill).with(signal, pid).ordered
+ expect(Time).to receive(:now).and_call_original
+
+ expect(memory_killer).to receive(:enabled?).and_return(true)
+ expect(memory_killer).to receive(:any_jobs?).and_return(false)
+
+ expect(memory_killer).not_to receive(:sleep)
+
+ subject
+ end
+
+ it 'sends the signal and waits until the deadline if any job is not finished' do
+ expect(Process).to receive(:kill).with(signal, pid).ordered
+ expect(Time).to receive(:now).and_call_original.at_least(:once)
+
+ expect(memory_killer).to receive(:enabled?).and_return(true).at_least(:once)
+ expect(memory_killer).to receive(:any_jobs?).and_return(true).at_least(:once)
+
+ expect(memory_killer).to receive(:sleep).and_call_original.exactly(4).times
+
+ subject
+ end
+ end
+
+ describe '#signal_pgroup' do
+ let(:time) { 2 }
+ let(:signal) { 'my-signal' }
+ let(:explanation) { 'my-explanation' }
+
+ subject { memory_killer.send(:signal_pgroup, time, signal, explanation) }
+
+ it 'sends the signal to this process if it is not the group leader' do
+ expect(Process).to receive(:getpgrp).and_return(pid + 1)
+
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})")
+ expect(memory_killer).to receive(:sleep).with(time).ordered
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})")
+ expect(Process).to receive(:kill).with(signal, pid).ordered
+
+ subject
+ end
+
+ it 'sends the signal to the whole process group when it is the group leader' do
+ expect(Process).to receive(:getpgrp).and_return(pid)
+
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})")
+ expect(memory_killer).to receive(:sleep).with(time).ordered
+ expect(Sidekiq.logger).to receive(:warn).once
+ .with(
+ class: described_class.to_s,
+ signal: signal,
+ pid: pid,
+ message: "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})")
+ expect(Process).to receive(:kill).with(signal, 0).ordered
+
+ subject
+ end
+ end
+
+ describe '#log_rss_out_of_range' do
+ let(:current_rss) { 100 }
+ let(:soft_limit_rss) { 200 }
+ let(:hard_limit_rss) { 300 }
+ let(:reason) { 'rss out of range reason description' }
+
+ subject { memory_killer.send(:log_rss_out_of_range, current_rss, hard_limit_rss, soft_limit_rss) }
+
+ it 'invokes Sidekiq logger warn' do
+ expect(memory_killer).to receive(:out_of_range_description).with(current_rss, hard_limit_rss, soft_limit_rss).and_return(reason)
+ expect(Sidekiq.logger).to receive(:warn)
+ .with(
+ class: described_class.to_s,
+ pid: pid,
+ message: 'Sidekiq worker rss out of range',
+ reason: reason)
+
+ subject
+ end
+ end
+
+ describe '#out_of_range_description' do
+ let(:hard_limit) { 300 }
+ let(:soft_limit) { 200 }
+ let(:grace_balloon_seconds) { 12 }
+
+ subject { memory_killer.send(:out_of_range_description, rss, hard_limit, soft_limit) }
+
+ context 'when rss > hard_limit' do
+ let(:rss) { 400 }
+
+ it 'tells reason' do
+ expect(subject).to eq("current_rss(#{rss}) > hard_limit_rss(#{hard_limit})")
+ end
+ end
+
+ context 'when rss <= hard_limit' do
+ let(:rss) { 300 }
+
+ it 'tells reason' do
+ stub_const("#{described_class}::GRACE_BALLOON_SECONDS", grace_balloon_seconds)
+ expect(subject).to eq("current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{grace_balloon_seconds})")
+ end
+ end
+ end
+
+ describe '#rss_increase_by_jobs' do
+ let(:running_jobs) { { id1: 'job1', id2: 'job2' } }
+
+ subject { memory_killer.send(:rss_increase_by_jobs) }
+
+ it 'adds up individual rss_increase_by_job' do
+ expect(Gitlab::SidekiqDaemon::Monitor).to receive_message_chain(:instance, :jobs).and_return(running_jobs)
+ expect(memory_killer).to receive(:rss_increase_by_job).and_return(11, 22)
+ expect(subject).to eq(33)
+ end
+
+ it 'returns 0 if there are no jobs' do
+ expect(Gitlab::SidekiqDaemon::Monitor).to receive_message_chain(:instance, :jobs).and_return({})
+ expect(subject).to eq(0)
+ end
+ end
+
+ describe '#rss_increase_by_job' do
+ let(:worker_class) { Chaos::SleepWorker }
+ let(:job) { { worker_class: worker_class, started_at: 321 } }
+ let(:max_memory_kb) { 100000 }
+
+ subject { memory_killer.send(:rss_increase_by_job, job) }
+
+ before do
+ stub_const("#{described_class}::MAX_MEMORY_KB", max_memory_kb)
+ end
+
+ it 'returns 0 if memory_growth_kb returns 0' do
+ expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(0)
+ expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(0)
+
+ expect(Time).not_to receive(:now)
+ expect(subject).to eq(0)
+ end
+
+ it 'returns the time-factored growth value when it does not exceed the max growth limit for a whitelisted job' do
+ expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(10)
+ expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(100)
+
+ expect(Time).to receive(:now).and_return(323)
+ expect(subject).to eq(20)
+ end
+
+ it 'returns the max growth limit when the time-factored growth value exceeds it for a whitelisted job' do
+ expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(10)
+ expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(100)
+
+ expect(Time).to receive(:now).and_return(332)
+ expect(subject).to eq(100)
+ end
+ end
+
+ describe '#get_job_options' do
+ let(:worker_class) { Chaos::SleepWorker }
+ let(:job) { { worker_class: worker_class, started_at: 321 } }
+ let(:key) { 'my-key' }
+ let(:default) { 'my-default' }
+
+ subject { memory_killer.send(:get_job_options, job, key, default) }
+
+ it 'returns the default if the key is not defined' do
+ expect(worker_class).to receive(:sidekiq_options).and_return({ "retry" => 5 })
+
+ expect(subject).to eq(default)
+ end
+
+ it 'returns the default if a StandardError is raised when retrieving sidekiq_options' do
+ expect(worker_class).to receive(:sidekiq_options).and_raise(StandardError)
+
+ expect(subject).to eq(default)
+ end
+
+ it 'returns the right value if sidekiq_options has the key' do
+ expect(worker_class).to receive(:sidekiq_options).and_return({ key => 10 })
+
+ expect(subject).to eq(10)
+ end
+ end
+end
diff --git a/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb b/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb
index acbb09e3542..397098ed5a4 100644
--- a/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb
+++ b/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb
@@ -8,12 +8,12 @@ describe Gitlab::SidekiqDaemon::Monitor do
describe '#within_job' do
it 'tracks thread' do
blk = proc do
- expect(monitor.jobs_thread['jid']).not_to be_nil
+ expect(monitor.jobs.dig('jid', :thread)).not_to be_nil
"OK"
end
- expect(monitor.within_job('jid', 'queue', &blk)).to eq("OK")
+ expect(monitor.within_job('worker_class', 'jid', 'queue', &blk)).to eq("OK")
end
context 'when job is canceled' do
@@ -25,19 +25,34 @@ describe Gitlab::SidekiqDaemon::Monitor do
it 'does not execute a block' do
expect do |blk|
- monitor.within_job(jid, 'queue', &blk)
+ monitor.within_job('worker_class', jid, 'queue', &blk)
rescue described_class::CancelledError
end.not_to yield_control
end
it 'raises exception' do
- expect { monitor.within_job(jid, 'queue') }.to raise_error(
+ expect { monitor.within_job('worker_class', jid, 'queue') }.to raise_error(
described_class::CancelledError)
end
end
end
- describe '#start_working' do
+ describe '#start_working when notification channel not enabled' do
+ subject { monitor.send(:start_working) }
+
+ it 'returns immediately' do
+ allow(monitor).to receive(:notification_channel_enabled?).and_return(nil)
+
+ expect(Sidekiq.logger).not_to receive(:info)
+ expect(Sidekiq.logger).not_to receive(:warn)
+ expect(monitor).not_to receive(:enabled?)
+ expect(monitor).not_to receive(:process_messages)
+
+ subject
+ end
+ end
+
+ describe '#start_working when notification channel enabled' do
subject { monitor.send(:start_working) }
before do
@@ -45,6 +60,7 @@ describe Gitlab::SidekiqDaemon::Monitor do
# we toggle `enabled?` flag after the first call
stub_const('Gitlab::SidekiqDaemon::Monitor::RECONNECT_TIME', 0)
allow(monitor).to receive(:enabled?).and_return(true, false)
+ allow(monitor).to receive(:notification_channel_enabled?).and_return(1)
allow(Sidekiq.logger).to receive(:info)
allow(Sidekiq.logger).to receive(:warn)
@@ -204,7 +220,7 @@ describe Gitlab::SidekiqDaemon::Monitor do
let(:thread) { Thread.new { sleep 1000 } }
before do
- monitor.jobs_thread[jid] = thread
+ monitor.jobs[jid] = { worker_class: 'worker_class', thread: thread, started_at: Time.now.to_i }
end
after do
@@ -258,4 +274,24 @@ describe Gitlab::SidekiqDaemon::Monitor do
subject
end
end
+
+ describe '#notification_channel_enabled?' do
+ subject { monitor.send(:notification_channel_enabled?) }
+
+ it 'returns nil when SIDEKIQ_MONITOR_WORKER is not set' do
+ expect(subject).to be nil
+ end
+
+ it 'returns nil when SIDEKIQ_MONITOR_WORKER is set to 0' do
+ allow(ENV).to receive(:fetch).with('SIDEKIQ_MONITOR_WORKER', 0).and_return("0")
+
+ expect(subject).to be nil
+ end
+
+ it 'returns 1 when SIDEKIQ_MONITOR_WORKER is set to 1' do
+ allow(ENV).to receive(:fetch).with('SIDEKIQ_MONITOR_WORKER', 0).and_return("1")
+
+ expect(subject).to be 1
+ end
+ end
end
diff --git a/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb b/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb
index 023df1a6391..398144025ea 100644
--- a/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb
+++ b/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb
@@ -12,7 +12,7 @@ describe Gitlab::SidekiqMiddleware::Monitor do
it 'calls Gitlab::SidekiqDaemon::Monitor' do
expect(Gitlab::SidekiqDaemon::Monitor.instance).to receive(:within_job)
- .with('job-id', 'my-queue')
+ .with(anything, 'job-id', 'my-queue')
.and_call_original
expect { |blk| monitor.call(worker, job, queue, &blk) }.to yield_control