| author    | Qingyu Zhao <qzhao@gitlab.com> | 2019-08-31 00:05:04 +1000 |
|-----------|--------------------------------|---------------------------|
| committer | Qingyu Zhao <qzhao@gitlab.com> | 2019-09-17 18:07:30 +1000 |
| commit    | d7b03e28a77f40ed51c99c3668981999db27e6d1 (patch) | |
| tree      | dc3493327fffd4eeb54db5829e856f5fe44a0284 | |
| parent    | 3c372d87a701b600f42fccbf4707eebd67ff7cac (diff) | |
| download  | gitlab-ce-59754-independent-sidekiq-memory-killer.tar.gz | |
Add Sidekiq daemon memory killer
When the daemon memory killer is enabled, the old memory killer is disabled.

The daemon memory killer (Gitlab::SidekiqDaemon::MemoryKiller):

- replaces the existing Gitlab::SidekiqMiddleware::MemoryKiller
- allows Sidekiq memory to balloon within reason, inside a given time limit
- reduces monitoring overhead (when there are many small jobs)
- monitors Sidekiq memory promptly (for long-running jobs)
- respects the worker's sidekiq_options `memory_killer_memory_growth_kb`
- respects the worker's sidekiq_options `memory_killer_max_memory_growth_kb` (see the sketch below)

Update documentation on the Sidekiq memory killer environment variables.
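As an illustration, a minimal sketch of how a worker might declare these two options. The worker name, its body, and the numbers are hypothetical; only the two `memory_killer_*` keys come from this change (the daemon reads them via `get_job_options` in the new `memory_killer.rb` below):

```ruby
# Hypothetical worker. Declares an expected RSS growth of ~10 KB per second
# while a job of this class runs, capped at 100 MB in total; the daemon adds
# this allowance to its soft RSS limit while such jobs are running.
class BigImportWorker
  include ApplicationWorker

  sidekiq_options memory_killer_memory_growth_kb: 10,
                  memory_killer_max_memory_growth_kb: 100_000

  def perform(project_id)
    # long-running import work ...
  end
end
```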
-rw-r--r-- | config/initializers/sidekiq.rb                          |  14 |
-rw-r--r-- | doc/administration/operations/sidekiq_memory_killer.md  |  52 |
-rw-r--r-- | doc/development/import_export.md                        |   4 |
-rw-r--r-- | doc/user/gitlab_com/index.md                            |  15 |
-rw-r--r-- | lib/gitlab/sidekiq_daemon/memory_killer.rb              | 222 |
-rw-r--r-- | lib/gitlab/sidekiq_daemon/monitor.rb                    |  50 |
-rw-r--r-- | lib/gitlab/sidekiq_middleware/monitor.rb                |   2 |
-rw-r--r-- | spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb    | 424 |
-rw-r--r-- | spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb          |  48 |
-rw-r--r-- | spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb      |   2 |
10 files changed, 783 insertions, 50 deletions
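Before the diff itself, a small worked example of the daemon's balloon accounting, assuming the documented defaults and a single running job. All numbers here are made up; the `[growth * elapsed, max].min` capping mirrors `rss_increase_by_job` in the new `memory_killer.rb` below:

```ruby
# Assumed values: soft limit from SIDEKIQ_MEMORY_KILLER_MAX_RSS, and one job
# declaring memory_killer_memory_growth_kb: 10 that has run for 60 seconds.
soft_limit_kb = 2_000_000
growth_kb     = 10
max_growth_kb = 100_000
elapsed       = 60

# Per-job allowance, capped exactly as rss_increase_by_job does below.
allowance_kb = [growth_kb * elapsed, max_growth_kb].min   # => 600

# The daemon compares current RSS against this adjusted soft limit; RSS above
# it (but under the hard limit) is tolerated for the grace balloon time only.
effective_soft_limit_kb = soft_limit_kb + allowance_kb    # => 2_000_600
```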
diff --git a/config/initializers/sidekiq.rb b/config/initializers/sidekiq.rb
index 20f31ff6810..e3505579204 100644
--- a/config/initializers/sidekiq.rb
+++ b/config/initializers/sidekiq.rb
@@ -28,16 +28,18 @@ if Rails.env.development?
 end

 enable_json_logs = Gitlab.config.sidekiq.log_format == 'json'
-enable_sidekiq_monitor = ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero?
+enable_sidekiq_memory_killer = ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS'].to_i.nonzero?
+use_sidekiq_daemon_memory_killer = ENV["SIDEKIQ_DAEMON_MEMORY_KILLER"].to_i.nonzero?
+use_sidekiq_legacy_memory_killer = !use_sidekiq_daemon_memory_killer

 Sidekiq.configure_server do |config|
   config.redis = queues_config_hash

   config.server_middleware do |chain|
-    chain.add Gitlab::SidekiqMiddleware::Monitor if enable_sidekiq_monitor
+    chain.add Gitlab::SidekiqMiddleware::Monitor
     chain.add Gitlab::SidekiqMiddleware::Metrics if Settings.monitoring.sidekiq_exporter
     chain.add Gitlab::SidekiqMiddleware::ArgumentsLogger if ENV['SIDEKIQ_LOG_ARGUMENTS'] && !enable_json_logs
-    chain.add Gitlab::SidekiqMiddleware::MemoryKiller if ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS']
+    chain.add Gitlab::SidekiqMiddleware::MemoryKiller if enable_sidekiq_memory_killer && use_sidekiq_legacy_memory_killer
     chain.add Gitlab::SidekiqMiddleware::RequestStoreMiddleware unless ENV['SIDEKIQ_REQUEST_STORE'] == '0'
     chain.add Gitlab::SidekiqMiddleware::BatchLoader
     chain.add Gitlab::SidekiqMiddleware::CorrelationLogger
@@ -60,7 +62,11 @@ Sidekiq.configure_server do |config|
     # Sidekiq (e.g. in an initializer).
     ActiveRecord::Base.clear_all_connections!

-    Gitlab::SidekiqDaemon::Monitor.instance.start if enable_sidekiq_monitor
+    # Start the monitor to track running jobs. By default, job cancellation is not enabled;
+    # to cancel jobs, set `SIDEKIQ_MONITOR_WORKER=1` to enable the notification channel.
+    Gitlab::SidekiqDaemon::Monitor.instance.start
+
+    Gitlab::SidekiqDaemon::MemoryKiller.instance.start if enable_sidekiq_memory_killer && use_sidekiq_daemon_memory_killer
   end

   if enable_reliable_fetch?
diff --git a/doc/administration/operations/sidekiq_memory_killer.md b/doc/administration/operations/sidekiq_memory_killer.md
index 8eac42f2fe2..eb4dea91ebb 100644
--- a/doc/administration/operations/sidekiq_memory_killer.md
+++ b/doc/administration/operations/sidekiq_memory_killer.md
@@ -26,18 +26,48 @@ run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the

 The MemoryKiller is controlled using environment variables.

-- `SIDEKIQ_MEMORY_KILLER_MAX_RSS`: if this variable is set, and its value is
-  greater than 0, then after each Sidekiq job, the MemoryKiller will check the
-  RSS of the Sidekiq process that executed the job. If the RSS of the Sidekiq
-  process (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_MAX_RSS, a
-  delayed shutdown is triggered. The default value for Omnibus packages is set
+- `SIDEKIQ_DAEMON_MEMORY_KILLER`: defaults to 0. When set to 1, the MemoryKiller
+  works in daemon mode. Otherwise, the MemoryKiller works in legacy mode.
+
+  In legacy mode, the MemoryKiller checks the Sidekiq process RSS after each job.
+  In daemon mode, the MemoryKiller checks the Sidekiq process RSS every 3 seconds.
+
+- `SIDEKIQ_MEMORY_KILLER_MAX_RSS`: if this variable is set, and its value is greater
+  than 0, the MemoryKiller is enabled. Otherwise the MemoryKiller is disabled.
+
+  In legacy mode, SIDEKIQ_MEMORY_KILLER_MAX_RSS acts as a hard limit. If the Sidekiq
+  process RSS (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_MAX_RSS,
+  a delayed shutdown is triggered.
+  In daemon mode, SIDEKIQ_MEMORY_KILLER_MAX_RSS acts as a soft limit. The RSS
+  of the Sidekiq process is allowed to exceed SIDEKIQ_MEMORY_KILLER_MAX_RSS for
+  a limited time.
+
+  The default value for Omnibus packages is set
   [in the omnibus-gitlab repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb).
+
+- `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS`: only for daemon mode. If the Sidekiq
+  process RSS (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS,
+  a delayed shutdown is triggered.
+
+- `SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME`: only for daemon mode. Defaults to
+  900 seconds (15 minutes). As long as the RSS of the Sidekiq process is under
+  SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS, it may exceed SIDEKIQ_MEMORY_KILLER_MAX_RSS
+  for 15 minutes. After that, a delayed shutdown is triggered.
+
+- `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL`: only for daemon mode. Defaults to
+  3 seconds. The daemon checks the Sidekiq process RSS every 3 seconds. If the RSS
+  is out of range, a delayed shutdown is triggered.
+
 - `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes). When
-  a shutdown is triggered, the Sidekiq process will keep working normally for
+  a shutdown is triggered, the Sidekiq process will keep working for
   up to another 15 minutes.
-- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace
-  time has expired, the MemoryKiller tells Sidekiq to stop accepting new jobs.
-  Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells
-  Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must
-  restart Sidekiq.
+
+  In legacy mode, the Sidekiq process continues accepting new jobs during the grace time.
+  In daemon mode, the Sidekiq process stops accepting new jobs during the grace time.
+
+- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. After the graceful
+  shutdown period (either it timed out or all jobs finished), the MemoryKiller tells
+  Sidekiq to shut down. If the shutdown does not finish within 30 seconds, the
+  MemoryKiller terminates Sidekiq forcefully with SIGKILL, and an external
+  supervision mechanism (e.g. Runit) must restart Sidekiq.
diff --git a/doc/development/import_export.md b/doc/development/import_export.md
index 0c343bc22e4..74cdc4c545f 100644
--- a/doc/development/import_export.md
+++ b/doc/development/import_export.md
@@ -31,7 +31,9 @@ Read through the current performance problems using the Import/Export below.
Out of memory (OOM) errors are normally caused by the [Sidekiq Memory Killer](../administration/operations/sidekiq_memory_killer.md): ```bash -SIDEKIQ_MEMORY_KILLER_MAX_RSS = 2GB in GitLab.com +SIDEKIQ_MEMORY_KILLER_MAX_RSS = 2000000 +SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS = 3000000 +SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME = 900 ``` An import status `started`, and the following sidekiq logs will signal a memory issue: diff --git a/doc/user/gitlab_com/index.md b/doc/user/gitlab_com/index.md index 2f2955f5a1c..2683f2612ae 100644 --- a/doc/user/gitlab_com/index.md +++ b/doc/user/gitlab_com/index.md @@ -177,11 +177,16 @@ sentry_dsn = "X" GitLab.com runs [Sidekiq][sidekiq] with arguments `--timeout=4 --concurrency=4` and the following environment variables: -| Setting | GitLab.com | Default | -|-------- |----------- |-------- | -| `SIDEKIQ_MEMORY_KILLER_MAX_RSS` | `1000000` | `2000000` | -| `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL` | `SIGKILL` | - | -| `SIDEKIQ_LOG_ARGUMENTS` | `1` | - | +| Setting | GitLab.com | Default | +|-------- |----------- |-------- | +| `SIDEKIQ_DAEMON_MEMORY_KILLER` | - | - | +| `SIDEKIQ_MEMORY_KILLER_MAX_RSS` | `16000000` | `2000000` | +| `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS` | - | - | +| `SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME` | - | `900` | +| `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL` | - | `3` | +| `SIDEKIQ_MEMORY_KILLER_GRACE_TIME` | - | `900` | +| `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT` | - | `30` | +| `SIDEKIQ_LOG_ARGUMENTS` | `1` | - | ## Cron jobs diff --git a/lib/gitlab/sidekiq_daemon/memory_killer.rb b/lib/gitlab/sidekiq_daemon/memory_killer.rb new file mode 100644 index 00000000000..854f0a079ef --- /dev/null +++ b/lib/gitlab/sidekiq_daemon/memory_killer.rb @@ -0,0 +1,222 @@ +# frozen_string_literal: true + +module Gitlab + module SidekiqDaemon + class MemoryKiller < Daemon + include ::Gitlab::Utils::StrongMemoize + + # Today 64-bit CPU support max 256T memory. It is big enough. + MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024 + # RSS below `soft_limit_rss` is considered safe + SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i + # RSS above `hard_limit_rss` will be stopped + HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i + # RSS in range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS + GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_BALLOON_TIME', 15 * 60).to_i + # Check RSS every CHECK_INTERVAL_SECONDS, minimum 2 seconds + CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max + # Give Sidekiq up to 15 minutes of grace time to allow existing jobs to finish after exceeding the limit + GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i + # Give Sidekiq up to 30 seconds to interrupt all running jobs and shutdown + SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i + + def initialize + super + + @enabled = true + end + + private + + def start_working + Sidekiq.logger.info( + class: self.class.to_s, + action: 'start', + pid: pid, + message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon' + ) + + while enabled? + begin + restart_sidekiq unless rss_within_range? 
+ sleep(CHECK_INTERVAL_SECONDS) + rescue => e + log_exception(e, __method__) + rescue Exception => e # rubocop:disable Lint/RescueException + log_exception(e, __method__ ) + raise e + end + end + ensure + Sidekiq.logger.warn( + class: self.class.to_s, + action: 'stop', + pid: pid, + message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon' + ) + end + + def log_exception(exception, method) + Sidekiq.logger.warn( + class: self.class.to_s, + pid: pid, + message: "Exception from #{method}: #{exception.message}" + ) + end + + def stop_working + @enabled = false + end + + def enabled? + @enabled + end + + def restart_sidekiq + # Tell Sidekiq to stop fetching new jobs + # We first SIGNAL and then wait given time + # We also monitor a number of running jobs and allow to restart early + signal_and_wait(GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs') + return unless enabled? + + # Tell sidekiq to restart itself + signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTERM', 'gracefully shut down') + return unless enabled? + + # Ideally we should never reach this condition + # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't + # Kill the whole pgroup, so we can be sure no children are left behind + # Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL + signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die') + end + + def rss_within_range? + current_rss = nil + deadline = Time.now + GRACE_BALLOON_SECONDS.seconds + loop do + return true unless enabled? + + current_rss = get_rss + + # RSS go above hard limit should trigger forcible shutdown right away + break if current_rss > hard_limit_rss + + # RSS go below the soft limit + return true if current_rss < soft_limit_rss + + # RSS did not go below the soft limit within deadline, restart + break if Time.now > deadline + + sleep(CHECK_INTERVAL_SECONDS) + end + + log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss) + + false + end + + def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss) + Sidekiq.logger.warn( + class: self.class.to_s, + pid: pid, + message: 'Sidekiq worker rss out of range', + reason: out_of_range_description(current_rss, hard_limit_rss, soft_limit_rss) + ) + end + + def out_of_range_description(rss, hard_limit, soft_limit) + if rss > hard_limit + "current_rss(#{rss}) > hard_limit_rss(#{hard_limit})" + else + "current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})" + end + end + + def get_rss + output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}), Rails.root.to_s) + return 0 unless status&.zero? + + output.to_i + end + + def soft_limit_rss + SOFT_LIMIT_RSS_KB + rss_increase_by_jobs + end + + def hard_limit_rss + HARD_LIMIT_RSS_KB + end + + def signal_and_wait(time, signal, explanation) + Sidekiq.logger.warn( + class: self.class.to_s, + pid: pid, + message: "sending Sidekiq worker #{signal} (#{explanation}). Then wait at most #{time} seconds." + ) + Process.kill(signal, pid) + + deadline = Time.now + time + + # we try to finish as early as all jobs finished + # so we retest that in loop + sleep(CHECK_INTERVAL_SECONDS) while enabled? && any_jobs? 
&& Time.now < deadline + end + + def signal_pgroup(time, signal, explanation) + if Process.getpgrp == pid + pid_or_pgrp_str = 'PGRP' + pid_to_signal = 0 + else + pid_or_pgrp_str = 'PID' + pid_to_signal = pid + end + + Sidekiq.logger.warn( + class: self.class.to_s, + signal: signal, + pid: pid, + message: "waiting #{time} seconds before sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})" + ) + sleep(time) + + Sidekiq.logger.warn( + class: self.class.to_s, + signal: signal, + pid: pid, + message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})" + ) + Process.kill(signal, pid_to_signal) + end + + def rss_increase_by_jobs + Gitlab::SidekiqDaemon::Monitor.instance.jobs.sum do |job| # rubocop:disable CodeReuse/ActiveRecord + rss_increase_by_job(job) + end + end + + def rss_increase_by_job(job) + memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i + max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', MAX_MEMORY_KB).to_i + + return 0 if memory_growth_kb.zero? + + time_elapsed = Time.now.to_i - job[:started_at] + [memory_growth_kb * time_elapsed, max_memory_growth_kb].min + end + + def get_job_options(job, key, default) + job[:worker_class].sidekiq_options.fetch(key, default) + rescue + default + end + + def pid + Process.pid + end + + def any_jobs? + Gitlab::SidekiqDaemon::Monitor.instance.jobs.any? + end + end + end +end diff --git a/lib/gitlab/sidekiq_daemon/monitor.rb b/lib/gitlab/sidekiq_daemon/monitor.rb index bbfca130425..2b82adda430 100644 --- a/lib/gitlab/sidekiq_daemon/monitor.rb +++ b/lib/gitlab/sidekiq_daemon/monitor.rb @@ -14,19 +14,19 @@ module Gitlab # that should not be caught by application CancelledError = Class.new(Exception) # rubocop:disable Lint/InheritException - attr_reader :jobs_thread + attr_reader :jobs attr_reader :jobs_mutex def initialize super - @jobs_thread = {} + @jobs = {} @jobs_mutex = Mutex.new end - def within_job(jid, queue) + def within_job(worker_class, jid, queue) jobs_mutex.synchronize do - jobs_thread[jid] = Thread.current + jobs[jid] = { worker_class: worker_class, thread: Thread.current, started_at: Time.now.to_i } end if cancelled?(jid) @@ -43,7 +43,7 @@ module Gitlab yield ensure jobs_mutex.synchronize do - jobs_thread.delete(jid) + jobs.delete(jid) end end @@ -62,23 +62,27 @@ module Gitlab private def start_working - Sidekiq.logger.info( - class: self.class.to_s, - action: 'start', - message: 'Starting Monitor Daemon' - ) + return unless notification_channel_enabled? - while enabled? - process_messages - sleep(RECONNECT_TIME) - end + begin + Sidekiq.logger.info( + class: self.class.to_s, + action: 'start', + message: 'Starting Monitor Daemon' + ) - ensure - Sidekiq.logger.warn( - class: self.class.to_s, - action: 'stop', - message: 'Stopping Monitor Daemon' - ) + while enabled? + process_messages + sleep(RECONNECT_TIME) + end + + ensure + Sidekiq.logger.warn( + class: self.class.to_s, + action: 'stop', + message: 'Stopping Monitor Daemon' + ) + end end def stop_working @@ -156,7 +160,7 @@ module Gitlab # This is why it passes thread in block, # to ensure that we do process this thread def find_thread_unsafe(jid) - jobs_thread[jid] + jobs.dig(jid, :thread) end def find_thread_with_lock(jid) @@ -179,6 +183,10 @@ module Gitlab def self.cancel_job_key(jid) "sidekiq:cancel:#{jid}" end + + def notification_channel_enabled? + ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero? 
+ end end end end diff --git a/lib/gitlab/sidekiq_middleware/monitor.rb b/lib/gitlab/sidekiq_middleware/monitor.rb index 00965bf5506..ed825dbfd60 100644 --- a/lib/gitlab/sidekiq_middleware/monitor.rb +++ b/lib/gitlab/sidekiq_middleware/monitor.rb @@ -4,7 +4,7 @@ module Gitlab module SidekiqMiddleware class Monitor def call(worker, job, queue) - Gitlab::SidekiqDaemon::Monitor.instance.within_job(job['jid'], queue) do + Gitlab::SidekiqDaemon::Monitor.instance.within_job(worker.class, job['jid'], queue) do yield end rescue Gitlab::SidekiqDaemon::Monitor::CancelledError diff --git a/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb b/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb new file mode 100644 index 00000000000..6280b9bd66e --- /dev/null +++ b/spec/lib/gitlab/sidekiq_daemon/memory_killer_spec.rb @@ -0,0 +1,424 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe Gitlab::SidekiqDaemon::MemoryKiller do + let(:memory_killer) { described_class.new } + let(:pid) { 12345 } + + before do + allow(memory_killer).to receive(:pid).and_return(pid) + allow(Sidekiq.logger).to receive(:info) + allow(Sidekiq.logger).to receive(:warn) + end + + describe '#start_working' do + subject { memory_killer.send(:start_working) } + + before do + # let enabled? return 3 times: true, true, false + allow(memory_killer).to receive(:enabled?).and_return(true, true, false) + end + + context 'when structured logging is used' do + it 'logs start message once' do + expect(Sidekiq.logger).to receive(:info).once + .with( + class: described_class.to_s, + action: 'start', + pid: pid, + message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon') + + subject + end + + it 'logs StandardError message twice' do + expect(Sidekiq.logger).to receive(:warn).twice + .with( + class: described_class.to_s, + pid: pid, + message: "Exception from start_working: My Exception") + + expect(memory_killer).to receive(:rss_within_range?).twice.and_raise(StandardError, 'My Exception') + + expect { subject }.not_to raise_exception + end + + it 'logs exception message once and raise execption and log stop message' do + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + pid: pid, + message: "Exception from start_working: My Exception") + + expect(memory_killer).to receive(:rss_within_range?).once.and_raise(Exception, 'My Exception') + + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + action: 'stop', + pid: pid, + message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon') + + expect { subject }.to raise_exception + end + + it 'logs stop message once' do + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + action: 'stop', + pid: pid, + message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon') + + subject + end + end + + it 'invoke rss_within_range? twice' do + expect(memory_killer).to receive(:rss_within_range?).twice + + subject + end + + it 'not invoke restart_sidekiq when rss in range' do + expect(memory_killer).to receive(:rss_within_range?).twice.and_return(true) + + expect(memory_killer).not_to receive(:restart_sidekiq) + + subject + end + + it 'invoke restart_sidekiq when rss not in range' do + expect(memory_killer).to receive(:rss_within_range?).at_least(:once).and_return(false) + + expect(memory_killer).to receive(:restart_sidekiq).at_least(:once) + + subject + end + end + + describe '#stop_working' do + subject { memory_killer.send(:stop_working)} + + it 'changed enable? 
to false' do + expect(memory_killer.send(:enabled?)).to be true + subject + expect(memory_killer.send(:enabled?)).to be false + end + end + + describe '#rss_within_range?' do + let(:graceful_shutdown_timeout_seconds) { 6 } + let(:shutdown_timeout_seconds) { 7 } + let(:check_interval_seconds) { 2 } + let(:grace_balloon_seconds) { 5 } + + subject { memory_killer.send(:rss_within_range?) } + + before do + stub_const("#{described_class}::GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS", graceful_shutdown_timeout_seconds) + stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds) + stub_const("#{described_class}::CHECK_INTERVAL_SECONDS", check_interval_seconds) + stub_const("#{described_class}::GRACE_BALLOON_SECONDS", grace_balloon_seconds) + allow(Process).to receive(:getpgrp).and_return(pid) + allow(Sidekiq).to receive(:options).and_return(timeout: 9) + end + + it 'return true when everything is within limit' do + expect(memory_killer).to receive(:get_rss).and_return(100) + expect(memory_killer).to receive(:soft_limit_rss).and_return(200) + expect(memory_killer).to receive(:hard_limit_rss).and_return(300) + + expect(Time).to receive(:now).and_call_original + expect(memory_killer).not_to receive(:log_rss_out_of_range) + + expect(subject).to be true + end + + it 'return false when rss exceeds hard_limit_rss' do + expect(memory_killer).to receive(:get_rss).and_return(400) + expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200) + expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300) + + expect(Time).to receive(:now).and_call_original + + expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200) + + expect(subject).to be false + end + + it 'return false when rss exceed hard_limit_rss after a while' do + expect(memory_killer).to receive(:get_rss).and_return(250, 400) + expect(memory_killer).to receive(:soft_limit_rss).at_least(:once).and_return(200) + expect(memory_killer).to receive(:hard_limit_rss).at_least(:once).and_return(300) + + expect(Time).to receive(:now).twice.and_call_original + expect(memory_killer).to receive(:sleep).with(check_interval_seconds) + + expect(memory_killer).to receive(:log_rss_out_of_range).with(400, 300, 200) + + expect(subject).to be false + end + + it 'return true when rss below soft_limit_rss after a while within GRACE_BALLOON_SECONDS' do + expect(memory_killer).to receive(:get_rss).and_return(250, 100) + expect(memory_killer).to receive(:soft_limit_rss).and_return(200, 200) + expect(memory_killer).to receive(:hard_limit_rss).and_return(300, 300) + + expect(Time).to receive(:now).twice.and_call_original + expect(memory_killer).to receive(:sleep).with(check_interval_seconds) + + expect(memory_killer).not_to receive(:log_rss_out_of_range) + + expect(subject).to be true + end + + it 'return false when rss exceed soft_limit_rss longer than GRACE_BALLOON_SECONDS' do + expect(memory_killer).to receive(:get_rss).exactly(4).times.and_return(250) + expect(memory_killer).to receive(:soft_limit_rss).exactly(5).times.and_return(200) + expect(memory_killer).to receive(:hard_limit_rss).exactly(5).times.and_return(300) + + expect(Time).to receive(:now).exactly(5).times.and_call_original + expect(memory_killer).to receive(:sleep).exactly(3).times.with(check_interval_seconds).and_call_original + + expect(memory_killer).to receive(:log_rss_out_of_range).with(250, 300, 200) + + expect(subject).to be false + end + end + + describe '#restart_sidekiq' do + let(:graceful_shutdown_timeout_seconds) { 6 } 
+ let(:shutdown_timeout_seconds) { 7 } + + subject { memory_killer.send(:restart_sidekiq) } + + before do + stub_const("#{described_class}::GRACEFUL_SHUTDOWN_TIMEOUT_SECONDS", graceful_shutdown_timeout_seconds) + stub_const("#{described_class}::SHUTDOWN_TIMEOUT_SECONDS", shutdown_timeout_seconds) + allow(Sidekiq).to receive(:options).and_return(timeout: 9) + end + + it 'send signal' do + expect(memory_killer).to receive(:signal_and_wait).with(graceful_shutdown_timeout_seconds, 'SIGTSTP', 'stop fetching new jobs').ordered + expect(memory_killer).to receive(:signal_and_wait).with(shutdown_timeout_seconds, 'SIGTERM', 'gracefully shut down').ordered + expect(memory_killer).to receive(:signal_pgroup).with(11, 'SIGKILL', 'die').ordered + + subject + end + end + + describe '#signal_and_wait' do + let(:time) { 7 } + let(:signal) { 'my-signal' } + let(:explanation) { 'my-explanation' } + let(:check_interval_seconds) { 2 } + + subject { memory_killer.send(:signal_and_wait, time, signal, explanation) } + + before do + stub_const("#{described_class}::CHECK_INTERVAL_SECONDS", check_interval_seconds) + end + + it 'send signal and return when all jobs finished' do + expect(Process).to receive(:kill).with(signal, pid).ordered + expect(Time).to receive(:now).and_call_original + + expect(memory_killer).to receive(:enabled?).and_return(true) + expect(memory_killer).to receive(:any_jobs?).and_return(false) + + expect(memory_killer).not_to receive(:sleep) + + subject + end + + it 'send signal and wait till deadline if any job not finished' do + expect(Process).to receive(:kill).with(signal, pid).ordered + expect(Time).to receive(:now).and_call_original.at_least(:once) + + expect(memory_killer).to receive(:enabled?).and_return(true).at_least(:once) + expect(memory_killer).to receive(:any_jobs?).and_return(true).at_least(:once) + + expect(memory_killer).to receive(:sleep).and_call_original.exactly(4).times + + subject + end + end + + describe '#signal_pgroup' do + let(:time) { 2 } + let(:signal) { 'my-signal' } + let(:explanation) { 'my-explanation' } + + subject { memory_killer.send(:signal_pgroup, time, signal, explanation) } + + it 'send signal to this proces if it is not group leader' do + expect(Process).to receive(:getpgrp).and_return(pid + 1) + + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + signal: signal, + pid: pid, + message: "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})") + expect(memory_killer).to receive(:sleep).with(time).ordered + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + signal: signal, + pid: pid, + message: "sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})") + expect(Process).to receive(:kill).with(signal, pid).ordered + + subject + end + + it 'send signal to whole process group as group leader' do + expect(Process).to receive(:getpgrp).and_return(pid) + + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + signal: signal, + pid: pid, + message: "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})") + expect(memory_killer).to receive(:sleep).with(time).ordered + expect(Sidekiq.logger).to receive(:warn).once + .with( + class: described_class.to_s, + signal: signal, + pid: pid, + message: "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})") + expect(Process).to receive(:kill).with(signal, 0).ordered + + subject + end + end + + describe '#log_rss_out_of_range' do + 
let(:current_rss) { 100 } + let(:soft_limit_rss) { 200 } + let(:hard_limit_rss) { 300 } + let(:reason) { 'rss out of range reason description' } + + subject { memory_killer.send(:log_rss_out_of_range, current_rss, hard_limit_rss, soft_limit_rss) } + + it 'invoke sidekiq logger warn' do + expect(memory_killer).to receive(:out_of_range_description).with(current_rss, hard_limit_rss, soft_limit_rss).and_return(reason) + expect(Sidekiq.logger).to receive(:warn) + .with( + class: described_class.to_s, + pid: pid, + message: 'Sidekiq worker rss out of range', + reason: reason) + + subject + end + end + + describe '#out_of_range_description' do + let(:hard_limit) { 300 } + let(:soft_limit) { 200 } + let(:grace_balloon_seconds) { 12 } + + subject { memory_killer.send(:out_of_range_description, rss, hard_limit, soft_limit) } + + context 'when rss > hard_limit' do + let(:rss) { 400 } + + it 'tells reason' do + expect(subject).to eq("current_rss(#{rss}) > hard_limit_rss(#{hard_limit})") + end + end + + context 'when rss <= hard_limit' do + let(:rss) { 300 } + + it 'tells reason' do + stub_const("#{described_class}::GRACE_BALLOON_SECONDS", grace_balloon_seconds) + expect(subject).to eq("current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{grace_balloon_seconds})") + end + end + end + + describe '#rss_increase_by_jobs' do + let(:running_jobs) { { id1: 'job1', id2: 'job2' } } + + subject { memory_killer.send(:rss_increase_by_jobs) } + + it 'adds up individual rss_increase_by_job' do + expect(Gitlab::SidekiqDaemon::Monitor).to receive_message_chain(:instance, :jobs).and_return(running_jobs) + expect(memory_killer).to receive(:rss_increase_by_job).and_return(11, 22) + expect(subject).to eq(33) + end + + it 'return 0 if no job' do + expect(Gitlab::SidekiqDaemon::Monitor).to receive_message_chain(:instance, :jobs).and_return({}) + expect(subject).to eq(0) + end + end + + describe '#rss_increase_by_job' do + let(:worker_class) { Chaos::SleepWorker } + let(:job) { { worker_class: worker_class, started_at: 321 } } + let(:max_memory_kb) { 100000 } + + subject { memory_killer.send(:rss_increase_by_job, job) } + + before do + stub_const("#{described_class}::MAX_MEMORY_KB", max_memory_kb) + end + + it 'return 0 if memory_growth_kb return 0' do + expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(0) + expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(0) + + expect(Time).not_to receive(:now) + expect(subject).to eq(0) + end + + it 'return time factored growth value when it does not exceed max growth limit for whilited job' do + expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(10) + expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(100) + + expect(Time).to receive(:now).and_return(323) + expect(subject).to eq(20) + end + + it 'return max growth limit when time factored growth value exceed max growth limit for whilited job' do + expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_memory_growth_kb', 0).and_return(10) + expect(memory_killer).to receive(:get_job_options).with(job, 'memory_killer_max_memory_growth_kb', max_memory_kb).and_return(100) + + expect(Time).to receive(:now).and_return(332) + expect(subject).to eq(100) + end + end + + describe '#get_job_options' do + let(:worker_class) { 
Chaos::SleepWorker } + let(:job) { { worker_class: worker_class, started_at: 321 } } + let(:key) { 'my-key' } + let(:default) { 'my-default' } + + subject { memory_killer.send(:get_job_options, job, key, default) } + + it 'return default if key is not defined' do + expect(worker_class).to receive(:sidekiq_options).and_return({ "retry" => 5 }) + + expect(subject).to eq(default) + end + + it 'return default if get StandardError when retrieve sidekiq_options' do + expect(worker_class).to receive(:sidekiq_options).and_raise(StandardError) + + expect(subject).to eq(default) + end + + it 'return right value if sidekiq_options has the key' do + expect(worker_class).to receive(:sidekiq_options).and_return({ key => 10 }) + + expect(subject).to eq(10) + end + end +end diff --git a/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb b/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb index acbb09e3542..397098ed5a4 100644 --- a/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb +++ b/spec/lib/gitlab/sidekiq_daemon/monitor_spec.rb @@ -8,12 +8,12 @@ describe Gitlab::SidekiqDaemon::Monitor do describe '#within_job' do it 'tracks thread' do blk = proc do - expect(monitor.jobs_thread['jid']).not_to be_nil + expect(monitor.jobs.dig('jid', :thread)).not_to be_nil "OK" end - expect(monitor.within_job('jid', 'queue', &blk)).to eq("OK") + expect(monitor.within_job('worker_class', 'jid', 'queue', &blk)).to eq("OK") end context 'when job is canceled' do @@ -25,19 +25,34 @@ describe Gitlab::SidekiqDaemon::Monitor do it 'does not execute a block' do expect do |blk| - monitor.within_job(jid, 'queue', &blk) + monitor.within_job('worker_class', jid, 'queue', &blk) rescue described_class::CancelledError end.not_to yield_control end it 'raises exception' do - expect { monitor.within_job(jid, 'queue') }.to raise_error( + expect { monitor.within_job('worker_class', jid, 'queue') }.to raise_error( described_class::CancelledError) end end end - describe '#start_working' do + describe '#start_working when notification channel not enabled' do + subject { monitor.send(:start_working) } + + it 'return directly' do + allow(monitor).to receive(:notification_channel_enabled?).and_return(nil) + + expect(Sidekiq.logger).not_to receive(:info) + expect(Sidekiq.logger).not_to receive(:warn) + expect(monitor).not_to receive(:enabled?) + expect(monitor).not_to receive(:process_messages) + + subject + end + end + + describe '#start_working when notification channel enabled' do subject { monitor.send(:start_working) } before do @@ -45,6 +60,7 @@ describe Gitlab::SidekiqDaemon::Monitor do # we toggle `enabled?` flag after the first call stub_const('Gitlab::SidekiqDaemon::Monitor::RECONNECT_TIME', 0) allow(monitor).to receive(:enabled?).and_return(true, false) + allow(monitor).to receive(:notification_channel_enabled?).and_return(1) allow(Sidekiq.logger).to receive(:info) allow(Sidekiq.logger).to receive(:warn) @@ -204,7 +220,7 @@ describe Gitlab::SidekiqDaemon::Monitor do let(:thread) { Thread.new { sleep 1000 } } before do - monitor.jobs_thread[jid] = thread + monitor.jobs[jid] = { worker_class: 'worker_class', thread: thread, started_at: Time.now.to_i } end after do @@ -258,4 +274,24 @@ describe Gitlab::SidekiqDaemon::Monitor do subject end end + + describe '#notification_channel_enabled?' do + subject { monitor.send(:notification_channel_enabled?) 
} + + it 'return nil when SIDEKIQ_MONITOR_WORKER is not set' do + expect(subject).to be nil + end + + it 'return nil when SIDEKIQ_MONITOR_WORKER set to 0' do + allow(ENV).to receive(:fetch).with('SIDEKIQ_MONITOR_WORKER', 0).and_return("0") + + expect(subject).to be nil + end + + it 'return 1 when SIDEKIQ_MONITOR_WORKER set to 1' do + allow(ENV).to receive(:fetch).with('SIDEKIQ_MONITOR_WORKER', 0).and_return("1") + + expect(subject).to be 1 + end + end end diff --git a/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb b/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb index 023df1a6391..398144025ea 100644 --- a/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb +++ b/spec/lib/gitlab/sidekiq_middleware/monitor_spec.rb @@ -12,7 +12,7 @@ describe Gitlab::SidekiqMiddleware::Monitor do it 'calls Gitlab::SidekiqDaemon::Monitor' do expect(Gitlab::SidekiqDaemon::Monitor.instance).to receive(:within_job) - .with('job-id', 'my-queue') + .with(anything, 'job-id', 'my-queue') .and_call_original expect { |blk| monitor.call(worker, job, queue, &blk) }.to yield_control |
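To close, a hedged usage sketch of the extended `Monitor` API that the specs above exercise. `MyWorker` and the job id are hypothetical; the three-argument `within_job` signature is the one introduced by this commit:

```ruby
# The middleware now passes the worker class through, so the memory killer
# can read that worker's sidekiq_options while the job is running.
Gitlab::SidekiqDaemon::Monitor.instance.within_job(MyWorker, 'jid-123', 'default') do
  # The monitor records worker_class, thread and started_at under this jid,
  # and removes the entry in an ensure block once the job body finishes.
  MyWorker.new.perform
end
```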