diff options
author | Bob Van Landuyt <bob@vanlanduyt.co> | 2017-11-13 16:52:07 +0100 |
---|---|---|
committer | Bob Van Landuyt <bob@vanlanduyt.co> | 2017-12-08 09:11:39 +0100 |
commit | f1ae1e39ce6b7578c5697c977bc3b52b119301ab (patch) | |
tree | 1d01033287e4e15e505c7b8b3f69ced4e6cf21c8 /app | |
parent | 12d33b883adda7093f0f4b838532871036af3925 (diff) | |
download | gitlab-ce-f1ae1e39ce6b7578c5697c977bc3b52b119301ab.tar.gz |
Move the circuitbreaker check out in a separate processbvl-circuitbreaker-process
Moving the check out of the general requests, makes sure we don't have
any slowdown in the regular requests.
To keep the process performing this checks small, the check is still
performed inside a unicorn. But that is called from a process running
on the same server.
Because the checks are now done outside normal request, we can have a
simpler failure strategy:
The check is now performed in the background every
`circuitbreaker_check_interval`. Failures are logged in redis. The
failures are reset when the check succeeds. Per check we will try
`circuitbreaker_access_retries` times within
`circuitbreaker_storage_timeout` seconds.
When the number of failures exceeds
`circuitbreaker_failure_count_threshold`, we will block access to the
storage.
After `failure_reset_time` of no checks, we will clear the stored
failures. This could happen when the process that performs the checks
is not running.
Diffstat (limited to 'app')
-rw-r--r-- | app/controllers/admin/health_check_controller.rb | 2 | ||||
-rw-r--r-- | app/controllers/health_controller.rb | 11 | ||||
-rw-r--r-- | app/helpers/application_settings_helper.rb | 19 | ||||
-rw-r--r-- | app/helpers/storage_health_helper.rb | 6 | ||||
-rw-r--r-- | app/models/application_setting.rb | 12 | ||||
-rw-r--r-- | app/views/admin/application_settings/_form.html.haml | 18 |
6 files changed, 26 insertions, 42 deletions
diff --git a/app/controllers/admin/health_check_controller.rb b/app/controllers/admin/health_check_controller.rb index 65a17828feb..61247b280b3 100644 --- a/app/controllers/admin/health_check_controller.rb +++ b/app/controllers/admin/health_check_controller.rb @@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController end def reset_storage_health - Gitlab::Git::Storage::CircuitBreaker.reset_all! + Gitlab::Git::Storage::FailureInfo.reset_all! redirect_to admin_health_check_path, notice: _('Git storage health information has been reset') end diff --git a/app/controllers/health_controller.rb b/app/controllers/health_controller.rb index 98c2aaa3526..a931b456a93 100644 --- a/app/controllers/health_controller.rb +++ b/app/controllers/health_controller.rb @@ -1,5 +1,5 @@ class HealthController < ActionController::Base - protect_from_forgery with: :exception + protect_from_forgery with: :exception, except: :storage_check include RequiresWhitelistedMonitoringClient CHECKS = [ @@ -23,6 +23,15 @@ class HealthController < ActionController::Base render_check_results(results) end + def storage_check + results = Gitlab::Git::Storage::Checker.check_all + + render json: { + check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval, + results: results + } + end + private def render_check_results(results) diff --git a/app/helpers/application_settings_helper.rb b/app/helpers/application_settings_helper.rb index dccde46fa33..b12ea760668 100644 --- a/app/helpers/application_settings_helper.rb +++ b/app/helpers/application_settings_helper.rb @@ -124,17 +124,6 @@ module ApplicationSettingsHelper _('The number of attempts GitLab will make to access a storage.') end - def circuitbreaker_backoff_threshold_help_text - _("The number of failures after which GitLab will start temporarily "\ - "disabling access to a storage shard on a host") - end - - def circuitbreaker_failure_wait_time_help_text - _("When access to a storage fails. GitLab will prevent access to the "\ - "storage for the time specified here. This allows the filesystem to "\ - "recover. Repositories on failing shards are temporarly unavailable") - end - def circuitbreaker_failure_reset_time_help_text _("The time in seconds GitLab will keep failure information. When no "\ "failures occur during this time, information about the mount is reset.") @@ -145,6 +134,11 @@ module ApplicationSettingsHelper "timeout error will be raised.") end + def circuitbreaker_check_interval_help_text + _("The time in seconds between storage checks. When a previous check did "\ + "complete yet, GitLab will skip a check.") + end + def visible_attributes [ :admin_notification_email, @@ -154,10 +148,9 @@ module ApplicationSettingsHelper :akismet_enabled, :auto_devops_enabled, :circuitbreaker_access_retries, - :circuitbreaker_backoff_threshold, + :circuitbreaker_check_interval, :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_reset_time, - :circuitbreaker_failure_wait_time, :circuitbreaker_storage_timeout, :clientside_sentry_dsn, :clientside_sentry_enabled, diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb index 4d2180f7eee..b76c1228220 100644 --- a/app/helpers/storage_health_helper.rb +++ b/app/helpers/storage_health_helper.rb @@ -18,16 +18,12 @@ module StorageHealthHelper current_failures = circuit_breaker.failure_count translation_params = { number_of_failures: current_failures, - maximum_failures: maximum_failures, - number_of_seconds: circuit_breaker.failure_wait_time } + maximum_failures: maximum_failures } if circuit_breaker.circuit_broken? s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ "retry automatically. Reset storage information when the problem is "\ "resolved.") % translation_params - elsif circuit_breaker.backing_off? - _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ - "block access for %{number_of_seconds} seconds.") % translation_params else _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ "allow access on the next attempt.") % translation_params diff --git a/app/models/application_setting.rb b/app/models/application_setting.rb index 3117c98c846..253e213af81 100644 --- a/app/models/application_setting.rb +++ b/app/models/application_setting.rb @@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base presence: true, numericality: { greater_than_or_equal_to: 0 } - validates :circuitbreaker_backoff_threshold, - :circuitbreaker_failure_count_threshold, - :circuitbreaker_failure_wait_time, + validates :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_reset_time, :circuitbreaker_storage_timeout, + :circuitbreaker_check_interval, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 } @@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 1 } - validates_each :circuitbreaker_backoff_threshold do |record, attr, value| - if value.to_i >= record.circuitbreaker_failure_count_threshold - record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\ - "lower than the failure count threshold")) - end - end - validates :gitaly_timeout_default, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 } diff --git a/app/views/admin/application_settings/_form.html.haml b/app/views/admin/application_settings/_form.html.haml index a9d0503bc73..3e2dbb07a6c 100644 --- a/app/views/admin/application_settings/_form.html.haml +++ b/app/views/admin/application_settings/_form.html.haml @@ -546,6 +546,12 @@ %fieldset %legend Git Storage Circuitbreaker settings .form-group + = f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_check_interval, class: 'form-control' + .help-block + = circuitbreaker_check_interval_help_text + .form-group = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2' .col-sm-10 = f.number_field :circuitbreaker_access_retries, class: 'form-control' @@ -558,18 +564,6 @@ .help-block = circuitbreaker_storage_timeout_help_text .form-group - = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2' - .col-sm-10 - = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control' - .help-block - = circuitbreaker_backoff_threshold_help_text - .form-group - = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2' - .col-sm-10 - = f.number_field :circuitbreaker_failure_wait_time, class: 'form-control' - .help-block - = circuitbreaker_failure_wait_time_help_text - .form-group = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2' .col-sm-10 = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control' |