summaryrefslogtreecommitdiff
path: root/app
diff options
context:
space:
mode:
authorBob Van Landuyt <bob@vanlanduyt.co>2017-11-13 16:52:07 +0100
committerBob Van Landuyt <bob@vanlanduyt.co>2017-12-08 09:11:39 +0100
commitf1ae1e39ce6b7578c5697c977bc3b52b119301ab (patch)
tree1d01033287e4e15e505c7b8b3f69ced4e6cf21c8 /app
parent12d33b883adda7093f0f4b838532871036af3925 (diff)
downloadgitlab-ce-f1ae1e39ce6b7578c5697c977bc3b52b119301ab.tar.gz
Move the circuitbreaker check out in a separate process (bvl-circuitbreaker-process)
Moving the check out of the general requests makes sure we don't have any slowdown in the regular requests. To keep the process performing these checks small, the check is still performed inside a unicorn, but it is called from a process running on the same server. Because the checks are now done outside normal requests, we can have a simpler failure strategy: The check is now performed in the background every `circuitbreaker_check_interval`. Failures are logged in Redis. The failures are reset when the check succeeds. Per check we will try `circuitbreaker_access_retries` times within `circuitbreaker_storage_timeout` seconds. When the number of failures exceeds `circuitbreaker_failure_count_threshold`, we will block access to the storage. After `failure_reset_time` of no checks, we will clear the stored failures. This could happen when the process that performs the checks is not running.
Diffstat (limited to 'app')
-rw-r--r--app/controllers/admin/health_check_controller.rb2
-rw-r--r--app/controllers/health_controller.rb11
-rw-r--r--app/helpers/application_settings_helper.rb19
-rw-r--r--app/helpers/storage_health_helper.rb6
-rw-r--r--app/models/application_setting.rb12
-rw-r--r--app/views/admin/application_settings/_form.html.haml18
6 files changed, 26 insertions, 42 deletions
diff --git a/app/controllers/admin/health_check_controller.rb b/app/controllers/admin/health_check_controller.rb
index 65a17828feb..61247b280b3 100644
--- a/app/controllers/admin/health_check_controller.rb
+++ b/app/controllers/admin/health_check_controller.rb
@@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController
end
def reset_storage_health
- Gitlab::Git::Storage::CircuitBreaker.reset_all!
+ Gitlab::Git::Storage::FailureInfo.reset_all!
redirect_to admin_health_check_path,
notice: _('Git storage health information has been reset')
end
diff --git a/app/controllers/health_controller.rb b/app/controllers/health_controller.rb
index 98c2aaa3526..a931b456a93 100644
--- a/app/controllers/health_controller.rb
+++ b/app/controllers/health_controller.rb
@@ -1,5 +1,5 @@
class HealthController < ActionController::Base
- protect_from_forgery with: :exception
+ protect_from_forgery with: :exception, except: :storage_check
include RequiresWhitelistedMonitoringClient
CHECKS = [
@@ -23,6 +23,15 @@ class HealthController < ActionController::Base
render_check_results(results)
end
+ def storage_check
+ results = Gitlab::Git::Storage::Checker.check_all
+
+ render json: {
+ check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
+ results: results
+ }
+ end
+
private
def render_check_results(results)
diff --git a/app/helpers/application_settings_helper.rb b/app/helpers/application_settings_helper.rb
index dccde46fa33..b12ea760668 100644
--- a/app/helpers/application_settings_helper.rb
+++ b/app/helpers/application_settings_helper.rb
@@ -124,17 +124,6 @@ module ApplicationSettingsHelper
_('The number of attempts GitLab will make to access a storage.')
end
- def circuitbreaker_backoff_threshold_help_text
- _("The number of failures after which GitLab will start temporarily "\
- "disabling access to a storage shard on a host")
- end
-
- def circuitbreaker_failure_wait_time_help_text
- _("When access to a storage fails. GitLab will prevent access to the "\
- "storage for the time specified here. This allows the filesystem to "\
- "recover. Repositories on failing shards are temporarly unavailable")
- end
-
def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.")
@@ -145,6 +134,11 @@ module ApplicationSettingsHelper
"timeout error will be raised.")
end
+ def circuitbreaker_check_interval_help_text
+ _("The time in seconds between storage checks. When a previous check did "\
+ "not complete yet, GitLab will skip a check.")
+ end
+
def visible_attributes
[
:admin_notification_email,
@@ -154,10 +148,9 @@ module ApplicationSettingsHelper
:akismet_enabled,
:auto_devops_enabled,
:circuitbreaker_access_retries,
- :circuitbreaker_backoff_threshold,
+ :circuitbreaker_check_interval,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
- :circuitbreaker_failure_wait_time,
:circuitbreaker_storage_timeout,
:clientside_sentry_dsn,
:clientside_sentry_enabled,
diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb
index 4d2180f7eee..b76c1228220 100644
--- a/app/helpers/storage_health_helper.rb
+++ b/app/helpers/storage_health_helper.rb
@@ -18,16 +18,12 @@ module StorageHealthHelper
current_failures = circuit_breaker.failure_count
translation_params = { number_of_failures: current_failures,
- maximum_failures: maximum_failures,
- number_of_seconds: circuit_breaker.failure_wait_time }
+ maximum_failures: maximum_failures }
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
- elsif circuit_breaker.backing_off?
- _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
- "block access for %{number_of_seconds} seconds.") % translation_params
else
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"allow access on the next attempt.") % translation_params
diff --git a/app/models/application_setting.rb b/app/models/application_setting.rb
index 3117c98c846..253e213af81 100644
--- a/app/models/application_setting.rb
+++ b/app/models/application_setting.rb
@@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
- validates :circuitbreaker_backoff_threshold,
- :circuitbreaker_failure_count_threshold,
- :circuitbreaker_failure_wait_time,
+ validates :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
+ :circuitbreaker_check_interval,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 }
- validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
- if value.to_i >= record.circuitbreaker_failure_count_threshold
- record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
- "lower than the failure count threshold"))
- end
- end
-
validates :gitaly_timeout_default,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
diff --git a/app/views/admin/application_settings/_form.html.haml b/app/views/admin/application_settings/_form.html.haml
index a9d0503bc73..3e2dbb07a6c 100644
--- a/app/views/admin/application_settings/_form.html.haml
+++ b/app/views/admin/application_settings/_form.html.haml
@@ -546,6 +546,12 @@
%fieldset
%legend Git Storage Circuitbreaker settings
.form-group
+ = f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2'
+ .col-sm-10
+ = f.number_field :circuitbreaker_check_interval, class: 'form-control'
+ .help-block
+ = circuitbreaker_check_interval_help_text
+ .form-group
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_access_retries, class: 'form-control'
@@ -558,18 +564,6 @@
.help-block
= circuitbreaker_storage_timeout_help_text
.form-group
- = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
- .col-sm-10
- = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
- .help-block
- = circuitbreaker_backoff_threshold_help_text
- .form-group
- = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
- .col-sm-10
- = f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
- .help-block
- = circuitbreaker_failure_wait_time_help_text
- .form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'