From eb28364b68ce73a6dca254de8fb0ce0eeee37283 Mon Sep 17 00:00:00 2001 From: Douwe Maan Date: Tue, 17 Oct 2017 10:43:05 +0000 Subject: Merge branch 'bvl-circuitbreaker-improvements' into 'master' Make the circuitbreaker configurable at runtime See merge request gitlab-org/gitlab-ce!14842 --- app/helpers/application_settings_helper.rb | 32 ++++++++++++ app/models/application_setting.rb | 7 +++ .../admin/application_settings/_form.html.haml | 26 ++++++++++ .../unreleased/bvl-circuitbreaker-improvements.yml | 5 ++ config/gitlab.yml.example | 8 --- config/initializers/1_settings.rb | 11 ----- ...t_breaker_properties_to_application_settings.rb | 27 ++++++++++ db/schema.rb | 6 ++- doc/administration/img/circuitbreaker_config.png | Bin 0 -> 213210 bytes doc/administration/repository_storage_paths.md | 55 ++++----------------- doc/api/settings.md | 4 ++ lib/gitlab/git/storage/circuit_breaker.rb | 12 ++--- lib/gitlab/git/storage/circuit_breaker_settings.rb | 29 +++++++++++ lib/gitlab/git/storage/null_circuit_breaker.rb | 13 ++--- spec/features/admin/admin_health_check_spec.rb | 4 +- spec/initializers/settings_spec.rb | 20 -------- .../lib/gitlab/git/storage/circuit_breaker_spec.rb | 54 +++++++++++++------- .../git/storage/null_circuit_breaker_spec.rb | 4 ++ spec/models/application_setting_spec.rb | 13 +++++ spec/requests/api/settings_spec.rb | 5 +- spec/support/stub_configuration.rb | 4 -- 21 files changed, 212 insertions(+), 127 deletions(-) create mode 100644 changelogs/unreleased/bvl-circuitbreaker-improvements.yml create mode 100644 db/migrate/20171012101043_add_circuit_breaker_properties_to_application_settings.rb create mode 100644 doc/administration/img/circuitbreaker_config.png create mode 100644 lib/gitlab/git/storage/circuit_breaker_settings.rb diff --git a/app/helpers/application_settings_helper.rb b/app/helpers/application_settings_helper.rb index 7bd34df5c95..1ee8911bb1a 100644 --- a/app/helpers/application_settings_helper.rb +++ b/app/helpers/application_settings_helper.rb @@ -108,6 +108,34 @@ module ApplicationSettingsHelper options_for_select(Sidekiq::Queue.all.map(&:name), @application_setting.sidekiq_throttling_queues) end + def circuitbreaker_failure_count_help_text + health_link = link_to(s_('AdminHealthPageLink|health page'), admin_health_check_path) + api_link = link_to(s_('CircuitBreakerApiLink|circuitbreaker api'), help_page_path("api/repository_storage_health")) + message = _("The number of failures of after which GitLab will completely "\ + "prevent access to the storage. The number of failures can be "\ + "reset in the admin interface: %{link_to_health_page} or using "\ + "the %{api_documentation_link}.") + message = message % { link_to_health_page: health_link, api_documentation_link: api_link } + + message.html_safe + end + + def circuitbreaker_failure_wait_time_help_text + _("When access to a storage fails. GitLab will prevent access to the "\ + "storage for the time specified here. This allows the filesystem to "\ + "recover. Repositories on failing shards are temporarly unavailable") + end + + def circuitbreaker_failure_reset_time_help_text + _("The time in seconds GitLab will keep failure information. When no "\ + "failures occur during this time, information about the mount is reset.") + end + + def circuitbreaker_storage_timeout_help_text + _("The time in seconds GitLab will try to access storage. After this time a "\ + "timeout error will be raised.") + end + def visible_attributes [ :admin_notification_email, @@ -116,6 +144,10 @@ module ApplicationSettingsHelper :akismet_api_key, :akismet_enabled, :auto_devops_enabled, + :circuitbreaker_failure_count_threshold, + :circuitbreaker_failure_reset_time, + :circuitbreaker_failure_wait_time, + :circuitbreaker_storage_timeout, :clientside_sentry_dsn, :clientside_sentry_enabled, :container_registry_token_expire_delay, diff --git a/app/models/application_setting.rb b/app/models/application_setting.rb index 5a247566432..4dda276bb41 100644 --- a/app/models/application_setting.rb +++ b/app/models/application_setting.rb @@ -153,6 +153,13 @@ class ApplicationSetting < ActiveRecord::Base presence: true, numericality: { greater_than_or_equal_to: 0 } + validates :circuitbreaker_failure_count_threshold, + :circuitbreaker_failure_wait_time, + :circuitbreaker_failure_reset_time, + :circuitbreaker_storage_timeout, + presence: true, + numericality: { only_integer: true, greater_than_or_equal_to: 0 } + SUPPORTED_KEY_TYPES.each do |type| validates :"#{type}_key_restriction", presence: true, key_restriction: { type: type } end diff --git a/app/views/admin/application_settings/_form.html.haml b/app/views/admin/application_settings/_form.html.haml index dbaed1d09fb..2b23af9212e 100644 --- a/app/views/admin/application_settings/_form.html.haml +++ b/app/views/admin/application_settings/_form.html.haml @@ -530,6 +530,32 @@ = succeed "." do = link_to "repository storages documentation", help_page_path("administration/repository_storages") + %fieldset + %legend Git Storage Circuitbreaker settings + .form-group + = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control' + .help-block + = circuitbreaker_failure_count_help_text + .form-group + = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_failure_wait_time, class: 'form-control' + .help-block + = circuitbreaker_failure_wait_time_help_text + .form-group + = f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_failure_reset_time, class: 'form-control' + .help-block + = circuitbreaker_failure_reset_time_help_text + .form-group + = f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_storage_timeout, class: 'form-control' + .help-block + = circuitbreaker_storage_timeout_help_text %fieldset %legend Repository Checks diff --git a/changelogs/unreleased/bvl-circuitbreaker-improvements.yml b/changelogs/unreleased/bvl-circuitbreaker-improvements.yml new file mode 100644 index 00000000000..15cbd5592e9 --- /dev/null +++ b/changelogs/unreleased/bvl-circuitbreaker-improvements.yml @@ -0,0 +1,5 @@ +--- +title: Store circuitbreaker settings in the database instead of config +merge_request: 14842 +author: +type: changed diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 1069c7be5f0..4bfa5be0136 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -522,11 +522,6 @@ production: &base path: /home/git/repositories/ gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port) # gitaly_token: 'special token' # Optional: override global gitaly.token for this storage. - failure_count_threshold: 10 # number of failures before stopping attempts - failure_wait_time: 30 # Seconds after an access failure before allowing access again - failure_reset_time: 1800 # Time in seconds to expire failures - storage_timeout: 30 # Time in seconds to wait before aborting a storage access attempt - ## Backup settings backup: @@ -659,9 +654,6 @@ test: default: path: tmp/tests/repositories/ gitaly_address: unix:tmp/tests/gitaly/gitaly.socket - failure_count_threshold: 999999 - failure_wait_time: 0 - storage_timeout: 30 broken: path: tmp/tests/non-existent-repositories gitaly_address: unix:tmp/tests/gitaly/gitaly.socket diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index b790df565c6..12694f8016f 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -455,17 +455,6 @@ Settings.repositories.storages.each do |key, storage| # Expand relative paths storage['path'] = Settings.absolute(storage['path']) - # Set failure defaults - storage['failure_count_threshold'] ||= 10 - storage['failure_wait_time'] ||= 30 - storage['failure_reset_time'] ||= 1800 - storage['storage_timeout'] ||= 5 - # Set turn strings into numbers - storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i - storage['failure_wait_time'] = storage['failure_wait_time'].to_i - storage['failure_reset_time'] = storage['failure_reset_time'].to_i - # We might want to have a timeout shorter than 1 second. - storage['storage_timeout'] = storage['storage_timeout'].to_f Settings.repositories.storages[key] = storage end diff --git a/db/migrate/20171012101043_add_circuit_breaker_properties_to_application_settings.rb b/db/migrate/20171012101043_add_circuit_breaker_properties_to_application_settings.rb new file mode 100644 index 00000000000..bcf7dbd8e64 --- /dev/null +++ b/db/migrate/20171012101043_add_circuit_breaker_properties_to_application_settings.rb @@ -0,0 +1,27 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. + +class AddCircuitBreakerPropertiesToApplicationSettings < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + DOWNTIME = false + + def change + add_column :application_settings, + :circuitbreaker_failure_count_threshold, + :integer, + default: 160 + add_column :application_settings, + :circuitbreaker_failure_wait_time, + :integer, + default: 30 + add_column :application_settings, + :circuitbreaker_failure_reset_time, + :integer, + default: 1800 + add_column :application_settings, + :circuitbreaker_storage_timeout, + :integer, + default: 30 + end +end diff --git a/db/schema.rb b/db/schema.rb index aac37b6b455..d64e0fbf60a 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -11,7 +11,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 20171006091000) do +ActiveRecord::Schema.define(version: 20171012101043) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -134,6 +134,10 @@ ActiveRecord::Schema.define(version: 20171006091000) do t.boolean "hashed_storage_enabled", default: false, null: false t.boolean "project_export_enabled", default: true, null: false t.boolean "auto_devops_enabled", default: false, null: false + t.integer "circuitbreaker_failure_count_threshold", default: 160 + t.integer "circuitbreaker_failure_wait_time", default: 30 + t.integer "circuitbreaker_failure_reset_time", default: 1800 + t.integer "circuitbreaker_storage_timeout", default: 30 end create_table "audit_events", force: :cascade do |t| diff --git a/doc/administration/img/circuitbreaker_config.png b/doc/administration/img/circuitbreaker_config.png new file mode 100644 index 00000000000..9250d38297c Binary files /dev/null and b/doc/administration/img/circuitbreaker_config.png differ diff --git a/doc/administration/repository_storage_paths.md b/doc/administration/repository_storage_paths.md index 624a908b3a3..efcabd69822 100644 --- a/doc/administration/repository_storage_paths.md +++ b/doc/administration/repository_storage_paths.md @@ -105,61 +105,26 @@ When GitLab detects access to the repositories storage fails repeatedly, it can gracefully prevent attempts to access the storage. This might be useful when the repositories are stored somewhere on the network. -The configuration could look as follows: +This can be configured from the admin interface: -**For Omnibus installations** - -1. Edit `/etc/gitlab/gitlab.rb`: - - ```ruby - git_data_dirs({ - "default" => { - "path" => "/mnt/nfs-01/git-data", - "failure_count_threshold" => 10, - "failure_wait_time" => 30, - "failure_reset_time" => 1800, - "storage_timeout" => 5 - } - }) - ``` - -1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect. - ---- - -**For installations from source** - -1. Edit `config/gitlab.yml`: - - ```yaml - repositories: - storages: # You must have at least a `default` storage path. - default: - path: /home/git/repositories/ - failure_count_threshold: 10 # number of failures before stopping attempts - failure_wait_time: 30 # Seconds after last access failure before trying again - failure_reset_time: 1800 # Time in seconds to expire failures - storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt - ``` - -1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect. +![circuitbreaker configuration](img/circuitbreaker_config.png) -**`failure_count_threshold`:** The number of failures of after which GitLab will +**Maximum git storage failures:** The number of failures of after which GitLab will completely prevent access to the storage. The number of failures can be reset in the admin interface: `https://gitlab.example.com/admin/health_check` or using the [api](../api/repository_storage_health.md) to allow access to the storage again. -**`failure_wait_time`:** When access to a storage fails. GitLab will prevent -access to the storage for the time specified here. This allows the filesystem to -recover without. +**Seconds to wait after a storage failure:** When access to a storage fails. GitLab +will prevent access to the storage for the time specified here. This allows the +filesystem to recover. -**`failure_reset_time`:** The time in seconds GitLab will keep failure -information. When no failures occur during this time, information about the +**Seconds before reseting failure information:** The time in seconds GitLab will +keep failure information. When no failures occur during this time, information about the mount is reset. -**`storage_timeout`:** The time in seconds GitLab will try to access storage. -After this time a timeout error will be raised. +**Seconds to wait for a storage access attempt:** The time in seconds GitLab will +try to access storage. After this time a timeout error will be raised. When storage failures occur, this will be visible in the admin interface like this: diff --git a/doc/api/settings.md b/doc/api/settings.md index b78f1252108..5ec96b8a6a2 100644 --- a/doc/api/settings.md +++ b/doc/api/settings.md @@ -96,6 +96,10 @@ PUT /application/settings | `dsa_key_restriction` | integer | no | The minimum allowed bit length of an uploaded DSA key. Default is `0` (no restriction). `-1` disables DSA keys. | `ecdsa_key_restriction` | integer | no | The minimum allowed curve size (in bits) of an uploaded ECDSA key. Default is `0` (no restriction). `-1` disables ECDSA keys. | `ed25519_key_restriction` | integer | no | The minimum allowed curve size (in bits) of an uploaded ED25519 key. Default is `0` (no restriction). `-1` disables ED25519 keys. +| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. | +| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. | +| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. | +| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt | ```bash curl --request PUT --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/application/settings?signup_enabled=false&default_project_visibility=internal diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb index ba56aa2baf7..0456ad9a1f3 100644 --- a/lib/gitlab/git/storage/circuit_breaker.rb +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -2,15 +2,13 @@ module Gitlab module Git module Storage class CircuitBreaker + include CircuitBreakerSettings + FailureInfo = Struct.new(:last_failure, :failure_count) attr_reader :storage, :hostname, - :storage_path, - :failure_count_threshold, - :failure_wait_time, - :failure_reset_time, - :storage_timeout + :storage_path delegate :last_failure, :failure_count, to: :failure_info @@ -53,10 +51,6 @@ module Gitlab config = Gitlab.config.repositories.storages[@storage] @storage_path = config['path'] - @failure_count_threshold = config['failure_count_threshold'] - @failure_wait_time = config['failure_wait_time'] - @failure_reset_time = config['failure_reset_time'] - @storage_timeout = config['storage_timeout'] end def perform diff --git a/lib/gitlab/git/storage/circuit_breaker_settings.rb b/lib/gitlab/git/storage/circuit_breaker_settings.rb new file mode 100644 index 00000000000..d2313fe7c1b --- /dev/null +++ b/lib/gitlab/git/storage/circuit_breaker_settings.rb @@ -0,0 +1,29 @@ +module Gitlab + module Git + module Storage + module CircuitBreakerSettings + def failure_count_threshold + application_settings.circuitbreaker_failure_count_threshold + end + + def failure_wait_time + application_settings.circuitbreaker_failure_wait_time + end + + def failure_reset_time + application_settings.circuitbreaker_failure_reset_time + end + + def storage_timeout + application_settings.circuitbreaker_storage_timeout + end + + private + + def application_settings + Gitlab::CurrentSettings.current_application_settings + end + end + end + end +end diff --git a/lib/gitlab/git/storage/null_circuit_breaker.rb b/lib/gitlab/git/storage/null_circuit_breaker.rb index 297c043d054..60c6791a7e4 100644 --- a/lib/gitlab/git/storage/null_circuit_breaker.rb +++ b/lib/gitlab/git/storage/null_circuit_breaker.rb @@ -2,15 +2,14 @@ module Gitlab module Git module Storage class NullCircuitBreaker + include CircuitBreakerSettings + # These will have actual values attr_reader :storage, :hostname # These will always have nil values - attr_reader :storage_path, - :failure_wait_time, - :failure_reset_time, - :storage_timeout + attr_reader :storage_path def initialize(storage, hostname, error: nil) @storage = storage @@ -26,16 +25,12 @@ module Gitlab !!@error end - def failure_count_threshold - 1 - end - def last_failure circuit_broken? ? Time.now : nil end def failure_count - circuit_broken? ? 1 : 0 + circuit_broken? ? failure_count_threshold : 0 end def failure_info diff --git a/spec/features/admin/admin_health_check_spec.rb b/spec/features/admin/admin_health_check_spec.rb index 09e6965849a..4430fc15501 100644 --- a/spec/features/admin/admin_health_check_spec.rb +++ b/spec/features/admin/admin_health_check_spec.rb @@ -65,9 +65,11 @@ feature "Admin Health Check", :feature, :broken_storage do it 'shows storage failure information' do hostname = Gitlab::Environment.hostname + maximum_failures = Gitlab::CurrentSettings.current_application_settings + .circuitbreaker_failure_count_threshold expect(page).to have_content('broken: failed storage access attempt on host:') - expect(page).to have_content("#{hostname}: 1 of 10 failures.") + expect(page).to have_content("#{hostname}: 1 of #{maximum_failures} failures.") end it 'allows resetting storage failures' do diff --git a/spec/initializers/settings_spec.rb b/spec/initializers/settings_spec.rb index 9a974e70e8c..a11824d0ac5 100644 --- a/spec/initializers/settings_spec.rb +++ b/spec/initializers/settings_spec.rb @@ -18,26 +18,6 @@ describe Settings do end end - describe '#repositories' do - it 'assigns the default failure attributes' do - repository_settings = Gitlab.config.repositories.storages['broken'] - - expect(repository_settings['failure_count_threshold']).to eq(10) - expect(repository_settings['failure_wait_time']).to eq(30) - expect(repository_settings['failure_reset_time']).to eq(1800) - expect(repository_settings['storage_timeout']).to eq(5) - end - - it 'can be accessed with dot syntax all the way down' do - expect(Gitlab.config.repositories.storages.broken.failure_count_threshold).to eq(10) - end - - it 'can be accessed in a very specific way that breaks without reassigning each element with Settingslogic' do - storage_settings = Gitlab.config.repositories.storages['broken'] - expect(storage_settings.failure_count_threshold).to eq(10) - end - end - describe '#host_without_www' do context 'URL with protocol' do it 'returns the host' do diff --git a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb index 4edd360d11d..c8d532df059 100644 --- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb @@ -10,18 +10,10 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: # Override test-settings for the circuitbreaker with something more realistic # for these specs. stub_storage_settings('default' => { - 'path' => TestEnv.repos_path, - 'failure_count_threshold' => 10, - 'failure_wait_time' => 30, - 'failure_reset_time' => 1800, - 'storage_timeout' => 5 + 'path' => TestEnv.repos_path }, 'broken' => { - 'path' => 'tmp/tests/non-existent-repositories', - 'failure_count_threshold' => 10, - 'failure_wait_time' => 30, - 'failure_reset_time' => 1800, - 'storage_timeout' => 5 + 'path' => 'tmp/tests/non-existent-repositories' }, 'nopath' => { 'path' => nil } ) @@ -79,10 +71,39 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(circuit_breaker.hostname).to eq(hostname) expect(circuit_breaker.storage).to eq('default') expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path) - expect(circuit_breaker.failure_count_threshold).to eq(10) - expect(circuit_breaker.failure_wait_time).to eq(30) - expect(circuit_breaker.failure_reset_time).to eq(1800) - expect(circuit_breaker.storage_timeout).to eq(5) + end + end + + context 'circuitbreaker settings' do + before do + stub_application_setting(circuitbreaker_failure_count_threshold: 0, + circuitbreaker_failure_wait_time: 1, + circuitbreaker_failure_reset_time: 2, + circuitbreaker_storage_timeout: 3) + end + + describe '#failure_count_threshold' do + it 'reads the value from settings' do + expect(circuit_breaker.failure_count_threshold).to eq(0) + end + end + + describe '#failure_wait_time' do + it 'reads the value from settings' do + expect(circuit_breaker.failure_wait_time).to eq(1) + end + end + + describe '#failure_reset_time' do + it 'reads the value from settings' do + expect(circuit_breaker.failure_reset_time).to eq(2) + end + end + + describe '#storage_timeout' do + it 'reads the value from settings' do + expect(circuit_breaker.storage_timeout).to eq(3) + end end end @@ -155,10 +176,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: context 'the `failure_wait_time` is set to 0' do before do - stub_storage_settings('default' => { - 'failure_wait_time' => 0, - 'path' => TestEnv.repos_path - }) + stub_application_setting(circuitbreaker_failure_wait_time: 0) end it 'is working even when there is a recent failure' do diff --git a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb index 0e645008c88..7ee6d2f3709 100644 --- a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb @@ -54,6 +54,10 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do end describe '#failure_count_threshold' do + before do + stub_application_setting(circuitbreaker_failure_count_threshold: 1) + end + it { expect(breaker.failure_count_threshold).to eq(1) } end diff --git a/spec/models/application_setting_spec.rb b/spec/models/application_setting_spec.rb index a7bcb4363a8..30495fd4f5e 100644 --- a/spec/models/application_setting_spec.rb +++ b/spec/models/application_setting_spec.rb @@ -114,6 +114,19 @@ describe ApplicationSetting do it { expect(setting.repository_storages).to eq(['default']) } end + context 'circuitbreaker settings' do + [:circuitbreaker_failure_count_threshold, + :circuitbreaker_failure_wait_time, + :circuitbreaker_failure_reset_time, + :circuitbreaker_storage_timeout].each do |field| + it "Validates #{field} as number" do + is_expected.to validate_numericality_of(field) + .only_integer + .is_greater_than_or_equal_to(0) + end + end + end + context 'repository storages' do before do storages = { diff --git a/spec/requests/api/settings_spec.rb b/spec/requests/api/settings_spec.rb index 0b9a4b5c3db..c24de58ee9d 100644 --- a/spec/requests/api/settings_spec.rb +++ b/spec/requests/api/settings_spec.rb @@ -23,6 +23,7 @@ describe API::Settings, 'Settings' do expect(json_response['dsa_key_restriction']).to eq(0) expect(json_response['ecdsa_key_restriction']).to eq(0) expect(json_response['ed25519_key_restriction']).to eq(0) + expect(json_response['circuitbreaker_failure_count_threshold']).not_to be_nil end end @@ -52,7 +53,8 @@ describe API::Settings, 'Settings' do rsa_key_restriction: ApplicationSetting::FORBIDDEN_KEY_VALUE, dsa_key_restriction: 2048, ecdsa_key_restriction: 384, - ed25519_key_restriction: 256 + ed25519_key_restriction: 256, + circuitbreaker_failure_wait_time: 2 expect(response).to have_http_status(200) expect(json_response['default_projects_limit']).to eq(3) @@ -73,6 +75,7 @@ describe API::Settings, 'Settings' do expect(json_response['dsa_key_restriction']).to eq(2048) expect(json_response['ecdsa_key_restriction']).to eq(384) expect(json_response['ed25519_key_restriction']).to eq(256) + expect(json_response['circuitbreaker_failure_wait_time']).to eq(2) end end diff --git a/spec/support/stub_configuration.rb b/spec/support/stub_configuration.rb index 2dfb4d4a07f..4d448a55978 100644 --- a/spec/support/stub_configuration.rb +++ b/spec/support/stub_configuration.rb @@ -43,10 +43,6 @@ module StubConfiguration messages['default'] ||= Gitlab.config.repositories.storages.default messages.each do |storage_name, storage_settings| storage_settings['path'] = TestEnv.repos_path unless storage_settings.key?('path') - storage_settings['failure_count_threshold'] ||= 10 - storage_settings['failure_wait_time'] ||= 30 - storage_settings['failure_reset_time'] ||= 1800 - storage_settings['storage_timeout'] ||= 5 end allow(Gitlab.config.repositories).to receive(:storages).and_return(Settingslogic.new(messages)) -- cgit v1.2.1 From e32712e3a613106121a2afe8ea1fb1f9dad93783 Mon Sep 17 00:00:00 2001 From: Sean McGivern Date: Tue, 24 Oct 2017 09:10:42 +0000 Subject: Merge branch 'bvl-circuitbreaker-backoff' into 'master' Circuitbreaker backoff and retries Closes #37383 and #38231 See merge request gitlab-org/gitlab-ce!14933 --- app/helpers/application_settings_helper.rb | 11 + app/helpers/storage_health_helper.rb | 5 +- app/models/application_setting.rb | 14 +- .../admin/application_settings/_form.html.haml | 30 ++- .../unreleased/bvl-circuitbreaker-backoff.yml | 6 + ...cuitbreaker_settings_to_application_settings.rb | 16 ++ db/schema.rb | 4 +- doc/administration/img/circuitbreaker_config.png | Bin 213210 -> 335073 bytes doc/administration/repository_storage_paths.md | 14 ++ doc/api/settings.md | 2 + lib/gitlab/git/storage.rb | 1 + lib/gitlab/git/storage/circuit_breaker.rb | 41 +++- lib/gitlab/git/storage/circuit_breaker_settings.rb | 8 + lib/gitlab/git/storage/forked_storage_check.rb | 13 +- lib/gitlab/git/storage/null_circuit_breaker.rb | 4 + .../lib/gitlab/git/storage/circuit_breaker_spec.rb | 261 +++++++++------------ .../git/storage/forked_storage_check_spec.rb | 15 ++ .../git/storage/null_circuit_breaker_spec.rb | 13 +- spec/models/application_setting_spec.rb | 13 +- 19 files changed, 275 insertions(+), 196 deletions(-) create mode 100644 changelogs/unreleased/bvl-circuitbreaker-backoff.yml create mode 100644 db/migrate/20171017145932_add_new_circuitbreaker_settings_to_application_settings.rb diff --git a/app/helpers/application_settings_helper.rb b/app/helpers/application_settings_helper.rb index 1ee8911bb1a..cd1ecaadb85 100644 --- a/app/helpers/application_settings_helper.rb +++ b/app/helpers/application_settings_helper.rb @@ -120,6 +120,15 @@ module ApplicationSettingsHelper message.html_safe end + def circuitbreaker_access_retries_help_text + _('The number of attempts GitLab will make to access a storage.') + end + + def circuitbreaker_backoff_threshold_help_text + _("The number of failures after which GitLab will start temporarily "\ + "disabling access to a storage shard on a host") + end + def circuitbreaker_failure_wait_time_help_text _("When access to a storage fails. GitLab will prevent access to the "\ "storage for the time specified here. This allows the filesystem to "\ @@ -144,6 +153,8 @@ module ApplicationSettingsHelper :akismet_api_key, :akismet_enabled, :auto_devops_enabled, + :circuitbreaker_access_retries, + :circuitbreaker_backoff_threshold, :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_reset_time, :circuitbreaker_failure_wait_time, diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb index 544c9efb845..4d2180f7eee 100644 --- a/app/helpers/storage_health_helper.rb +++ b/app/helpers/storage_health_helper.rb @@ -16,17 +16,16 @@ module StorageHealthHelper def message_for_circuit_breaker(circuit_breaker) maximum_failures = circuit_breaker.failure_count_threshold current_failures = circuit_breaker.failure_count - permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures translation_params = { number_of_failures: current_failures, maximum_failures: maximum_failures, number_of_seconds: circuit_breaker.failure_wait_time } - if permanently_broken + if circuit_breaker.circuit_broken? s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ "retry automatically. Reset storage information when the problem is "\ "resolved.") % translation_params - elsif circuit_breaker.circuit_broken? + elsif circuit_breaker.backing_off? _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ "block access for %{number_of_seconds} seconds.") % translation_params else diff --git a/app/models/application_setting.rb b/app/models/application_setting.rb index 4dda276bb41..f266e7db6da 100644 --- a/app/models/application_setting.rb +++ b/app/models/application_setting.rb @@ -153,13 +153,25 @@ class ApplicationSetting < ActiveRecord::Base presence: true, numericality: { greater_than_or_equal_to: 0 } - validates :circuitbreaker_failure_count_threshold, + validates :circuitbreaker_backoff_threshold, + :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_wait_time, :circuitbreaker_failure_reset_time, :circuitbreaker_storage_timeout, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 } + validates :circuitbreaker_access_retries, + presence: true, + numericality: { only_integer: true, greater_than_or_equal_to: 1 } + + validates_each :circuitbreaker_backoff_threshold do |record, attr, value| + if value.to_i >= record.circuitbreaker_failure_count_threshold + record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\ + "lower than the failure count threshold")) + end + end + SUPPORTED_KEY_TYPES.each do |type| validates :"#{type}_key_restriction", presence: true, key_restriction: { type: type } end diff --git a/app/views/admin/application_settings/_form.html.haml b/app/views/admin/application_settings/_form.html.haml index 2b23af9212e..3a4d5ce0b5c 100644 --- a/app/views/admin/application_settings/_form.html.haml +++ b/app/views/admin/application_settings/_form.html.haml @@ -533,11 +533,23 @@ %fieldset %legend Git Storage Circuitbreaker settings .form-group - = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2' + = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2' .col-sm-10 - = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control' + = f.number_field :circuitbreaker_access_retries, class: 'form-control' .help-block - = circuitbreaker_failure_count_help_text + = circuitbreaker_access_retries_help_text + .form-group + = f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_storage_timeout, class: 'form-control' + .help-block + = circuitbreaker_storage_timeout_help_text + .form-group + = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control' + .help-block + = circuitbreaker_backoff_threshold_help_text .form-group = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2' .col-sm-10 @@ -545,17 +557,17 @@ .help-block = circuitbreaker_failure_wait_time_help_text .form-group - = f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2' + = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2' .col-sm-10 - = f.number_field :circuitbreaker_failure_reset_time, class: 'form-control' + = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control' .help-block - = circuitbreaker_failure_reset_time_help_text + = circuitbreaker_failure_count_help_text .form-group - = f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2' + = f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2' .col-sm-10 - = f.number_field :circuitbreaker_storage_timeout, class: 'form-control' + = f.number_field :circuitbreaker_failure_reset_time, class: 'form-control' .help-block - = circuitbreaker_storage_timeout_help_text + = circuitbreaker_failure_reset_time_help_text %fieldset %legend Repository Checks diff --git a/changelogs/unreleased/bvl-circuitbreaker-backoff.yml b/changelogs/unreleased/bvl-circuitbreaker-backoff.yml new file mode 100644 index 00000000000..5cb90e7c085 --- /dev/null +++ b/changelogs/unreleased/bvl-circuitbreaker-backoff.yml @@ -0,0 +1,6 @@ +--- +title: Make the circuitbreaker more robust by adding higher thresholds, and multiple + access attempts. +merge_request: 14933 +author: +type: fixed diff --git a/db/migrate/20171017145932_add_new_circuitbreaker_settings_to_application_settings.rb b/db/migrate/20171017145932_add_new_circuitbreaker_settings_to_application_settings.rb new file mode 100644 index 00000000000..07eb25c0b0f --- /dev/null +++ b/db/migrate/20171017145932_add_new_circuitbreaker_settings_to_application_settings.rb @@ -0,0 +1,16 @@ +class AddNewCircuitbreakerSettingsToApplicationSettings < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + DOWNTIME = false + + def change + add_column :application_settings, + :circuitbreaker_access_retries, + :integer, + default: 3 + add_column :application_settings, + :circuitbreaker_backoff_threshold, + :integer, + default: 80 + end +end diff --git a/db/schema.rb b/db/schema.rb index d64e0fbf60a..d26481625c7 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -11,7 +11,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 20171012101043) do +ActiveRecord::Schema.define(version: 20171017145932) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -138,6 +138,8 @@ ActiveRecord::Schema.define(version: 20171012101043) do t.integer "circuitbreaker_failure_wait_time", default: 30 t.integer "circuitbreaker_failure_reset_time", default: 1800 t.integer "circuitbreaker_storage_timeout", default: 30 + t.integer "circuitbreaker_access_retries", default: 3 + t.integer "circuitbreaker_backoff_threshold", default: 80 end create_table "audit_events", force: :cascade do |t| diff --git a/doc/administration/img/circuitbreaker_config.png b/doc/administration/img/circuitbreaker_config.png index 9250d38297c..e811d173634 100644 Binary files a/doc/administration/img/circuitbreaker_config.png and b/doc/administration/img/circuitbreaker_config.png differ diff --git a/doc/administration/repository_storage_paths.md b/doc/administration/repository_storage_paths.md index efcabd69822..96f436fa7c3 100644 --- a/doc/administration/repository_storage_paths.md +++ b/doc/administration/repository_storage_paths.md @@ -109,6 +109,11 @@ This can be configured from the admin interface: ![circuitbreaker configuration](img/circuitbreaker_config.png) +**Number of access attempts**: The number of attempts GitLab will make to access a +storage when probing a shard. + +**Number of failures before backing off**: The number of failures after which +GitLab will start temporarily disabling access to a storage shard on a host. **Maximum git storage failures:** The number of failures of after which GitLab will completely prevent access to the storage. The number of failures can be reset in @@ -126,6 +131,15 @@ mount is reset. **Seconds to wait for a storage access attempt:** The time in seconds GitLab will try to access storage. After this time a timeout error will be raised. +To enable the circuitbreaker for repository storage you can flip the feature flag from a rails console: + +``` +Feature.enable('git_storage_circuit_breaker') +``` + +Alternatively it can be enabled by setting `true` in the `GIT_STORAGE_CIRCUIT_BREAKER` environment variable. +This approach would be used when enabling the circuit breaker on a single host. + When storage failures occur, this will be visible in the admin interface like this: ![failing storage](img/failing_storage.png) diff --git a/doc/api/settings.md b/doc/api/settings.md index 5ec96b8a6a2..5fce6ac7810 100644 --- a/doc/api/settings.md +++ b/doc/api/settings.md @@ -96,6 +96,8 @@ PUT /application/settings | `dsa_key_restriction` | integer | no | The minimum allowed bit length of an uploaded DSA key. Default is `0` (no restriction). `-1` disables DSA keys. | `ecdsa_key_restriction` | integer | no | The minimum allowed curve size (in bits) of an uploaded ECDSA key. Default is `0` (no restriction). `-1` disables ECDSA keys. | `ed25519_key_restriction` | integer | no | The minimum allowed curve size (in bits) of an uploaded ED25519 key. Default is `0` (no restriction). `-1` disables ED25519 keys. +| `circuitbreaker_access_retries | integer | no | The number of attempts GitLab will make to access a storage. | +| `circuitbreaker_backoff_threshold | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. | | `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. | | `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. | | `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. | diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb index 08e6c29abad..99518c9b1e4 100644 --- a/lib/gitlab/git/storage.rb +++ b/lib/gitlab/git/storage.rb @@ -12,6 +12,7 @@ module Gitlab CircuitOpen = Class.new(Inaccessible) Misconfiguration = Class.new(Inaccessible) + Failing = Class.new(Inaccessible) REDIS_KEY_PREFIX = 'storage_accessible:'.freeze diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb index 0456ad9a1f3..be7598ef011 100644 --- a/lib/gitlab/git/storage/circuit_breaker.rb +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -54,7 +54,7 @@ module Gitlab end def perform - return yield unless Feature.enabled?('git_storage_circuit_breaker') + return yield unless enabled? check_storage_accessible! @@ -64,10 +64,27 @@ module Gitlab def circuit_broken? return false if no_failures? + failure_count > failure_count_threshold + end + + def backing_off? + return false if no_failures? + recent_failure = last_failure > failure_wait_time.seconds.ago - too_many_failures = failure_count > failure_count_threshold + too_many_failures = failure_count > backoff_threshold - recent_failure || too_many_failures + recent_failure && too_many_failures + end + + private + + # The circuitbreaker can be enabled for the entire fleet using a Feature + # flag. + # + # Enabling it for a single host can be done setting the + # `GIT_STORAGE_CIRCUIT_BREAKER` environment variable. + def enabled? + ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present? || Feature.enabled?('git_storage_circuit_breaker') end def failure_info @@ -83,7 +100,7 @@ module Gitlab return @storage_available if @storage_available if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck - .storage_available?(storage_path, storage_timeout) + .storage_available?(storage_path, storage_timeout, access_retries) track_storage_accessible else track_storage_inaccessible @@ -94,7 +111,11 @@ module Gitlab def check_storage_accessible! if circuit_broken? - raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time) + raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time) + end + + if backing_off? + raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time) end unless storage_available? @@ -131,12 +152,6 @@ module Gitlab end end - def cache_key - @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" - end - - private - def get_failure_info last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| redis.hmget(cache_key, :last_failure, :failure_count) @@ -146,6 +161,10 @@ module Gitlab FailureInfo.new(last_failure, failure_count.to_i) end + + def cache_key + @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" + end end end end diff --git a/lib/gitlab/git/storage/circuit_breaker_settings.rb b/lib/gitlab/git/storage/circuit_breaker_settings.rb index d2313fe7c1b..257fe8cd8f0 100644 --- a/lib/gitlab/git/storage/circuit_breaker_settings.rb +++ b/lib/gitlab/git/storage/circuit_breaker_settings.rb @@ -18,6 +18,14 @@ module Gitlab application_settings.circuitbreaker_storage_timeout end + def access_retries + application_settings.circuitbreaker_access_retries + end + + def backoff_threshold + application_settings.circuitbreaker_backoff_threshold + end + private def application_settings diff --git a/lib/gitlab/git/storage/forked_storage_check.rb b/lib/gitlab/git/storage/forked_storage_check.rb index 91d8241f17b..1307f400700 100644 --- a/lib/gitlab/git/storage/forked_storage_check.rb +++ b/lib/gitlab/git/storage/forked_storage_check.rb @@ -4,8 +4,17 @@ module Gitlab module ForkedStorageCheck extend self - def storage_available?(path, timeout_seconds = 5) - status = timeout_check(path, timeout_seconds) + def storage_available?(path, timeout_seconds = 5, retries = 1) + partial_timeout = timeout_seconds / retries + status = timeout_check(path, partial_timeout) + + # If the status check did not succeed the first time, we retry a few + # more times to avoid one-off failures + current_attempts = 1 + while current_attempts < retries && !status.success? + status = timeout_check(path, partial_timeout) + current_attempts += 1 + end status.success? end diff --git a/lib/gitlab/git/storage/null_circuit_breaker.rb b/lib/gitlab/git/storage/null_circuit_breaker.rb index 60c6791a7e4..a12d52d295f 100644 --- a/lib/gitlab/git/storage/null_circuit_breaker.rb +++ b/lib/gitlab/git/storage/null_circuit_breaker.rb @@ -25,6 +25,10 @@ module Gitlab !!@error end + def backing_off? + false + end + def last_failure circuit_broken? ? Time.now : nil end diff --git a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb index c8d532df059..72dabca793a 100644 --- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb @@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: stub_application_setting(circuitbreaker_failure_count_threshold: 0, circuitbreaker_failure_wait_time: 1, circuitbreaker_failure_reset_time: 2, - circuitbreaker_storage_timeout: 3) + circuitbreaker_storage_timeout: 3, + circuitbreaker_access_retries: 4, + circuitbreaker_backoff_threshold: 5) end describe '#failure_count_threshold' do @@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(circuit_breaker.storage_timeout).to eq(3) end end + + describe '#access_retries' do + it 'reads the value from settings' do + expect(circuit_breaker.access_retries).to eq(4) + end + end + + describe '#backoff_threshold' do + it 'reads the value from settings' do + expect(circuit_breaker.backoff_threshold).to eq(5) + end + end end describe '#perform' do - it 'raises an exception with retry time when the circuit is open' do - allow(circuit_breaker).to receive(:circuit_broken?).and_return(true) + it 'raises the correct exception when the circuit is open' do + set_in_redis(:last_failure, 1.day.ago.to_f) + set_in_redis(:failure_count, 999) expect { |b| circuit_breaker.perform(&b) } - .to raise_error(Gitlab::Git::Storage::CircuitOpen) + .to raise_error do |exception| + expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen) + expect(exception.retry_after).to eq(1800) + end + end + + it 'raises the correct exception when backing off' do + Timecop.freeze do + set_in_redis(:last_failure, 1.second.ago.to_f) + set_in_redis(:failure_count, 90) + + expect { |b| circuit_breaker.perform(&b) } + .to raise_error do |exception| + expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing) + expect(exception.retry_after).to eq(30) + end + end end it 'yields the block' do @@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: it 'checks if the storage is available' do expect(circuit_breaker).to receive(:check_storage_accessible!) + .and_call_original circuit_breaker.perform { 'hello world' } end @@ -137,201 +169,124 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: .to raise_error(Rugged::OSError) end - context 'with the feature disabled' do - it 'returns the block without checking accessibility' do - stub_feature_flags(git_storage_circuit_breaker: false) - - expect(circuit_breaker).not_to receive(:circuit_broken?) + it 'tracks that the storage was accessible' do + set_in_redis(:failure_count, 10) + set_in_redis(:last_failure, Time.now.to_f) - result = circuit_breaker.perform { 'hello' } + circuit_breaker.perform { '' } - expect(result).to eq('hello') - end + expect(value_from_redis(:failure_count).to_i).to eq(0) + expect(value_from_redis(:last_failure)).to be_empty + expect(circuit_breaker.failure_count).to eq(0) + expect(circuit_breaker.last_failure).to be_nil end - end - describe '#circuit_broken?' do - it 'is working when there is no last failure' do - set_in_redis(:last_failure, nil) - set_in_redis(:failure_count, 0) + it 'only performs the accessibility check once' do + expect(Gitlab::Git::Storage::ForkedStorageCheck) + .to receive(:storage_available?).once.and_call_original - expect(circuit_breaker.circuit_broken?).to be_falsey + 2.times { circuit_breaker.perform { '' } } end - it 'is broken when there was a recent failure' do - Timecop.freeze do - set_in_redis(:last_failure, 1.second.ago.to_f) - set_in_redis(:failure_count, 1) + it 'calls the check with the correct arguments' do + stub_application_setting(circuitbreaker_storage_timeout: 30, + circuitbreaker_access_retries: 3) - expect(circuit_breaker.circuit_broken?).to be_truthy - end - end + expect(Gitlab::Git::Storage::ForkedStorageCheck) + .to receive(:storage_available?).with(TestEnv.repos_path, 30, 3) + .and_call_original - it 'is broken when there are too many failures' do - set_in_redis(:last_failure, 1.day.ago.to_f) - set_in_redis(:failure_count, 200) - - expect(circuit_breaker.circuit_broken?).to be_truthy + circuit_breaker.perform { '' } end - context 'the `failure_wait_time` is set to 0' do + context 'with the feature disabled' do before do - stub_application_setting(circuitbreaker_failure_wait_time: 0) - end - - it 'is working even when there is a recent failure' do - Timecop.freeze do - set_in_redis(:last_failure, 0.seconds.ago.to_f) - set_in_redis(:failure_count, 1) - - expect(circuit_breaker.circuit_broken?).to be_falsey - end + stub_feature_flags(git_storage_circuit_breaker: false) end - end - end - describe "storage_available?" do - context 'the storage is available' do - it 'tracks that the storage was accessible an raises the error' do - expect(circuit_breaker).to receive(:track_storage_accessible) - - circuit_breaker.storage_available? - end + it 'returns the block without checking accessibility' do + expect(circuit_breaker).not_to receive(:check_storage_accessible!) - it 'only performs the check once' do - expect(Gitlab::Git::Storage::ForkedStorageCheck) - .to receive(:storage_available?).once.and_call_original + result = circuit_breaker.perform { 'hello' } - 2.times { circuit_breaker.storage_available? } + expect(result).to eq('hello') end - end - - context 'storage is not available' do - let(:storage_name) { 'broken' } - - it 'tracks that the storage was inaccessible' do - expect(circuit_breaker).to receive(:track_storage_inaccessible) - circuit_breaker.storage_available? - end - end - end + it 'allows enabling the feature using an ENV var' do + stub_env('GIT_STORAGE_CIRCUIT_BREAKER', 'true') + expect(circuit_breaker).to receive(:check_storage_accessible!) - describe '#check_storage_accessible!' do - it 'raises an exception with retry time when the circuit is open' do - allow(circuit_breaker).to receive(:circuit_broken?).and_return(true) + result = circuit_breaker.perform { 'hello' } - expect { circuit_breaker.check_storage_accessible! } - .to raise_error do |exception| - expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen) - expect(exception.retry_after).to eq(30) + expect(result).to eq('hello') end end context 'the storage is not available' do let(:storage_name) { 'broken' } - it 'raises an error' do + it 'raises the correct exception' do expect(circuit_breaker).to receive(:track_storage_inaccessible) - expect { circuit_breaker.check_storage_accessible! } + expect { circuit_breaker.perform { '' } } .to raise_error do |exception| expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible) expect(exception.retry_after).to eq(30) end end - end - end - - describe '#track_storage_inaccessible' do - around do |example| - Timecop.freeze { example.run } - end - - it 'records the failure time in redis' do - circuit_breaker.track_storage_inaccessible - - failure_time = value_from_redis(:last_failure) - expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now) - end - - it 'sets the failure time on the breaker without reloading' do - circuit_breaker.track_storage_inaccessible - - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.last_failure).to eq(Time.now) - end - - it 'increments the failure count in redis' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_inaccessible - - expect(value_from_redis(:failure_count).to_i).to be(11) - end - - it 'increments the failure count on the breaker without reloading' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_inaccessible + it 'tracks that the storage was inaccessible' do + Timecop.freeze do + expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible) - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.failure_count).to eq(11) + expect(value_from_redis(:failure_count).to_i).to eq(1) + expect(value_from_redis(:last_failure)).not_to be_empty + expect(circuit_breaker.failure_count).to eq(1) + expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now) + end + end end end - describe '#track_storage_accessible' do - it 'sets the failure count to zero in redis' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_accessible - - expect(value_from_redis(:failure_count).to_i).to be(0) - end - - it 'sets the failure count to zero on the breaker without reloading' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_accessible + describe '#circuit_broken?' do + it 'is working when there is no last failure' do + set_in_redis(:last_failure, nil) + set_in_redis(:failure_count, 0) - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.failure_count).to eq(0) + expect(circuit_breaker.circuit_broken?).to be_falsey end - it 'removes the last failure time from redis' do - set_in_redis(:last_failure, Time.now.to_i) - - circuit_breaker.track_storage_accessible + it 'is broken when there are too many failures' do + set_in_redis(:last_failure, 1.day.ago.to_f) + set_in_redis(:failure_count, 200) - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.last_failure).to be_nil + expect(circuit_breaker.circuit_broken?).to be_truthy end + end - it 'removes the last failure time from the breaker without reloading' do - set_in_redis(:last_failure, Time.now.to_i) - - circuit_breaker.track_storage_accessible + describe '#backing_off?' do + it 'is true when there was a recent failure' do + Timecop.freeze do + set_in_redis(:last_failure, 1.second.ago.to_f) + set_in_redis(:failure_count, 90) - expect(value_from_redis(:last_failure)).to be_empty + expect(circuit_breaker.backing_off?).to be_truthy + end end - it 'wont connect to redis when there are no failures' do - expect(Gitlab::Git::Storage.redis).to receive(:with).once - .and_call_original - expect(circuit_breaker).to receive(:track_storage_accessible) - .and_call_original - - circuit_breaker.track_storage_accessible - end - end + context 'the `failure_wait_time` is set to 0' do + before do + stub_application_setting(circuitbreaker_failure_wait_time: 0) + end - describe '#no_failures?' do - it 'is false when a failure was tracked' do - set_in_redis(:last_failure, Time.now.to_i) - set_in_redis(:failure_count, 1) + it 'is working even when there are failures' do + Timecop.freeze do + set_in_redis(:last_failure, 0.seconds.ago.to_f) + set_in_redis(:failure_count, 90) - expect(circuit_breaker.no_failures?).to be_falsey + expect(circuit_breaker.backing_off?).to be_falsey + end + end end end @@ -351,10 +306,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(circuit_breaker.failure_count).to eq(7) end end - - describe '#cache_key' do - it 'includes storage and host' do - expect(circuit_breaker.cache_key).to eq(cache_key) - end - end end diff --git a/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb b/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb index c708b15853a..39a5d020bb4 100644 --- a/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb +++ b/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb @@ -33,6 +33,21 @@ describe Gitlab::Git::Storage::ForkedStorageCheck, broken_storage: true, skip_da expect(runtime).to be < 1.0 end + it 'will try the specified amount of times before failing' do + allow(described_class).to receive(:check_filesystem_in_process) do + Process.spawn("sleep 10") + end + + expect(Process).to receive(:spawn).with('sleep 10').twice + .and_call_original + + runtime = Benchmark.realtime do + described_class.storage_available?(existing_path, 0.5, 2) + end + + expect(runtime).to be < 1.0 + end + describe 'when using paths with spaces' do let(:test_dir) { Rails.root.join('tmp', 'tests', 'storage_check') } let(:path_with_spaces) { File.join(test_dir, 'path with spaces') } diff --git a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb index 7ee6d2f3709..5db37f55e03 100644 --- a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb @@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do ours = described_class.public_instance_methods theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods - # These methods are not part of the public API, but are public to allow the - # CircuitBreaker specs to operate. They should be made private over time. - exceptions = %i[ - cache_key - check_storage_accessible! - no_failures? - storage_available? - track_storage_accessible - track_storage_inaccessible - ] - - expect(theirs - ours).to contain_exactly(*exceptions) + expect(theirs - ours).to be_empty end end diff --git a/spec/models/application_setting_spec.rb b/spec/models/application_setting_spec.rb index 30495fd4f5e..47b7150d36f 100644 --- a/spec/models/application_setting_spec.rb +++ b/spec/models/application_setting_spec.rb @@ -115,7 +115,8 @@ describe ApplicationSetting do end context 'circuitbreaker settings' do - [:circuitbreaker_failure_count_threshold, + [:circuitbreaker_backoff_threshold, + :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_wait_time, :circuitbreaker_failure_reset_time, :circuitbreaker_storage_timeout].each do |field| @@ -125,6 +126,16 @@ describe ApplicationSetting do .is_greater_than_or_equal_to(0) end end + + it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do + setting.circuitbreaker_failure_count_threshold = 10 + setting.circuitbreaker_backoff_threshold = 15 + failure_message = "The circuitbreaker backoff threshold should be lower "\ + "than the failure count threshold" + + expect(setting).not_to be_valid + expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message) + end end context 'repository storages' do -- cgit v1.2.1