author     Douwe Maan <douwe@gitlab.com>    2017-12-08 09:19:42 +0000
committer  Douwe Maan <douwe@gitlab.com>    2017-12-08 09:19:42 +0000
commit     7fabc892f251740dbd9a4755baede662e6854870 (patch)
tree       1d01033287e4e15e505c7b8b3f69ced4e6cf21c8
parent     12d33b883adda7093f0f4b838532871036af3925 (diff)
parent     f1ae1e39ce6b7578c5697c977bc3b52b119301ab (diff)
download   gitlab-ce-7fabc892f251740dbd9a4755baede662e6854870.tar.gz
Merge branch 'bvl-circuitbreaker-process' into 'master'
Check NFS mounts in a separate process
Closes #39847
See merge request gitlab-org/gitlab-ce!15426
45 files changed, 983 insertions, 362 deletions
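For orientation before the patch itself: this merge adds a standalone checker process, bin/storage_check, which periodically POSTs to the new /-/storage_check endpoint and stores per-shard failure information in Redis instead of checking storages inline on every request. As a rough illustration only (the socket path and token below are placeholders, not values from this diff), the process could be started against the Rails application with:

    bin/storage_check --target unix://path/to/gitlab.socket --token HEALTH_TOKEN --interval 10

or run as a dry run that only prints what would be checked, matching the example used in the new spec:

    bin/storage_check -t unix://path/to/gitlab.socket -i 10 -d

The endpoint responds with JSON containing a check_interval value and a results array of { storage, success } entries; the CLI logs responsive, skipped, and failing shards and adjusts its polling interval from the response.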
diff --git a/app/controllers/admin/health_check_controller.rb b/app/controllers/admin/health_check_controller.rb index 65a17828feb..61247b280b3 100644 --- a/app/controllers/admin/health_check_controller.rb +++ b/app/controllers/admin/health_check_controller.rb @@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController end def reset_storage_health - Gitlab::Git::Storage::CircuitBreaker.reset_all! + Gitlab::Git::Storage::FailureInfo.reset_all! redirect_to admin_health_check_path, notice: _('Git storage health information has been reset') end diff --git a/app/controllers/health_controller.rb b/app/controllers/health_controller.rb index 98c2aaa3526..a931b456a93 100644 --- a/app/controllers/health_controller.rb +++ b/app/controllers/health_controller.rb @@ -1,5 +1,5 @@ class HealthController < ActionController::Base - protect_from_forgery with: :exception + protect_from_forgery with: :exception, except: :storage_check include RequiresWhitelistedMonitoringClient CHECKS = [ @@ -23,6 +23,15 @@ class HealthController < ActionController::Base render_check_results(results) end + def storage_check + results = Gitlab::Git::Storage::Checker.check_all + + render json: { + check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval, + results: results + } + end + private def render_check_results(results) diff --git a/app/helpers/application_settings_helper.rb b/app/helpers/application_settings_helper.rb index dccde46fa33..b12ea760668 100644 --- a/app/helpers/application_settings_helper.rb +++ b/app/helpers/application_settings_helper.rb @@ -124,17 +124,6 @@ module ApplicationSettingsHelper _('The number of attempts GitLab will make to access a storage.') end - def circuitbreaker_backoff_threshold_help_text - _("The number of failures after which GitLab will start temporarily "\ - "disabling access to a storage shard on a host") - end - - def circuitbreaker_failure_wait_time_help_text - _("When access to a storage fails. GitLab will prevent access to the "\ - "storage for the time specified here. This allows the filesystem to "\ - "recover. Repositories on failing shards are temporarly unavailable") - end - def circuitbreaker_failure_reset_time_help_text _("The time in seconds GitLab will keep failure information. When no "\ "failures occur during this time, information about the mount is reset.") @@ -145,6 +134,11 @@ module ApplicationSettingsHelper "timeout error will be raised.") end + def circuitbreaker_check_interval_help_text + _("The time in seconds between storage checks. 
When a previous check did "\ + "complete yet, GitLab will skip a check.") + end + def visible_attributes [ :admin_notification_email, @@ -154,10 +148,9 @@ module ApplicationSettingsHelper :akismet_enabled, :auto_devops_enabled, :circuitbreaker_access_retries, - :circuitbreaker_backoff_threshold, + :circuitbreaker_check_interval, :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_reset_time, - :circuitbreaker_failure_wait_time, :circuitbreaker_storage_timeout, :clientside_sentry_dsn, :clientside_sentry_enabled, diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb index 4d2180f7eee..b76c1228220 100644 --- a/app/helpers/storage_health_helper.rb +++ b/app/helpers/storage_health_helper.rb @@ -18,16 +18,12 @@ module StorageHealthHelper current_failures = circuit_breaker.failure_count translation_params = { number_of_failures: current_failures, - maximum_failures: maximum_failures, - number_of_seconds: circuit_breaker.failure_wait_time } + maximum_failures: maximum_failures } if circuit_breaker.circuit_broken? s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ "retry automatically. Reset storage information when the problem is "\ "resolved.") % translation_params - elsif circuit_breaker.backing_off? - _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ - "block access for %{number_of_seconds} seconds.") % translation_params else _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ "allow access on the next attempt.") % translation_params diff --git a/app/models/application_setting.rb b/app/models/application_setting.rb index 3117c98c846..253e213af81 100644 --- a/app/models/application_setting.rb +++ b/app/models/application_setting.rb @@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base presence: true, numericality: { greater_than_or_equal_to: 0 } - validates :circuitbreaker_backoff_threshold, - :circuitbreaker_failure_count_threshold, - :circuitbreaker_failure_wait_time, + validates :circuitbreaker_failure_count_threshold, :circuitbreaker_failure_reset_time, :circuitbreaker_storage_timeout, + :circuitbreaker_check_interval, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 } @@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 1 } - validates_each :circuitbreaker_backoff_threshold do |record, attr, value| - if value.to_i >= record.circuitbreaker_failure_count_threshold - record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\ - "lower than the failure count threshold")) - end - end - validates :gitaly_timeout_default, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 } diff --git a/app/views/admin/application_settings/_form.html.haml b/app/views/admin/application_settings/_form.html.haml index a9d0503bc73..3e2dbb07a6c 100644 --- a/app/views/admin/application_settings/_form.html.haml +++ b/app/views/admin/application_settings/_form.html.haml @@ -546,6 +546,12 @@ %fieldset %legend Git Storage Circuitbreaker settings .form-group + = f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2' + .col-sm-10 + = f.number_field :circuitbreaker_check_interval, class: 'form-control' + .help-block + = circuitbreaker_check_interval_help_text + .form-group = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2' .col-sm-10 = 
f.number_field :circuitbreaker_access_retries, class: 'form-control' @@ -558,18 +564,6 @@ .help-block = circuitbreaker_storage_timeout_help_text .form-group - = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2' - .col-sm-10 - = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control' - .help-block - = circuitbreaker_backoff_threshold_help_text - .form-group - = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2' - .col-sm-10 - = f.number_field :circuitbreaker_failure_wait_time, class: 'form-control' - .help-block - = circuitbreaker_failure_wait_time_help_text - .form-group = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2' .col-sm-10 = f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control' diff --git a/bin/storage_check b/bin/storage_check new file mode 100755 index 00000000000..5a818732bd1 --- /dev/null +++ b/bin/storage_check @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +require 'optparse' +require 'net/http' +require 'json' +require 'socket' +require 'logger' + +require_relative '../lib/gitlab/storage_check' + +Gitlab::StorageCheck::CLI.start!(ARGV) diff --git a/changelogs/unreleased/bvl-circuitbreaker-process.yml b/changelogs/unreleased/bvl-circuitbreaker-process.yml new file mode 100644 index 00000000000..595dd13f724 --- /dev/null +++ b/changelogs/unreleased/bvl-circuitbreaker-process.yml @@ -0,0 +1,5 @@ +--- +title: Monitor NFS shards for circuitbreaker in a separate process +merge_request: 15426 +author: +type: changed diff --git a/config/routes.rb b/config/routes.rb index 4f27fea0e92..016140e0ede 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -42,6 +42,7 @@ Rails.application.routes.draw do scope path: '-' do get 'liveness' => 'health#liveness' get 'readiness' => 'health#readiness' + post 'storage_check' => 'health#storage_check' resources :metrics, only: [:index] mount Peek::Railtie => '/peek' diff --git a/db/migrate/20171123094802_add_circuitbreaker_check_interval_to_application_settings.rb b/db/migrate/20171123094802_add_circuitbreaker_check_interval_to_application_settings.rb new file mode 100644 index 00000000000..213d46018fc --- /dev/null +++ b/db/migrate/20171123094802_add_circuitbreaker_check_interval_to_application_settings.rb @@ -0,0 +1,20 @@ +class AddCircuitbreakerCheckIntervalToApplicationSettings < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + # Set this constant to true if this migration requires downtime. + DOWNTIME = false + + disable_ddl_transaction! + + def up + add_column_with_default :application_settings, + :circuitbreaker_check_interval, + :integer, + default: 1 + end + + def down + remove_column :application_settings, + :circuitbreaker_check_interval + end +end diff --git a/db/post_migrate/20171123101020_update_circuitbreaker_defaults.rb b/db/post_migrate/20171123101020_update_circuitbreaker_defaults.rb new file mode 100644 index 00000000000..8e1c9e6d6bb --- /dev/null +++ b/db/post_migrate/20171123101020_update_circuitbreaker_defaults.rb @@ -0,0 +1,34 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. 
+ +class UpdateCircuitbreakerDefaults < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + DOWNTIME = false + + class ApplicationSetting < ActiveRecord::Base; end + + def up + change_column_default :application_settings, + :circuitbreaker_failure_count_threshold, + 3 + change_column_default :application_settings, + :circuitbreaker_storage_timeout, + 15 + + ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 3, + circuitbreaker_storage_timeout: 15) + end + + def down + change_column_default :application_settings, + :circuitbreaker_failure_count_threshold, + 160 + change_column_default :application_settings, + :circuitbreaker_storage_timeout, + 30 + + ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 160, + circuitbreaker_storage_timeout: 30) + end +end diff --git a/db/post_migrate/20171123101046_remove_old_circuitbreaker_config.rb b/db/post_migrate/20171123101046_remove_old_circuitbreaker_config.rb new file mode 100644 index 00000000000..e646d4d3224 --- /dev/null +++ b/db/post_migrate/20171123101046_remove_old_circuitbreaker_config.rb @@ -0,0 +1,26 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. + +class RemoveOldCircuitbreakerConfig < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + DOWNTIME = false + + def up + remove_column :application_settings, + :circuitbreaker_backoff_threshold + remove_column :application_settings, + :circuitbreaker_failure_wait_time + end + + def down + add_column :application_settings, + :circuitbreaker_backoff_threshold, + :integer, + default: 80 + add_column :application_settings, + :circuitbreaker_failure_wait_time, + :integer, + default: 30 + end +end diff --git a/db/schema.rb b/db/schema.rb index 6ea3ab54742..4c697a4a384 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -135,12 +135,10 @@ ActiveRecord::Schema.define(version: 20171205190711) do t.boolean "hashed_storage_enabled", default: false, null: false t.boolean "project_export_enabled", default: true, null: false t.boolean "auto_devops_enabled", default: false, null: false - t.integer "circuitbreaker_failure_count_threshold", default: 160 - t.integer "circuitbreaker_failure_wait_time", default: 30 + t.integer "circuitbreaker_failure_count_threshold", default: 3 t.integer "circuitbreaker_failure_reset_time", default: 1800 - t.integer "circuitbreaker_storage_timeout", default: 30 + t.integer "circuitbreaker_storage_timeout", default: 15 t.integer "circuitbreaker_access_retries", default: 3 - t.integer "circuitbreaker_backoff_threshold", default: 80 t.boolean "throttle_unauthenticated_enabled", default: false, null: false t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false @@ -150,6 +148,7 @@ ActiveRecord::Schema.define(version: 20171205190711) do t.boolean "throttle_authenticated_web_enabled", default: false, null: false t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false + t.integer "circuitbreaker_check_interval", default: 1, null: false t.boolean "password_authentication_enabled_for_web" t.boolean "password_authentication_enabled_for_git", default: true t.integer "gitaly_timeout_default", default: 55, null: false diff --git a/doc/api/settings.md b/doc/api/settings.md index 22fb2baa8ec..0e4758cda2d 100644 
--- a/doc/api/settings.md +++ b/doc/api/settings.md @@ -70,10 +70,9 @@ PUT /application/settings | `akismet_api_key` | string | no | API key for akismet spam protection | | `akismet_enabled` | boolean | no | Enable or disable akismet spam protection | | `circuitbreaker_access_retries | integer | no | The number of attempts GitLab will make to access a storage. | -| `circuitbreaker_backoff_threshold | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. | +| `circuitbreaker_check_interval` | integer | no | Number of seconds in between storage checks. | | `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. | | `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. | -| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. | | `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt | | `clientside_sentry_dsn` | string | no | Required if `clientside_sentry_dsn` is enabled | | `clientside_sentry_enabled` | boolean | no | Enable Sentry error reporting for the client side | diff --git a/lib/api/circuit_breakers.rb b/lib/api/circuit_breakers.rb index 118883f5ea5..598c76f6168 100644 --- a/lib/api/circuit_breakers.rb +++ b/lib/api/circuit_breakers.rb @@ -41,7 +41,7 @@ module API detail 'This feature was introduced in GitLab 9.5' end delete do - Gitlab::Git::Storage::CircuitBreaker.reset_all! + Gitlab::Git::Storage::FailureInfo.reset_all! 
end end end diff --git a/lib/gitlab/git/storage/checker.rb b/lib/gitlab/git/storage/checker.rb new file mode 100644 index 00000000000..de63cb4b40c --- /dev/null +++ b/lib/gitlab/git/storage/checker.rb @@ -0,0 +1,98 @@ +module Gitlab + module Git + module Storage + class Checker + include CircuitBreakerSettings + + attr_reader :storage_path, :storage, :hostname, :logger + + def self.check_all(logger = Rails.logger) + threads = Gitlab.config.repositories.storages.keys.map do |storage_name| + Thread.new do + Thread.current[:result] = new(storage_name, logger).check_with_lease + end + end + + threads.map do |thread| + thread.join + thread[:result] + end + end + + def initialize(storage, logger = Rails.logger) + @storage = storage + config = Gitlab.config.repositories.storages[@storage] + @storage_path = config['path'] + @logger = logger + + @hostname = Gitlab::Environment.hostname + end + + def check_with_lease + lease_key = "storage_check:#{cache_key}" + lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout) + result = { storage: storage, success: nil } + + if uuid = lease.try_obtain + result[:success] = check + + Gitlab::ExclusiveLease.cancel(lease_key, uuid) + else + logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running") + end + + result + end + + def check + if Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries) + track_storage_accessible + true + else + track_storage_inaccessible + logger.error("#{hostname}: #{storage}: Not accessible.") + false + end + end + + private + + def track_storage_inaccessible + first_failure = current_failure_info.first_failure || Time.now + last_failure = Time.now + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :first_failure, first_failure.to_i) + redis.hset(cache_key, :last_failure, last_failure.to_i) + redis.hincrby(cache_key, :failure_count, 1) + redis.expire(cache_key, failure_reset_time) + maintain_known_keys(redis) + end + end + end + + def track_storage_accessible + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :first_failure, nil) + redis.hset(cache_key, :last_failure, nil) + redis.hset(cache_key, :failure_count, 0) + maintain_known_keys(redis) + end + end + end + + def maintain_known_keys(redis) + expire_time = Time.now.to_i + failure_reset_time + redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key) + redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i) + end + + def current_failure_info + FailureInfo.load(cache_key) + end + end + end + end +end diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb index 4328c0ea29b..898bb1b65be 100644 --- a/lib/gitlab/git/storage/circuit_breaker.rb +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -4,22 +4,11 @@ module Gitlab class CircuitBreaker include CircuitBreakerSettings - FailureInfo = Struct.new(:last_failure, :failure_count) - attr_reader :storage, - :hostname, - :storage_path - - delegate :last_failure, :failure_count, to: :failure_info - - def self.reset_all! - Gitlab::Git::Storage.redis.with do |redis| - all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1) - redis.del(*all_storage_keys) unless all_storage_keys.empty? 
- end + :hostname - RequestStore.delete(:circuitbreaker_cache) - end + delegate :last_failure, :failure_count, :no_failures?, + to: :failure_info def self.for_storage(storage) cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do @@ -46,9 +35,6 @@ module Gitlab def initialize(storage, hostname) @storage = storage @hostname = hostname - - config = Gitlab.config.repositories.storages[@storage] - @storage_path = config['path'] end def perform @@ -65,15 +51,6 @@ module Gitlab failure_count > failure_count_threshold end - def backing_off? - return false if no_failures? - - recent_failure = last_failure > failure_wait_time.seconds.ago - too_many_failures = failure_count > backoff_threshold - - recent_failure && too_many_failures - end - private # The circuitbreaker can be enabled for the entire fleet using a Feature @@ -86,88 +63,13 @@ module Gitlab end def failure_info - @failure_info ||= get_failure_info - end - - # Memoizing the `storage_available` call means we only do it once per - # request when the storage is available. - # - # When the storage appears not available, and the memoized value is `false` - # we might want to try again. - def storage_available? - return @storage_available if @storage_available - - if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck - .storage_available?(storage_path, storage_timeout, access_retries) - track_storage_accessible - else - track_storage_inaccessible - end - - @storage_available + @failure_info ||= FailureInfo.load(cache_key) end def check_storage_accessible! if circuit_broken? raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time) end - - if backing_off? - raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time) - end - - unless storage_available? - raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time) - end - end - - def no_failures? - last_failure.blank? && failure_count == 0 - end - - def track_storage_inaccessible - @failure_info = FailureInfo.new(Time.now, failure_count + 1) - - Gitlab::Git::Storage.redis.with do |redis| - redis.pipelined do - redis.hset(cache_key, :last_failure, last_failure.to_i) - redis.hincrby(cache_key, :failure_count, 1) - redis.expire(cache_key, failure_reset_time) - maintain_known_keys(redis) - end - end - end - - def track_storage_accessible - @failure_info = FailureInfo.new(nil, 0) - - Gitlab::Git::Storage.redis.with do |redis| - redis.pipelined do - redis.hset(cache_key, :last_failure, nil) - redis.hset(cache_key, :failure_count, 0) - maintain_known_keys(redis) - end - end - end - - def maintain_known_keys(redis) - expire_time = Time.now.to_i + failure_reset_time - redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key) - redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i) - end - - def get_failure_info - last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| - redis.hmget(cache_key, :last_failure, :failure_count) - end - - last_failure = Time.at(last_failure.to_i) if last_failure.present? 
- - FailureInfo.new(last_failure, failure_count.to_i) - end - - def cache_key - @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" end end end diff --git a/lib/gitlab/git/storage/circuit_breaker_settings.rb b/lib/gitlab/git/storage/circuit_breaker_settings.rb index 257fe8cd8f0..c9e225f187d 100644 --- a/lib/gitlab/git/storage/circuit_breaker_settings.rb +++ b/lib/gitlab/git/storage/circuit_breaker_settings.rb @@ -6,10 +6,6 @@ module Gitlab application_settings.circuitbreaker_failure_count_threshold end - def failure_wait_time - application_settings.circuitbreaker_failure_wait_time - end - def failure_reset_time application_settings.circuitbreaker_failure_reset_time end @@ -22,8 +18,12 @@ module Gitlab application_settings.circuitbreaker_access_retries end - def backoff_threshold - application_settings.circuitbreaker_backoff_threshold + def check_interval + application_settings.circuitbreaker_check_interval + end + + def cache_key + @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" end private diff --git a/lib/gitlab/git/storage/failure_info.rb b/lib/gitlab/git/storage/failure_info.rb new file mode 100644 index 00000000000..387279c110d --- /dev/null +++ b/lib/gitlab/git/storage/failure_info.rb @@ -0,0 +1,39 @@ +module Gitlab + module Git + module Storage + class FailureInfo + attr_accessor :first_failure, :last_failure, :failure_count + + def self.reset_all! + Gitlab::Git::Storage.redis.with do |redis| + all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1) + redis.del(*all_storage_keys) unless all_storage_keys.empty? + end + + RequestStore.delete(:circuitbreaker_cache) + end + + def self.load(cache_key) + first_failure, last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, :first_failure, :last_failure, :failure_count) + end + + last_failure = Time.at(last_failure.to_i) if last_failure.present? + first_failure = Time.at(first_failure.to_i) if first_failure.present? + + new(first_failure, last_failure, failure_count.to_i) + end + + def initialize(first_failure, last_failure, failure_count) + @first_failure = first_failure + @last_failure = last_failure + @failure_count = failure_count + end + + def no_failures? + first_failure.blank? && last_failure.blank? && failure_count == 0 + end + end + end + end +end diff --git a/lib/gitlab/git/storage/null_circuit_breaker.rb b/lib/gitlab/git/storage/null_circuit_breaker.rb index a12d52d295f..261c936c689 100644 --- a/lib/gitlab/git/storage/null_circuit_breaker.rb +++ b/lib/gitlab/git/storage/null_circuit_breaker.rb @@ -11,6 +11,9 @@ module Gitlab # These will always have nil values attr_reader :storage_path + delegate :last_failure, :failure_count, :no_failures?, + to: :failure_info + def initialize(storage, hostname, error: nil) @storage = storage @hostname = hostname @@ -29,16 +32,17 @@ module Gitlab false end - def last_failure - circuit_broken? ? Time.now : nil - end - - def failure_count - circuit_broken? ? failure_count_threshold : 0 - end - def failure_info - Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(last_failure, failure_count) + @failure_info ||= + if circuit_broken? 
+ Gitlab::Git::Storage::FailureInfo.new(Time.now, + Time.now, + failure_count_threshold) + else + Gitlab::Git::Storage::FailureInfo.new(nil, + nil, + 0) + end end end end diff --git a/lib/gitlab/storage_check.rb b/lib/gitlab/storage_check.rb new file mode 100644 index 00000000000..fe81513c9ec --- /dev/null +++ b/lib/gitlab/storage_check.rb @@ -0,0 +1,11 @@ +require_relative 'storage_check/cli' +require_relative 'storage_check/gitlab_caller' +require_relative 'storage_check/option_parser' +require_relative 'storage_check/response' + +module Gitlab + module StorageCheck + ENDPOINT = '/-/storage_check'.freeze + Options = Struct.new(:target, :token, :interval, :dryrun) + end +end diff --git a/lib/gitlab/storage_check/cli.rb b/lib/gitlab/storage_check/cli.rb new file mode 100644 index 00000000000..04bf1bf1d26 --- /dev/null +++ b/lib/gitlab/storage_check/cli.rb @@ -0,0 +1,69 @@ +module Gitlab + module StorageCheck + class CLI + def self.start!(args) + runner = new(Gitlab::StorageCheck::OptionParser.parse!(args)) + runner.start_loop + end + + attr_reader :logger, :options + + def initialize(options) + @options = options + @logger = Logger.new(STDOUT) + end + + def start_loop + logger.info "Checking #{options.target} every #{options.interval} seconds" + + if options.dryrun + logger.info "Dryrun, exiting..." + return + end + + begin + loop do + response = GitlabCaller.new(options).call! + log_response(response) + update_settings(response) + + sleep options.interval + end + rescue Interrupt + logger.info "Ending storage-check" + end + end + + def update_settings(response) + previous_interval = options.interval + + if response.valid? + options.interval = response.check_interval || previous_interval + end + + if previous_interval != options.interval + logger.info "Interval changed: #{options.interval} seconds" + end + end + + def log_response(response) + unless response.valid? + return logger.error("Invalid response checking nfs storage: #{response.http_response.inspect}") + end + + if response.responsive_shards.any? + logger.debug("Responsive shards: #{response.responsive_shards.join(', ')}") + end + + warnings = [] + if response.skipped_shards.any? + warnings << "Skipped shards: #{response.skipped_shards.join(', ')}" + end + if response.failing_shards.any? + warnings << "Failing shards: #{response.failing_shards.join(', ')}" + end + logger.warn(warnings.join(' - ')) if warnings.any? + end + end + end +end diff --git a/lib/gitlab/storage_check/gitlab_caller.rb b/lib/gitlab/storage_check/gitlab_caller.rb new file mode 100644 index 00000000000..44952b68844 --- /dev/null +++ b/lib/gitlab/storage_check/gitlab_caller.rb @@ -0,0 +1,39 @@ +require 'excon' + +module Gitlab + module StorageCheck + class GitlabCaller + def initialize(options) + @options = options + end + + def call! + Gitlab::StorageCheck::Response.new(get_response) + rescue Errno::ECONNREFUSED, Excon::Error + # Server not ready, treated as invalid response. 
+ Gitlab::StorageCheck::Response.new(nil) + end + + def get_response + scheme, *other_parts = URI.split(@options.target) + socket_path = if scheme == 'unix' + other_parts.compact.join + end + + connection = Excon.new(@options.target, socket: socket_path) + connection.post(path: Gitlab::StorageCheck::ENDPOINT, + headers: headers) + end + + def headers + @headers ||= begin + headers = {} + headers['Content-Type'] = headers['Accept'] = 'application/json' + headers['TOKEN'] = @options.token if @options.token + + headers + end + end + end + end +end diff --git a/lib/gitlab/storage_check/option_parser.rb b/lib/gitlab/storage_check/option_parser.rb new file mode 100644 index 00000000000..66ed7906f97 --- /dev/null +++ b/lib/gitlab/storage_check/option_parser.rb @@ -0,0 +1,39 @@ +module Gitlab + module StorageCheck + class OptionParser + def self.parse!(args) + # Start out with some defaults + options = Gitlab::StorageCheck::Options.new(nil, nil, 1, false) + + parser = ::OptionParser.new do |opts| + opts.banner = "Usage: bin/storage_check [options]" + + opts.on('-t=string', '--target string', 'URL or socket to trigger storage check') do |value| + options.target = value + end + + opts.on('-T=string', '--token string', 'Health token to use') { |value| options.token = value } + + opts.on('-i=n', '--interval n', ::OptionParser::DecimalInteger, 'Seconds between checks') do |value| + options.interval = value + end + + opts.on('-d', '--dryrun', "Output what will be performed, but don't start the process") do |value| + options.dryrun = value + end + end + parser.parse!(args) + + unless options.target + raise ::OptionParser::InvalidArgument.new('Provide a URI to provide checks') + end + + if URI.parse(options.target).scheme.nil? + raise ::OptionParser::InvalidArgument.new('Add the scheme to the target, `unix://`, `https://` or `http://` are supported') + end + + options + end + end + end +end diff --git a/lib/gitlab/storage_check/response.rb b/lib/gitlab/storage_check/response.rb new file mode 100644 index 00000000000..326ab236e3e --- /dev/null +++ b/lib/gitlab/storage_check/response.rb @@ -0,0 +1,77 @@ +require 'json' + +module Gitlab + module StorageCheck + class Response + attr_reader :http_response + + def initialize(http_response) + @http_response = http_response + end + + def valid? 
+ @http_response && (200...299).cover?(@http_response.status) && + @http_response.headers['Content-Type'].include?('application/json') && + parsed_response + end + + def check_interval + return nil unless parsed_response + + parsed_response['check_interval'] + end + + def responsive_shards + divided_results[:responsive_shards] + end + + def skipped_shards + divided_results[:skipped_shards] + end + + def failing_shards + divided_results[:failing_shards] + end + + private + + def results + return [] unless parsed_response + + parsed_response['results'] + end + + def divided_results + return @divided_results if @divided_results + + @divided_results = {} + @divided_results[:responsive_shards] = [] + @divided_results[:skipped_shards] = [] + @divided_results[:failing_shards] = [] + + results.each do |info| + name = info['storage'] + + case info['success'] + when true + @divided_results[:responsive_shards] << name + when false + @divided_results[:failing_shards] << name + else + @divided_results[:skipped_shards] << name + end + end + + @divided_results + end + + def parsed_response + return @parsed_response if defined?(@parsed_response) + + @parsed_response = JSON.parse(@http_response.body) + rescue JSON::JSONError + @parsed_response = nil + end + end + end +end diff --git a/spec/bin/storage_check_spec.rb b/spec/bin/storage_check_spec.rb new file mode 100644 index 00000000000..02f6fcb6e3a --- /dev/null +++ b/spec/bin/storage_check_spec.rb @@ -0,0 +1,13 @@ +require 'spec_helper' + +describe 'bin/storage_check' do + it 'is executable' do + command = %w[bin/storage_check -t unix://the/path/to/a/unix-socket.sock -i 10 -d] + expected_output = 'Checking unix://the/path/to/a/unix-socket.sock every 10 seconds' + + output, status = Gitlab::Popen.popen(command, Rails.root.to_s) + + expect(status).to eq(0) + expect(output).to include(expected_output) + end +end diff --git a/spec/controllers/admin/health_check_controller_spec.rb b/spec/controllers/admin/health_check_controller_spec.rb index 0b8e0c8a065..d15ee0021d9 100644 --- a/spec/controllers/admin/health_check_controller_spec.rb +++ b/spec/controllers/admin/health_check_controller_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Admin::HealthCheckController, broken_storage: true do +describe Admin::HealthCheckController do let(:admin) { create(:admin) } before do @@ -17,7 +17,7 @@ describe Admin::HealthCheckController, broken_storage: true do describe 'POST reset_storage_health' do it 'resets all storage health information' do - expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!) + expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!) 
post :reset_storage_health end diff --git a/spec/controllers/health_controller_spec.rb b/spec/controllers/health_controller_spec.rb index 9e9cf4f2c1f..95946def5f9 100644 --- a/spec/controllers/health_controller_spec.rb +++ b/spec/controllers/health_controller_spec.rb @@ -14,6 +14,48 @@ describe HealthController do stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false') end + describe '#storage_check' do + before do + allow(Gitlab::RequestContext).to receive(:client_ip).and_return(whitelisted_ip) + end + + subject { post :storage_check } + + it 'checks all the configured storages' do + expect(Gitlab::Git::Storage::Checker).to receive(:check_all).and_call_original + + subject + end + + it 'returns the check interval' do + stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'true') + stub_application_setting(circuitbreaker_check_interval: 10) + + subject + + expect(json_response['check_interval']).to eq(10) + end + + context 'with failing storages', :broken_storage do + before do + stub_storage_settings( + broken: { path: 'tmp/tests/non-existent-repositories' } + ) + end + + it 'includes the failure information' do + subject + + expected_results = [ + { 'storage' => 'broken', 'success' => false }, + { 'storage' => 'default', 'success' => true } + ] + + expect(json_response['results']).to eq(expected_results) + end + end + end + describe '#readiness' do shared_context 'endpoint responding with readiness data' do let(:request_params) { {} } diff --git a/spec/features/admin/admin_health_check_spec.rb b/spec/features/admin/admin_health_check_spec.rb index 4430fc15501..ac3392b49f9 100644 --- a/spec/features/admin/admin_health_check_spec.rb +++ b/spec/features/admin/admin_health_check_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -feature "Admin Health Check", :feature, :broken_storage do +feature "Admin Health Check", :feature do include StubENV before do @@ -36,6 +36,7 @@ feature "Admin Health Check", :feature, :broken_storage do context 'when services are up' do before do + stub_storage_settings({}) # Hide the broken storage visit admin_health_check_path end @@ -56,10 +57,8 @@ feature "Admin Health Check", :feature, :broken_storage do end end - context 'with repository storage failures' do + context 'with repository storage failures', :broken_storage do before do - # Track a failure - Gitlab::Git::Storage::CircuitBreaker.for_storage('broken').perform { nil } rescue nil visit admin_health_check_path end @@ -67,9 +66,10 @@ feature "Admin Health Check", :feature, :broken_storage do hostname = Gitlab::Environment.hostname maximum_failures = Gitlab::CurrentSettings.current_application_settings .circuitbreaker_failure_count_threshold + number_of_failures = maximum_failures + 1 - expect(page).to have_content('broken: failed storage access attempt on host:') - expect(page).to have_content("#{hostname}: 1 of #{maximum_failures} failures.") + expect(page).to have_content("broken: #{number_of_failures} failed storage access attempts:") + expect(page).to have_content("#{hostname}: #{number_of_failures} of #{maximum_failures} failures.") end it 'allows resetting storage failures' do diff --git a/spec/lib/gitlab/git/storage/checker_spec.rb b/spec/lib/gitlab/git/storage/checker_spec.rb new file mode 100644 index 00000000000..d74c3bcb04c --- /dev/null +++ b/spec/lib/gitlab/git/storage/checker_spec.rb @@ -0,0 +1,132 @@ +require 'spec_helper' + +describe Gitlab::Git::Storage::Checker, :clean_gitlab_redis_shared_state do + let(:storage_name) { 'default' } + let(:hostname) { Gitlab::Environment.hostname } + let(:cache_key) { 
"storage_accessible:#{storage_name}:#{hostname}" } + + subject(:checker) { described_class.new(storage_name) } + + def value_from_redis(name) + Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, name) + end.first + end + + def set_in_redis(name, value) + Gitlab::Git::Storage.redis.with do |redis| + redis.hmset(cache_key, name, value) + end.first + end + + describe '.check_all' do + it 'calls a check for each storage' do + fake_checker_default = double + fake_checker_broken = double + fake_logger = fake_logger + + expect(described_class).to receive(:new).with('default', fake_logger) { fake_checker_default } + expect(described_class).to receive(:new).with('broken', fake_logger) { fake_checker_broken } + expect(fake_checker_default).to receive(:check_with_lease) + expect(fake_checker_broken).to receive(:check_with_lease) + + described_class.check_all(fake_logger) + end + + context 'with broken storage', :broken_storage do + it 'returns the results' do + expected_result = [ + { storage: 'default', success: true }, + { storage: 'broken', success: false } + ] + + expect(described_class.check_all).to eq(expected_result) + end + end + end + + describe '#initialize' do + it 'assigns the settings' do + expect(checker.hostname).to eq(hostname) + expect(checker.storage).to eq('default') + expect(checker.storage_path).to eq(TestEnv.repos_path) + end + end + + describe '#check_with_lease' do + it 'only allows one check at a time' do + expect(checker).to receive(:check).once { sleep 1 } + + thread = Thread.new { checker.check_with_lease } + checker.check_with_lease + thread.join + end + + it 'returns a result hash' do + expect(checker.check_with_lease).to eq(storage: 'default', success: true) + end + end + + describe '#check' do + it 'tracks that the storage was accessible' do + set_in_redis(:failure_count, 10) + set_in_redis(:last_failure, Time.now.to_f) + + checker.check + + expect(value_from_redis(:failure_count).to_i).to eq(0) + expect(value_from_redis(:last_failure)).to be_empty + expect(value_from_redis(:first_failure)).to be_empty + end + + it 'calls the check with the correct arguments' do + stub_application_setting(circuitbreaker_storage_timeout: 30, + circuitbreaker_access_retries: 3) + + expect(Gitlab::Git::Storage::ForkedStorageCheck) + .to receive(:storage_available?).with(TestEnv.repos_path, 30, 3) + .and_call_original + + checker.check + end + + it 'returns `true`' do + expect(checker.check).to eq(true) + end + + it 'maintains known storage keys' do + Timecop.freeze do + # Insert an old key to expire + old_entry = Time.now.to_i - 3.days.to_i + Gitlab::Git::Storage.redis.with do |redis| + redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed') + end + + checker.check + + known_keys = Gitlab::Git::Storage.redis.with do |redis| + redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1) + end + + expect(known_keys).to contain_exactly(cache_key) + end + end + + context 'the storage is not available', :broken_storage do + let(:storage_name) { 'broken' } + + it 'tracks that the storage was inaccessible' do + Timecop.freeze do + expect { checker.check }.to change { value_from_redis(:failure_count).to_i }.by(1) + + expect(value_from_redis(:last_failure)).not_to be_empty + expect(value_from_redis(:first_failure)).not_to be_empty + end + end + + it 'returns `false`' do + expect(checker.check).to eq(false) + end + end + end +end diff --git a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb index 
f34c9f09057..210b90bfba9 100644 --- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb @@ -1,11 +1,18 @@ require 'spec_helper' -describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: true, broken_storage: true do +describe Gitlab::Git::Storage::CircuitBreaker, :broken_storage do let(:storage_name) { 'default' } let(:circuit_breaker) { described_class.new(storage_name, hostname) } let(:hostname) { Gitlab::Environment.hostname } let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" } + def set_in_redis(name, value) + Gitlab::Git::Storage.redis.with do |redis| + redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key) + redis.hmset(cache_key, name, value) + end.first + end + before do # Override test-settings for the circuitbreaker with something more realistic # for these specs. @@ -19,36 +26,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ) end - def value_from_redis(name) - Gitlab::Git::Storage.redis.with do |redis| - redis.hmget(cache_key, name) - end.first - end - - def set_in_redis(name, value) - Gitlab::Git::Storage.redis.with do |redis| - redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key) - redis.hmset(cache_key, name, value) - end.first - end - - describe '.reset_all!' do - it 'clears all entries form redis' do - set_in_redis(:failure_count, 10) - - described_class.reset_all! - - key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) } - - expect(key_exists).to be_falsey - end - - it 'does not break when there are no keys in redis' do - expect { described_class.reset_all! }.not_to raise_error - end - end - - describe '.for_storage' do + describe '.for_storage', :request_store do it 'only builds a single circuitbreaker per storage' do expect(described_class).to receive(:new).once.and_call_original @@ -71,7 +49,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: it 'assigns the settings' do expect(circuit_breaker.hostname).to eq(hostname) expect(circuit_breaker.storage).to eq('default') - expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path) end end @@ -91,9 +68,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: end end - describe '#failure_wait_time' do + describe '#check_interval' do it 'reads the value from settings' do - expect(circuit_breaker.failure_wait_time).to eq(1) + expect(circuit_breaker.check_interval).to eq(1) end end @@ -114,12 +91,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(circuit_breaker.access_retries).to eq(4) end end - - describe '#backoff_threshold' do - it 'reads the value from settings' do - expect(circuit_breaker.backoff_threshold).to eq(5) - end - end end describe '#perform' do @@ -134,19 +105,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: end end - it 'raises the correct exception when backing off' do - Timecop.freeze do - set_in_redis(:last_failure, 1.second.ago.to_f) - set_in_redis(:failure_count, 90) - - expect { |b| circuit_breaker.perform(&b) } - .to raise_error do |exception| - expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing) - expect(exception.retry_after).to eq(30) - end - end - end - it 'yields the block' do expect { |b| circuit_breaker.perform(&b) } .to yield_control @@ -170,54 +128,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: .to raise_error(Rugged::OSError) 
end - it 'tracks that the storage was accessible' do - set_in_redis(:failure_count, 10) - set_in_redis(:last_failure, Time.now.to_f) - - circuit_breaker.perform { '' } - - expect(value_from_redis(:failure_count).to_i).to eq(0) - expect(value_from_redis(:last_failure)).to be_empty - expect(circuit_breaker.failure_count).to eq(0) - expect(circuit_breaker.last_failure).to be_nil - end - - it 'maintains known storage keys' do - Timecop.freeze do - # Insert an old key to expire - old_entry = Time.now.to_i - 3.days.to_i - Gitlab::Git::Storage.redis.with do |redis| - redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed') - end - - circuit_breaker.perform { '' } - - known_keys = Gitlab::Git::Storage.redis.with do |redis| - redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1) - end - - expect(known_keys).to contain_exactly(cache_key) - end - end - - it 'only performs the accessibility check once' do - expect(Gitlab::Git::Storage::ForkedStorageCheck) - .to receive(:storage_available?).once.and_call_original - - 2.times { circuit_breaker.perform { '' } } - end - - it 'calls the check with the correct arguments' do - stub_application_setting(circuitbreaker_storage_timeout: 30, - circuitbreaker_access_retries: 3) - - expect(Gitlab::Git::Storage::ForkedStorageCheck) - .to receive(:storage_available?).with(TestEnv.repos_path, 30, 3) - .and_call_original - - circuit_breaker.perform { '' } - end - context 'with the feature disabled' do before do stub_feature_flags(git_storage_circuit_breaker: false) @@ -240,31 +150,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(result).to eq('hello') end end - - context 'the storage is not available' do - let(:storage_name) { 'broken' } - - it 'raises the correct exception' do - expect(circuit_breaker).to receive(:track_storage_inaccessible) - - expect { circuit_breaker.perform { '' } } - .to raise_error do |exception| - expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible) - expect(exception.retry_after).to eq(30) - end - end - - it 'tracks that the storage was inaccessible' do - Timecop.freeze do - expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible) - - expect(value_from_redis(:failure_count).to_i).to eq(1) - expect(value_from_redis(:last_failure)).not_to be_empty - expect(circuit_breaker.failure_count).to eq(1) - expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now) - end - end - end end describe '#circuit_broken?' do @@ -283,32 +168,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: end end - describe '#backing_off?' 
do - it 'is true when there was a recent failure' do - Timecop.freeze do - set_in_redis(:last_failure, 1.second.ago.to_f) - set_in_redis(:failure_count, 90) - - expect(circuit_breaker.backing_off?).to be_truthy - end - end - - context 'the `failure_wait_time` is set to 0' do - before do - stub_application_setting(circuitbreaker_failure_wait_time: 0) - end - - it 'is working even when there are failures' do - Timecop.freeze do - set_in_redis(:last_failure, 0.seconds.ago.to_f) - set_in_redis(:failure_count, 90) - - expect(circuit_breaker.backing_off?).to be_falsey - end - end - end - end - describe '#last_failure' do it 'returns the last failure time' do time = Time.parse("2017-05-26 17:52:30") diff --git a/spec/lib/gitlab/git/storage/failure_info_spec.rb b/spec/lib/gitlab/git/storage/failure_info_spec.rb new file mode 100644 index 00000000000..bae88fdda86 --- /dev/null +++ b/spec/lib/gitlab/git/storage/failure_info_spec.rb @@ -0,0 +1,70 @@ +require 'spec_helper' + +describe Gitlab::Git::Storage::FailureInfo, :broken_storage do + let(:storage_name) { 'default' } + let(:hostname) { Gitlab::Environment.hostname } + let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" } + + def value_from_redis(name) + Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, name) + end.first + end + + def set_in_redis(name, value) + Gitlab::Git::Storage.redis.with do |redis| + redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key) + redis.hmset(cache_key, name, value) + end.first + end + + describe '.reset_all!' do + it 'clears all entries form redis' do + set_in_redis(:failure_count, 10) + + described_class.reset_all! + + key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) } + + expect(key_exists).to be_falsey + end + + it 'does not break when there are no keys in redis' do + expect { described_class.reset_all! }.not_to raise_error + end + end + + describe '.load' do + it 'loads failure information for a storage on a host' do + first_failure = Time.parse("2017-11-14 17:52:30") + last_failure = Time.parse("2017-11-14 18:54:37") + failure_count = 11 + + set_in_redis(:first_failure, first_failure.to_i) + set_in_redis(:last_failure, last_failure.to_i) + set_in_redis(:failure_count, failure_count.to_i) + + info = described_class.load(cache_key) + + expect(info.first_failure).to eq(first_failure) + expect(info.last_failure).to eq(last_failure) + expect(info.failure_count).to eq(failure_count) + end + end + + describe '#no_failures?' 
do + it 'is true when there are no failures' do + info = described_class.new(nil, nil, 0) + + expect(info.no_failures?).to be_truthy + end + + it 'is false when there are failures' do + info = described_class.new(Time.parse("2017-11-14 17:52:30"), + Time.parse("2017-11-14 18:54:37"), + 20) + + expect(info.no_failures?).to be_falsy + end + end +end diff --git a/spec/lib/gitlab/git/storage/health_spec.rb b/spec/lib/gitlab/git/storage/health_spec.rb index d7a52a04fbb..bb670fc5d94 100644 --- a/spec/lib/gitlab/git/storage/health_spec.rb +++ b/spec/lib/gitlab/git/storage/health_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Gitlab::Git::Storage::Health, clean_gitlab_redis_shared_state: true, broken_storage: true do +describe Gitlab::Git::Storage::Health, broken_storage: true do let(:host1_key) { 'storage_accessible:broken:web01' } let(:host2_key) { 'storage_accessible:default:kiq01' } diff --git a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb index 5db37f55e03..93ad20011de 100644 --- a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb @@ -27,7 +27,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do end describe '#failure_info' do - it { Timecop.freeze { expect(breaker.failure_info).to eq(Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(Time.now, breaker.failure_count_threshold)) } } + it { expect(breaker.failure_info.no_failures?).to be_falsy } end end @@ -49,7 +49,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do end describe '#failure_info' do - it { expect(breaker.failure_info).to eq(Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(nil, 0)) } + it { expect(breaker.failure_info.no_failures?).to be_truthy } end end diff --git a/spec/lib/gitlab/storage_check/cli_spec.rb b/spec/lib/gitlab/storage_check/cli_spec.rb new file mode 100644 index 00000000000..6db0925899c --- /dev/null +++ b/spec/lib/gitlab/storage_check/cli_spec.rb @@ -0,0 +1,19 @@ +require 'spec_helper' + +describe Gitlab::StorageCheck::CLI do + let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, 1, false) } + subject(:runner) { described_class.new(options) } + + describe '#update_settings' do + it 'updates the interval when changed in a valid response and logs the change' do + fake_response = double + expect(fake_response).to receive(:valid?).and_return(true) + expect(fake_response).to receive(:check_interval).and_return(42) + expect(runner.logger).to receive(:info) + + runner.update_settings(fake_response) + + expect(options.interval).to eq(42) + end + end +end diff --git a/spec/lib/gitlab/storage_check/gitlab_caller_spec.rb b/spec/lib/gitlab/storage_check/gitlab_caller_spec.rb new file mode 100644 index 00000000000..d869022fd31 --- /dev/null +++ b/spec/lib/gitlab/storage_check/gitlab_caller_spec.rb @@ -0,0 +1,46 @@ +require 'spec_helper' + +describe Gitlab::StorageCheck::GitlabCaller do + let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, nil, false) } + subject(:gitlab_caller) { described_class.new(options) } + + describe '#call!' do + context 'when a socket is given' do + it 'calls a socket' do + fake_connection = double + expect(fake_connection).to receive(:post) + expect(Excon).to receive(:new).with('unix://tmp/socket.sock', socket: "tmp/socket.sock") { fake_connection } + + gitlab_caller.call! 
+ end + end + + context 'when a host is given' do + let(:options) { Gitlab::StorageCheck::Options.new('http://localhost:8080', nil, nil, false) } + + it 'it calls a http response' do + fake_connection = double + expect(Excon).to receive(:new).with('http://localhost:8080', socket: nil) { fake_connection } + expect(fake_connection).to receive(:post) + + gitlab_caller.call! + end + end + end + + describe '#headers' do + it 'Adds the JSON header' do + headers = gitlab_caller.headers + + expect(headers['Content-Type']).to eq('application/json') + end + + context 'when a token was provided' do + let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', 'atoken', nil, false) } + + it 'adds it to the headers' do + expect(gitlab_caller.headers['TOKEN']).to eq('atoken') + end + end + end +end diff --git a/spec/lib/gitlab/storage_check/option_parser_spec.rb b/spec/lib/gitlab/storage_check/option_parser_spec.rb new file mode 100644 index 00000000000..cad4dfbefcf --- /dev/null +++ b/spec/lib/gitlab/storage_check/option_parser_spec.rb @@ -0,0 +1,31 @@ +require 'spec_helper' + +describe Gitlab::StorageCheck::OptionParser do + describe '.parse!' do + it 'assigns all options' do + args = %w(--target unix://tmp/hello/world.sock --token thetoken --interval 42) + + options = described_class.parse!(args) + + expect(options.token).to eq('thetoken') + expect(options.interval).to eq(42) + expect(options.target).to eq('unix://tmp/hello/world.sock') + end + + it 'requires the interval to be a number' do + args = %w(--target unix://tmp/hello/world.sock --interval fortytwo) + + expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument) + end + + it 'raises an error if the scheme is not included' do + args = %w(--target tmp/hello/world.sock) + + expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument) + end + + it 'raises an error if both socket and host are missing' do + expect { described_class.parse!([]) }.to raise_error(OptionParser::InvalidArgument) + end + end +end diff --git a/spec/lib/gitlab/storage_check/response_spec.rb b/spec/lib/gitlab/storage_check/response_spec.rb new file mode 100644 index 00000000000..0ff2963e443 --- /dev/null +++ b/spec/lib/gitlab/storage_check/response_spec.rb @@ -0,0 +1,54 @@ +require 'spec_helper' + +describe Gitlab::StorageCheck::Response do + let(:fake_json) do + { + check_interval: 42, + results: [ + { storage: 'working', success: true }, + { storage: 'skipped', success: nil }, + { storage: 'failing', success: false } + ] + }.to_json + end + + let(:fake_http_response) do + fake_response = instance_double("Excon::Response - Status check") + allow(fake_response).to receive(:status).and_return(200) + allow(fake_response).to receive(:body).and_return(fake_json) + allow(fake_response).to receive(:headers).and_return('Content-Type' => 'application/json') + + fake_response + end + let(:response) { described_class.new(fake_http_response) } + + describe '#valid?' 
do + it 'is valid for a success response with parseable JSON' do + expect(response).to be_valid + end + end + + describe '#check_interval' do + it 'returns the result from the JSON' do + expect(response.check_interval).to eq(42) + end + end + + describe '#responsive_shards' do + it 'contains the names of working shards' do + expect(response.responsive_shards).to contain_exactly('working') + end + end + + describe '#skipped_shards' do + it 'contains the names of skipped shards' do + expect(response.skipped_shards).to contain_exactly('skipped') + end + end + + describe '#failing_shards' do + it 'contains the name of failing shards' do + expect(response.failing_shards).to contain_exactly('failing') + end + end +end diff --git a/spec/models/application_setting_spec.rb b/spec/models/application_setting_spec.rb index 0b7e16cc33c..ef480e7a80a 100644 --- a/spec/models/application_setting_spec.rb +++ b/spec/models/application_setting_spec.rb @@ -115,9 +115,8 @@ describe ApplicationSetting do end context 'circuitbreaker settings' do - [:circuitbreaker_backoff_threshold, - :circuitbreaker_failure_count_threshold, - :circuitbreaker_failure_wait_time, + [:circuitbreaker_failure_count_threshold, + :circuitbreaker_check_interval, :circuitbreaker_failure_reset_time, :circuitbreaker_storage_timeout].each do |field| it "Validates #{field} as number" do @@ -126,16 +125,6 @@ describe ApplicationSetting do .is_greater_than_or_equal_to(0) end end - - it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do - setting.circuitbreaker_failure_count_threshold = 10 - setting.circuitbreaker_backoff_threshold = 15 - failure_message = "The circuitbreaker backoff threshold should be lower "\ - "than the failure count threshold" - - expect(setting).not_to be_valid - expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message) - end end context 'repository storages' do diff --git a/spec/models/repository_spec.rb b/spec/models/repository_spec.rb index 82ed1ecee33..358bc3dfb94 100644 --- a/spec/models/repository_spec.rb +++ b/spec/models/repository_spec.rb @@ -29,7 +29,9 @@ describe Repository do def expect_to_raise_storage_error expect { yield }.to raise_error do |exception| storage_exceptions = [Gitlab::Git::Storage::Inaccessible, Gitlab::Git::CommandError, GRPC::Unavailable] - expect(exception.class).to be_in(storage_exceptions) + known_exception = storage_exceptions.select { |e| exception.is_a?(e) } + + expect(known_exception).not_to be_nil end end @@ -634,9 +636,7 @@ describe Repository do end describe '#fetch_ref' do - # Setting the var here, sidesteps the stub that makes gitaly raise an error - # before the actual test call - set(:broken_repository) { create(:project, :broken_storage).repository } + let(:broken_repository) { create(:project, :broken_storage).repository } describe 'when storage is broken', :broken_storage do it 'should raise a storage error' do diff --git a/spec/requests/api/circuit_breakers_spec.rb b/spec/requests/api/circuit_breakers_spec.rb index 3b858c40fd6..fe76f057115 100644 --- a/spec/requests/api/circuit_breakers_spec.rb +++ b/spec/requests/api/circuit_breakers_spec.rb @@ -47,7 +47,7 @@ describe API::CircuitBreakers do describe 'DELETE circuit_breakers/repository_storage' do it 'clears all circuit_breakers' do - expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!) + expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!) 
delete api('/circuit_breakers/repository_storage', admin) diff --git a/spec/requests/api/settings_spec.rb b/spec/requests/api/settings_spec.rb index 63175c40a18..015d4b9a491 100644 --- a/spec/requests/api/settings_spec.rb +++ b/spec/requests/api/settings_spec.rb @@ -54,7 +54,7 @@ describe API::Settings, 'Settings' do dsa_key_restriction: 2048, ecdsa_key_restriction: 384, ed25519_key_restriction: 256, - circuitbreaker_failure_wait_time: 2 + circuitbreaker_check_interval: 2 expect(response).to have_gitlab_http_status(200) expect(json_response['default_projects_limit']).to eq(3) @@ -75,7 +75,7 @@ describe API::Settings, 'Settings' do expect(json_response['dsa_key_restriction']).to eq(2048) expect(json_response['ecdsa_key_restriction']).to eq(384) expect(json_response['ed25519_key_restriction']).to eq(256) - expect(json_response['circuitbreaker_failure_wait_time']).to eq(2) + expect(json_response['circuitbreaker_check_interval']).to eq(2) end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 242a2230b67..f94fb8733d5 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -121,18 +121,6 @@ RSpec.configure do |config| reset_delivered_emails! end - # Stub the `ForkedStorageCheck.storage_available?` method unless - # `:broken_storage` metadata is defined - # - # This check can be slow and is unnecessary in a test environment where we - # know the storage is available, because we create it at runtime - config.before(:example) do |example| - unless example.metadata[:broken_storage] - allow(Gitlab::Git::Storage::ForkedStorageCheck) - .to receive(:storage_available?).and_return(true) - end - end - config.around(:each, :use_clean_rails_memory_store_caching) do |example| caching_store = Rails.cache Rails.cache = ActiveSupport::Cache::MemoryStore.new diff --git a/spec/support/stored_repositories.rb b/spec/support/stored_repositories.rb index f3deae0f455..f9121cce985 100644 --- a/spec/support/stored_repositories.rb +++ b/spec/support/stored_repositories.rb @@ -12,6 +12,25 @@ RSpec.configure do |config| raise GRPC::Unavailable.new('Gitaly broken in this spec') end - Gitlab::Git::Storage::CircuitBreaker.reset_all! + # Track the maximum number of failures + first_failure = Time.parse("2017-11-14 17:52:30") + last_failure = Time.parse("2017-11-14 18:54:37") + failure_count = Gitlab::CurrentSettings + .current_application_settings + .circuitbreaker_failure_count_threshold + 1 + cache_key = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}broken:#{Gitlab::Environment.hostname}" + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key) + redis.hset(cache_key, :first_failure, first_failure.to_i) + redis.hset(cache_key, :last_failure, last_failure.to_i) + redis.hset(cache_key, :failure_count, failure_count.to_i) + end + end + end + + config.after(:each, :broken_storage) do + Gitlab::Git::Storage.redis.with(&:flushall) end end diff --git a/spec/support/stub_configuration.rb b/spec/support/stub_configuration.rb index 4ead78529c3..b36cf3c544c 100644 --- a/spec/support/stub_configuration.rb +++ b/spec/support/stub_configuration.rb @@ -43,6 +43,8 @@ module StubConfiguration end def stub_storage_settings(messages) + messages.deep_stringify_keys! + # Default storage is always required messages['default'] ||= Gitlab.config.repositories.storages.default messages.each do |storage_name, storage_settings| |