diff options
author | Bob Van Landuyt <bob@gitlab.com> | 2017-05-17 18:17:15 +0200 |
---|---|---|
committer | Bob Van Landuyt <bob@vanlanduyt.co> | 2017-08-04 15:38:48 +0200 |
commit | 3598e60bf20b185b3f8d4e9a88a8eff39c8f729b (patch) | |
tree | ba84a7e7972d4a2563bb79485933fb78462868de /lib | |
parent | 990feb9f2b886c5bd0ac37339f149b8e80202019 (diff) | |
download | gitlab-ce-3598e60bf20b185b3f8d4e9a88a8eff39c8f729b.tar.gz |
Add a Circuitbreaker for storage paths
Diffstat (limited to 'lib')
-rw-r--r-- | lib/api/api.rb | 1 | ||||
-rw-r--r-- | lib/api/circuit_breakers.rb | 50 | ||||
-rw-r--r-- | lib/api/entities.rb | 6 | ||||
-rw-r--r-- | lib/gitlab/git/repository.rb | 8 | ||||
-rw-r--r-- | lib/gitlab/git/storage.rb | 22 | ||||
-rw-r--r-- | lib/gitlab/git/storage/circuit_breaker.rb | 142 | ||||
-rw-r--r-- | lib/gitlab/git/storage/forked_storage_check.rb | 59 | ||||
-rw-r--r-- | lib/gitlab/git/storage/health.rb | 101 | ||||
-rw-r--r-- | lib/gitlab/health_checks/fs_shards_check.rb | 21 |
9 files changed, 407 insertions, 3 deletions
diff --git a/lib/api/api.rb b/lib/api/api.rb index 982a2b88d62..94df543853b 100644 --- a/lib/api/api.rb +++ b/lib/api/api.rb @@ -95,6 +95,7 @@ module API mount ::API::Boards mount ::API::Branches mount ::API::BroadcastMessages + mount ::API::CircuitBreakers mount ::API::Commits mount ::API::CommitStatuses mount ::API::DeployKeys diff --git a/lib/api/circuit_breakers.rb b/lib/api/circuit_breakers.rb new file mode 100644 index 00000000000..118883f5ea5 --- /dev/null +++ b/lib/api/circuit_breakers.rb @@ -0,0 +1,50 @@ +module API + class CircuitBreakers < Grape::API + before { authenticated_as_admin! } + + resource :circuit_breakers do + params do + requires :type, + type: String, + desc: "The type of circuitbreaker", + values: ['repository_storage'] + end + resource ':type' do + namespace '', requirements: { type: 'repository_storage' } do + helpers do + def failing_storage_health + @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages + end + + def storage_health + @failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages + end + end + + desc 'Get all failing git storages' do + detail 'This feature was introduced in GitLab 9.5' + success Entities::RepositoryStorageHealth + end + get do + present storage_health, with: Entities::RepositoryStorageHealth + end + + desc 'Get all failing git storages' do + detail 'This feature was introduced in GitLab 9.5' + success Entities::RepositoryStorageHealth + end + get 'failing' do + present failing_storage_health, with: Entities::RepositoryStorageHealth + end + + desc 'Reset all storage failures and open circuitbreaker' do + detail 'This feature was introduced in GitLab 9.5' + end + delete do + Gitlab::Git::Storage::CircuitBreaker.reset_all! + end + end + end + end + end +end diff --git a/lib/api/entities.rb b/lib/api/entities.rb index 298831a8fdb..f25b408439a 100644 --- a/lib/api/entities.rb +++ b/lib/api/entities.rb @@ -954,5 +954,11 @@ module API expose :ip_address expose :submitted, as: :akismet_submitted end + + class RepositoryStorageHealth < Grape::Entity + expose :storage_name + expose :failing_on_hosts + expose :total_failures + end end end diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb index ffe2c8b91bb..aa3252e1df8 100644 --- a/lib/gitlab/git/repository.rb +++ b/lib/gitlab/git/repository.rb @@ -64,11 +64,17 @@ module Gitlab end def rugged - @rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories) + @rugged ||= circuit_breaker.perform do + Rugged::Repository.new(path, alternates: alternate_object_directories) + end rescue Rugged::RepositoryError, Rugged::OSError raise NoRepository.new('no repository for such path') end + def circuit_breaker + @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage) + end + # Returns an Array of branch names # sorted by name ASC def branch_names diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb new file mode 100644 index 00000000000..e28be4b8a38 --- /dev/null +++ b/lib/gitlab/git/storage.rb @@ -0,0 +1,22 @@ +module Gitlab + module Git + module Storage + class Inaccessible < StandardError + attr_reader :retry_after + + def initialize(message = nil, retry_after = nil) + super(message) + @retry_after = retry_after + end + end + + CircuitOpen = Class.new(Inaccessible) + + REDIS_KEY_PREFIX = 'storage_accessible:'.freeze + + def self.redis + Gitlab::Redis::SharedState + end + end + end +end diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb new file mode 100644 index 00000000000..c722771e0d5 --- /dev/null +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -0,0 +1,142 @@ +module Gitlab + module Git + module Storage + class CircuitBreaker + attr_reader :storage, + :hostname, + :storage_path, + :failure_count_threshold, + :failure_wait_time, + :failure_reset_time, + :storage_timeout + + def self.reset_all! + pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*" + + Gitlab::Git::Storage.redis.with do |redis| + all_storage_keys = redis.scan_each(match: pattern).to_a + redis.del(*all_storage_keys) unless all_storage_keys.empty? + end + + RequestStore.delete(:circuitbreaker_cache) + end + + def self.for_storage(storage) + cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do + Hash.new do |hash, storage_name| + hash[storage_name] = new(storage_name) + end + end + + cached_circuitbreakers[storage] + end + + def initialize(storage, hostname = Gitlab.config.gitlab.hostname) + @storage = storage + @hostname = hostname + + config = Gitlab.config.repositories.storages[@storage] + @storage_path = config['path'] + @failure_count_threshold = config['failure_count_threshold'] + @failure_wait_time = config['failure_wait_time'] + @failure_reset_time = config['failure_reset_time'] + @storage_timeout = config['storage_timeout'] + end + + def perform + return yield unless Feature.enabled?('git_storage_circuit_breaker') + + if circuit_broken? + raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} open", failure_wait_time) + end + + check_storage_accessible! + + yield + end + + def circuit_broken? + return false if no_failures? + + recent_failure = last_failure > failure_wait_time.seconds.ago + too_many_failures = failure_count > failure_count_threshold + + recent_failure || too_many_failures + end + + # Memoizing the `storage_available` call means we only do it once per + # request when the storage is available. + # + # When the storage appears not available, and the memoized value is `false` + # we might want to try again. + def storage_available? + @storage_available ||= Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout) + end + + def check_storage_accessible! + if storage_available? + track_storage_accessible + else + track_storage_inaccessible + raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time) + end + end + + def no_failures? + last_failure.blank? && failure_count == 0 + end + + def track_storage_inaccessible + @failure_info = [Time.now, failure_count + 1] + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :last_failure, last_failure.to_i) + redis.hincrby(cache_key, :failure_count, 1) + redis.expire(cache_key, failure_reset_time) + end + end + end + + def track_storage_accessible + return if no_failures? + + @failure_info = [nil, 0] + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :last_failure, nil) + redis.hset(cache_key, :failure_count, 0) + end + end + end + + def last_failure + failure_info.first + end + + def failure_count + failure_info.last + end + + def failure_info + @failure_info ||= get_failure_info + end + + def get_failure_info + last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, :last_failure, :failure_count) + end + + last_failure = Time.at(last_failure.to_i) if last_failure.present? + + [last_failure, failure_count.to_i] + end + + def cache_key + @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" + end + end + end + end +end diff --git a/lib/gitlab/git/storage/forked_storage_check.rb b/lib/gitlab/git/storage/forked_storage_check.rb new file mode 100644 index 00000000000..557a3b0e61d --- /dev/null +++ b/lib/gitlab/git/storage/forked_storage_check.rb @@ -0,0 +1,59 @@ +module Gitlab + module Git + module Storage + module ForkedStorageCheck + extend self + + def storage_available?(path, timeout_seconds = 5) + status = timeout_check(path, timeout_seconds) + + status.success? + end + + def timeout_check(path, timeout_seconds) + filesystem_check_pid = check_filesystem_in_fork(path) + + deadline = timeout_seconds.seconds.from_now.utc + wait_time = 0.01 + status = nil + + while status.nil? + if deadline > Time.now.utc + sleep(wait_time) + _pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG) + else + Process.kill('KILL', filesystem_check_pid) + # Blocking wait, so we are sure the process is gone before continuing + _pid, status = Process.wait2(filesystem_check_pid) + end + end + + status + end + + # This call forks out into a process, that process will then be replaced + # With an `exec` call, since we fork out into a shell, we can create a + # child process without needing an ActiveRecord-connection. + # + # Inside the shell, we use `& wait` to fork another child. We do this + # to prevent leaving a zombie process when the parent gets killed by the + # timeout. + # + # https://stackoverflow.com/questions/27892975/what-causes-activerecord-breaking-postgres-connection-after-forking + # https://stackoverflow.com/questions/22012943/activerecordstatementinvalid-runtimeerror-the-connection-cannot-be-reused-in + def check_filesystem_in_fork(path) + fork do + STDOUT.reopen('/dev/null') + STDERR.reopen('/dev/null') + + exec("(#{test_script(path)}) & wait %1") + end + end + + def test_script(path) + "testpath=\"$(realpath #{Shellwords.escape(path)})\" && stat $testpath" + end + end + end + end +end diff --git a/lib/gitlab/git/storage/health.rb b/lib/gitlab/git/storage/health.rb new file mode 100644 index 00000000000..0a28052fd81 --- /dev/null +++ b/lib/gitlab/git/storage/health.rb @@ -0,0 +1,101 @@ +module Gitlab + module Git + module Storage + class Health + attr_reader :storage_name, :info + + def self.pattern_for_storage(storage_name) + "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*" + end + + def self.for_all_storages + storage_names = Gitlab.config.repositories.storages.keys + results_per_storage = nil + + Gitlab::Git::Storage.redis.with do |redis| + keys_per_storage = all_keys_for_storages(storage_names, redis) + + # We need to make sure all keys are actually loaded as an array. + # Otherwise when using the enumerator of the `scan_each` within a + # second pipeline, it will be assumed unloaded, wich would make the + # result unusable inside the pipeline. + loaded_keys_per_storage = keys_per_storage.inject({}) do |loaded_keys, (storage_name, keys)| + loaded_keys[storage_name] = keys.to_a + loaded_keys + end + + results_per_storage = load_for_keys(loaded_keys_per_storage, redis) + end + + results_per_storage.map do |name, info| + info.each { |i| i[:failure_count] = i[:failure_count].value.to_i } + new(name, info) + end + end + + def self.all_keys_for_storages(storage_names, redis) + keys_per_storage = nil + + redis.pipelined do + keys_per_storage = storage_names.inject({}) do |result, storage_name| + key = pattern_for_storage(storage_name) + + result.merge(storage_name => redis.scan_each(match: key)) + end + end + + keys_per_storage + end + + def self.load_for_keys(keys_per_storage, redis) + info_for_keys = nil + + redis.pipelined do + info_for_keys = keys_per_storage.inject({}) do |result, (storage_name, keys)| + info_for_storage = keys.map do |key| + { name: key, failure_count: redis.hget(key, :failure_count) } + end + + result.merge(storage_name => info_for_storage) + end + end + + info_for_keys + end + + def self.for_failing_storages + for_all_storages.select(&:failing?) + end + + def initialize(storage_name, info) + @storage_name = storage_name + @info = info + end + + def failing_info + @failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 } + end + + def failing? + failing_info.any? + end + + def failing_on_hosts + @failing_on_hosts ||= failing_info.map do |info_for_host| + info_for_host[:name].split(':').last + end + end + + def failing_circuit_breakers + @failing_circuit_breakers ||= failing_on_hosts.map do |hostname| + CircuitBreaker.new(storage_name, hostname) + end + end + + def total_failures + @total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] } + end + end + end + end +end diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb index 9e91c135956..eef97f54962 100644 --- a/lib/gitlab/health_checks/fs_shards_check.rb +++ b/lib/gitlab/health_checks/fs_shards_check.rb @@ -10,7 +10,9 @@ module Gitlab def readiness repository_storages.map do |storage_name| begin - if !storage_stat_test(storage_name) + if !storage_circuitbreaker_test(storage_name) + HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name) + elsif !storage_stat_test(storage_name) HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) else with_temp_file(storage_name) do |tmp_file_path| @@ -36,7 +38,8 @@ module Gitlab [ storage_stat_metrics(storage_name), storage_write_metrics(storage_name), - storage_read_metrics(storage_name) + storage_read_metrics(storage_name), + storage_circuitbreaker_metrics(storage_name) ].flatten end end @@ -121,6 +124,12 @@ module Gitlab file_contents == RANDOM_STRING end + def storage_circuitbreaker_test(storage_name) + Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" } + rescue Gitlab::Git::Storage::Inaccessible + nil + end + def storage_stat_metrics(storage_name) operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do with_timing { storage_stat_test(storage_name) } @@ -143,6 +152,14 @@ module Gitlab end end end + + def storage_circuitbreaker_metrics(storage_name) + operation_metrics(:filesystem_circuitbreaker, + :filesystem_circuitbreaker_latency_seconds, + shard: storage_name) do + with_timing { storage_circuitbreaker_test(storage_name) } + end + end end end end |