summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorBob Van Landuyt <bob@gitlab.com>2017-05-17 18:17:15 +0200
committerBob Van Landuyt <bob@vanlanduyt.co>2017-08-04 15:38:48 +0200
commit3598e60bf20b185b3f8d4e9a88a8eff39c8f729b (patch)
treeba84a7e7972d4a2563bb79485933fb78462868de /lib
parent990feb9f2b886c5bd0ac37339f149b8e80202019 (diff)
downloadgitlab-ce-3598e60bf20b185b3f8d4e9a88a8eff39c8f729b.tar.gz
Add a Circuitbreaker for storage paths
Diffstat (limited to 'lib')
-rw-r--r--lib/api/api.rb1
-rw-r--r--lib/api/circuit_breakers.rb50
-rw-r--r--lib/api/entities.rb6
-rw-r--r--lib/gitlab/git/repository.rb8
-rw-r--r--lib/gitlab/git/storage.rb22
-rw-r--r--lib/gitlab/git/storage/circuit_breaker.rb142
-rw-r--r--lib/gitlab/git/storage/forked_storage_check.rb59
-rw-r--r--lib/gitlab/git/storage/health.rb101
-rw-r--r--lib/gitlab/health_checks/fs_shards_check.rb21
9 files changed, 407 insertions, 3 deletions
diff --git a/lib/api/api.rb b/lib/api/api.rb
index 982a2b88d62..94df543853b 100644
--- a/lib/api/api.rb
+++ b/lib/api/api.rb
@@ -95,6 +95,7 @@ module API
mount ::API::Boards
mount ::API::Branches
mount ::API::BroadcastMessages
+ mount ::API::CircuitBreakers
mount ::API::Commits
mount ::API::CommitStatuses
mount ::API::DeployKeys
diff --git a/lib/api/circuit_breakers.rb b/lib/api/circuit_breakers.rb
new file mode 100644
index 00000000000..118883f5ea5
--- /dev/null
+++ b/lib/api/circuit_breakers.rb
@@ -0,0 +1,50 @@
+module API
+ class CircuitBreakers < Grape::API
+ before { authenticated_as_admin! }
+
+ resource :circuit_breakers do
+ params do
+ requires :type,
+ type: String,
+ desc: "The type of circuitbreaker",
+ values: ['repository_storage']
+ end
+ resource ':type' do
+ namespace '', requirements: { type: 'repository_storage' } do
+ helpers do
+ def failing_storage_health
+ @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
+ end
+
+ def storage_health
+ @failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
+ end
+ end
+
+ desc 'Get all failing git storages' do
+ detail 'This feature was introduced in GitLab 9.5'
+ success Entities::RepositoryStorageHealth
+ end
+ get do
+ present storage_health, with: Entities::RepositoryStorageHealth
+ end
+
+ desc 'Get all failing git storages' do
+ detail 'This feature was introduced in GitLab 9.5'
+ success Entities::RepositoryStorageHealth
+ end
+ get 'failing' do
+ present failing_storage_health, with: Entities::RepositoryStorageHealth
+ end
+
+ desc 'Reset all storage failures and open circuitbreaker' do
+ detail 'This feature was introduced in GitLab 9.5'
+ end
+ delete do
+ Gitlab::Git::Storage::CircuitBreaker.reset_all!
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/api/entities.rb b/lib/api/entities.rb
index 298831a8fdb..f25b408439a 100644
--- a/lib/api/entities.rb
+++ b/lib/api/entities.rb
@@ -954,5 +954,11 @@ module API
expose :ip_address
expose :submitted, as: :akismet_submitted
end
+
+ class RepositoryStorageHealth < Grape::Entity
+ expose :storage_name
+ expose :failing_on_hosts
+ expose :total_failures
+ end
end
end
diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb
index ffe2c8b91bb..aa3252e1df8 100644
--- a/lib/gitlab/git/repository.rb
+++ b/lib/gitlab/git/repository.rb
@@ -64,11 +64,17 @@ module Gitlab
end
def rugged
- @rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories)
+ @rugged ||= circuit_breaker.perform do
+ Rugged::Repository.new(path, alternates: alternate_object_directories)
+ end
rescue Rugged::RepositoryError, Rugged::OSError
raise NoRepository.new('no repository for such path')
end
+ def circuit_breaker
+ @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
+ end
+
# Returns an Array of branch names
# sorted by name ASC
def branch_names
diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb
new file mode 100644
index 00000000000..e28be4b8a38
--- /dev/null
+++ b/lib/gitlab/git/storage.rb
@@ -0,0 +1,22 @@
+module Gitlab
+ module Git
+ module Storage
+ class Inaccessible < StandardError
+ attr_reader :retry_after
+
+ def initialize(message = nil, retry_after = nil)
+ super(message)
+ @retry_after = retry_after
+ end
+ end
+
+ CircuitOpen = Class.new(Inaccessible)
+
+ REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
+
+ def self.redis
+ Gitlab::Redis::SharedState
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb
new file mode 100644
index 00000000000..c722771e0d5
--- /dev/null
+++ b/lib/gitlab/git/storage/circuit_breaker.rb
@@ -0,0 +1,142 @@
+module Gitlab
+ module Git
+ module Storage
+ class CircuitBreaker
+ attr_reader :storage,
+ :hostname,
+ :storage_path,
+ :failure_count_threshold,
+ :failure_wait_time,
+ :failure_reset_time,
+ :storage_timeout
+
+ def self.reset_all!
+ pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*"
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ all_storage_keys = redis.scan_each(match: pattern).to_a
+ redis.del(*all_storage_keys) unless all_storage_keys.empty?
+ end
+
+ RequestStore.delete(:circuitbreaker_cache)
+ end
+
+ def self.for_storage(storage)
+ cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
+ Hash.new do |hash, storage_name|
+ hash[storage_name] = new(storage_name)
+ end
+ end
+
+ cached_circuitbreakers[storage]
+ end
+
+ def initialize(storage, hostname = Gitlab.config.gitlab.hostname)
+ @storage = storage
+ @hostname = hostname
+
+ config = Gitlab.config.repositories.storages[@storage]
+ @storage_path = config['path']
+ @failure_count_threshold = config['failure_count_threshold']
+ @failure_wait_time = config['failure_wait_time']
+ @failure_reset_time = config['failure_reset_time']
+ @storage_timeout = config['storage_timeout']
+ end
+
+ def perform
+ return yield unless Feature.enabled?('git_storage_circuit_breaker')
+
+ if circuit_broken?
+ raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} open", failure_wait_time)
+ end
+
+ check_storage_accessible!
+
+ yield
+ end
+
+ def circuit_broken?
+ return false if no_failures?
+
+ recent_failure = last_failure > failure_wait_time.seconds.ago
+ too_many_failures = failure_count > failure_count_threshold
+
+ recent_failure || too_many_failures
+ end
+
+ # Memoizing the `storage_available` call means we only do it once per
+ # request when the storage is available.
+ #
+ # When the storage appears not available, and the memoized value is `false`
+ # we might want to try again.
+ def storage_available?
+ @storage_available ||= Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout)
+ end
+
+ def check_storage_accessible!
+ if storage_available?
+ track_storage_accessible
+ else
+ track_storage_inaccessible
+ raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
+ end
+ end
+
+ def no_failures?
+ last_failure.blank? && failure_count == 0
+ end
+
+ def track_storage_inaccessible
+ @failure_info = [Time.now, failure_count + 1]
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ redis.pipelined do
+ redis.hset(cache_key, :last_failure, last_failure.to_i)
+ redis.hincrby(cache_key, :failure_count, 1)
+ redis.expire(cache_key, failure_reset_time)
+ end
+ end
+ end
+
+ def track_storage_accessible
+ return if no_failures?
+
+ @failure_info = [nil, 0]
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ redis.pipelined do
+ redis.hset(cache_key, :last_failure, nil)
+ redis.hset(cache_key, :failure_count, 0)
+ end
+ end
+ end
+
+ def last_failure
+ failure_info.first
+ end
+
+ def failure_count
+ failure_info.last
+ end
+
+ def failure_info
+ @failure_info ||= get_failure_info
+ end
+
+ def get_failure_info
+ last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
+ redis.hmget(cache_key, :last_failure, :failure_count)
+ end
+
+ last_failure = Time.at(last_failure.to_i) if last_failure.present?
+
+ [last_failure, failure_count.to_i]
+ end
+
+ def cache_key
+ @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/storage/forked_storage_check.rb b/lib/gitlab/git/storage/forked_storage_check.rb
new file mode 100644
index 00000000000..557a3b0e61d
--- /dev/null
+++ b/lib/gitlab/git/storage/forked_storage_check.rb
@@ -0,0 +1,59 @@
+module Gitlab
+ module Git
+ module Storage
+ module ForkedStorageCheck
+ extend self
+
+ def storage_available?(path, timeout_seconds = 5)
+ status = timeout_check(path, timeout_seconds)
+
+ status.success?
+ end
+
+ def timeout_check(path, timeout_seconds)
+ filesystem_check_pid = check_filesystem_in_fork(path)
+
+ deadline = timeout_seconds.seconds.from_now.utc
+ wait_time = 0.01
+ status = nil
+
+ while status.nil?
+ if deadline > Time.now.utc
+ sleep(wait_time)
+ _pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG)
+ else
+ Process.kill('KILL', filesystem_check_pid)
+ # Blocking wait, so we are sure the process is gone before continuing
+ _pid, status = Process.wait2(filesystem_check_pid)
+ end
+ end
+
+ status
+ end
+
+ # This call forks out into a process, that process will then be replaced
+ # With an `exec` call, since we fork out into a shell, we can create a
+ # child process without needing an ActiveRecord-connection.
+ #
+ # Inside the shell, we use `& wait` to fork another child. We do this
+ # to prevent leaving a zombie process when the parent gets killed by the
+ # timeout.
+ #
+ # https://stackoverflow.com/questions/27892975/what-causes-activerecord-breaking-postgres-connection-after-forking
+ # https://stackoverflow.com/questions/22012943/activerecordstatementinvalid-runtimeerror-the-connection-cannot-be-reused-in
+ def check_filesystem_in_fork(path)
+ fork do
+ STDOUT.reopen('/dev/null')
+ STDERR.reopen('/dev/null')
+
+ exec("(#{test_script(path)}) & wait %1")
+ end
+ end
+
+ def test_script(path)
+ "testpath=\"$(realpath #{Shellwords.escape(path)})\" && stat $testpath"
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/storage/health.rb b/lib/gitlab/git/storage/health.rb
new file mode 100644
index 00000000000..0a28052fd81
--- /dev/null
+++ b/lib/gitlab/git/storage/health.rb
@@ -0,0 +1,101 @@
+module Gitlab
+ module Git
+ module Storage
+ class Health
+ attr_reader :storage_name, :info
+
+ def self.pattern_for_storage(storage_name)
+ "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*"
+ end
+
+ def self.for_all_storages
+ storage_names = Gitlab.config.repositories.storages.keys
+ results_per_storage = nil
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ keys_per_storage = all_keys_for_storages(storage_names, redis)
+
+ # We need to make sure all keys are actually loaded as an array.
+ # Otherwise when using the enumerator of the `scan_each` within a
+ # second pipeline, it will be assumed unloaded, wich would make the
+ # result unusable inside the pipeline.
+ loaded_keys_per_storage = keys_per_storage.inject({}) do |loaded_keys, (storage_name, keys)|
+ loaded_keys[storage_name] = keys.to_a
+ loaded_keys
+ end
+
+ results_per_storage = load_for_keys(loaded_keys_per_storage, redis)
+ end
+
+ results_per_storage.map do |name, info|
+ info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
+ new(name, info)
+ end
+ end
+
+ def self.all_keys_for_storages(storage_names, redis)
+ keys_per_storage = nil
+
+ redis.pipelined do
+ keys_per_storage = storage_names.inject({}) do |result, storage_name|
+ key = pattern_for_storage(storage_name)
+
+ result.merge(storage_name => redis.scan_each(match: key))
+ end
+ end
+
+ keys_per_storage
+ end
+
+ def self.load_for_keys(keys_per_storage, redis)
+ info_for_keys = nil
+
+ redis.pipelined do
+ info_for_keys = keys_per_storage.inject({}) do |result, (storage_name, keys)|
+ info_for_storage = keys.map do |key|
+ { name: key, failure_count: redis.hget(key, :failure_count) }
+ end
+
+ result.merge(storage_name => info_for_storage)
+ end
+ end
+
+ info_for_keys
+ end
+
+ def self.for_failing_storages
+ for_all_storages.select(&:failing?)
+ end
+
+ def initialize(storage_name, info)
+ @storage_name = storage_name
+ @info = info
+ end
+
+ def failing_info
+ @failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 }
+ end
+
+ def failing?
+ failing_info.any?
+ end
+
+ def failing_on_hosts
+ @failing_on_hosts ||= failing_info.map do |info_for_host|
+ info_for_host[:name].split(':').last
+ end
+ end
+
+ def failing_circuit_breakers
+ @failing_circuit_breakers ||= failing_on_hosts.map do |hostname|
+ CircuitBreaker.new(storage_name, hostname)
+ end
+ end
+
+ def total_failures
+ @total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] }
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb
index 9e91c135956..eef97f54962 100644
--- a/lib/gitlab/health_checks/fs_shards_check.rb
+++ b/lib/gitlab/health_checks/fs_shards_check.rb
@@ -10,7 +10,9 @@ module Gitlab
def readiness
repository_storages.map do |storage_name|
begin
- if !storage_stat_test(storage_name)
+ if !storage_circuitbreaker_test(storage_name)
+ HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
+ elsif !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
else
with_temp_file(storage_name) do |tmp_file_path|
@@ -36,7 +38,8 @@ module Gitlab
[
storage_stat_metrics(storage_name),
storage_write_metrics(storage_name),
- storage_read_metrics(storage_name)
+ storage_read_metrics(storage_name),
+ storage_circuitbreaker_metrics(storage_name)
].flatten
end
end
@@ -121,6 +124,12 @@ module Gitlab
file_contents == RANDOM_STRING
end
+ def storage_circuitbreaker_test(storage_name)
+ Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" }
+ rescue Gitlab::Git::Storage::Inaccessible
+ nil
+ end
+
def storage_stat_metrics(storage_name)
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
with_timing { storage_stat_test(storage_name) }
@@ -143,6 +152,14 @@ module Gitlab
end
end
end
+
+ def storage_circuitbreaker_metrics(storage_name)
+ operation_metrics(:filesystem_circuitbreaker,
+ :filesystem_circuitbreaker_latency_seconds,
+ shard: storage_name) do
+ with_timing { storage_circuitbreaker_test(storage_name) }
+ end
+ end
end
end
end