summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bob Van Landuyt <bob@vanlanduyt.co> 2017-10-19 08:32:55 +0200
committer Bob Van Landuyt <bob@vanlanduyt.co> 2017-10-23 12:02:23 +0300
commit 430e7671397a1c022b88da31328a5a81409671b5 (patch)
tree 44dfbf4deb769d418968ba1505c9d1b2fa4b92f2
parent 1881d4f8ecbf52afd7bc732cd6c1296fafd38405 (diff)
download gitlab-ce-430e7671397a1c022b88da31328a5a81409671b5.tar.gz
Implement backoff for the circuitbreaker
The circuit breaker now has 2 failure modes:
- Backing off: This will raise the `Gitlab::Git::Storage::Failing` exception. Access to the shard is blocked temporarily.
- Circuit broken: This will raise the `Gitlab::Git::Storage::CircuitOpen` exception. Access to the shard will be blocked until the failures are reset.
-rw-r--r-- app/helpers/storage_health_helper.rb 5
-rw-r--r-- lib/gitlab/git/storage.rb 1
-rw-r--r-- lib/gitlab/git/storage/circuit_breaker.rb 28
-rw-r--r-- lib/gitlab/git/storage/circuit_breaker_settings.rb 8
-rw-r--r-- lib/gitlab/git/storage/null_circuit_breaker.rb 4
-rw-r--r-- spec/lib/gitlab/git/storage/circuit_breaker_spec.rb 259
-rw-r--r-- spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb 13
7 files changed, 128 insertions, 190 deletions
diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb
index 544c9efb845..4d2180f7eee 100644
--- a/app/helpers/storage_health_helper.rb
+++ b/app/helpers/storage_health_helper.rb
@@ -16,17 +16,16 @@ module StorageHealthHelper
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
- permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time }
- if permanently_broken
+ if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
- elsif circuit_breaker.circuit_broken?
+ elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else
diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb
index 08e6c29abad..99518c9b1e4 100644
--- a/lib/gitlab/git/storage.rb
+++ b/lib/gitlab/git/storage.rb
@@ -12,6 +12,7 @@ module Gitlab
CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible)
+ Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb
index 0456ad9a1f3..2ce97ff41f9 100644
--- a/lib/gitlab/git/storage/circuit_breaker.rb
+++ b/lib/gitlab/git/storage/circuit_breaker.rb
@@ -64,12 +64,20 @@ module Gitlab
def circuit_broken?
return false if no_failures?
+ failure_count > failure_count_threshold
+ end
+
+ def backing_off?
+ return false if no_failures?
+
recent_failure = last_failure > failure_wait_time.seconds.ago
- too_many_failures = failure_count > failure_count_threshold
+ too_many_failures = failure_count > backoff_threshold
- recent_failure || too_many_failures
+ recent_failure && too_many_failures
end
+ private
+
def failure_info
@failure_info ||= get_failure_info
end
@@ -94,7 +102,11 @@ module Gitlab
def check_storage_accessible!
if circuit_broken?
- raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
+ raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
+ end
+
+ if backing_off?
+ raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end
unless storage_available?
@@ -131,12 +143,6 @@ module Gitlab
end
end
- def cache_key
- @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
- end
-
- private
-
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
@@ -146,6 +152,10 @@ module Gitlab
FailureInfo.new(last_failure, failure_count.to_i)
end
+
+ def cache_key
+ @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
+ end
end
end
end
diff --git a/lib/gitlab/git/storage/circuit_breaker_settings.rb b/lib/gitlab/git/storage/circuit_breaker_settings.rb
index d2313fe7c1b..257fe8cd8f0 100644
--- a/lib/gitlab/git/storage/circuit_breaker_settings.rb
+++ b/lib/gitlab/git/storage/circuit_breaker_settings.rb
@@ -18,6 +18,14 @@ module Gitlab
application_settings.circuitbreaker_storage_timeout
end
+ def access_retries
+ application_settings.circuitbreaker_access_retries
+ end
+
+ def backoff_threshold
+ application_settings.circuitbreaker_backoff_threshold
+ end
+
private
def application_settings
diff --git a/lib/gitlab/git/storage/null_circuit_breaker.rb b/lib/gitlab/git/storage/null_circuit_breaker.rb
index 60c6791a7e4..a12d52d295f 100644
--- a/lib/gitlab/git/storage/null_circuit_breaker.rb
+++ b/lib/gitlab/git/storage/null_circuit_breaker.rb
@@ -25,6 +25,10 @@ module Gitlab
!!@error
end
+ def backing_off?
+ false
+ end
+
def last_failure
circuit_broken? ? Time.now : nil
end
diff --git a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
index c8d532df059..e3f221aa863 100644
--- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
+++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
@@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
stub_application_setting(circuitbreaker_failure_count_threshold: 0,
circuitbreaker_failure_wait_time: 1,
circuitbreaker_failure_reset_time: 2,
- circuitbreaker_storage_timeout: 3)
+ circuitbreaker_storage_timeout: 3,
+ circuitbreaker_access_retries: 4,
+ circuitbreaker_backoff_threshold: 5)
end
describe '#failure_count_threshold' do
@@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.storage_timeout).to eq(3)
end
end
+
+ describe '#access_retries' do
+ it 'reads the value from settings' do
+ expect(circuit_breaker.access_retries).to eq(4)
+ end
+ end
+
+ describe '#backoff_threshold' do
+ it 'reads the value from settings' do
+ expect(circuit_breaker.backoff_threshold).to eq(5)
+ end
+ end
end
describe '#perform' do
- it 'raises an exception with retry time when the circuit is open' do
- allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
+ it 'raises the correct exception when the circuit is open' do
+ set_in_redis(:last_failure, 1.day.ago.to_f)
+ set_in_redis(:failure_count, 999)
expect { |b| circuit_breaker.perform(&b) }
- .to raise_error(Gitlab::Git::Storage::CircuitOpen)
+ .to raise_error do |exception|
+ expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
+ expect(exception.retry_after).to eq(1800)
+ end
+ end
+
+ it 'raises the correct exception when backing off' do
+ Timecop.freeze do
+ set_in_redis(:last_failure, 1.second.ago.to_f)
+ set_in_redis(:failure_count, 90)
+
+ expect { |b| circuit_breaker.perform(&b) }
+ .to raise_error do |exception|
+ expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
+ expect(exception.retry_after).to eq(30)
+ end
+ end
end
it 'yields the block' do
@@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!)
+ .and_call_original
circuit_breaker.perform { 'hello world' }
end
@@ -137,201 +169,102 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
.to raise_error(Rugged::OSError)
end
- context 'with the feature disabled' do
- it 'returns the block without checking accessibility' do
- stub_feature_flags(git_storage_circuit_breaker: false)
-
- expect(circuit_breaker).not_to receive(:circuit_broken?)
-
- result = circuit_breaker.perform { 'hello' }
-
- expect(result).to eq('hello')
- end
- end
- end
-
- describe '#circuit_broken?' do
- it 'is working when there is no last failure' do
- set_in_redis(:last_failure, nil)
- set_in_redis(:failure_count, 0)
-
- expect(circuit_breaker.circuit_broken?).to be_falsey
- end
-
- it 'is broken when there was a recent failure' do
- Timecop.freeze do
- set_in_redis(:last_failure, 1.second.ago.to_f)
- set_in_redis(:failure_count, 1)
-
- expect(circuit_breaker.circuit_broken?).to be_truthy
- end
- end
-
- it 'is broken when there are too many failures' do
- set_in_redis(:last_failure, 1.day.ago.to_f)
- set_in_redis(:failure_count, 200)
-
- expect(circuit_breaker.circuit_broken?).to be_truthy
- end
-
- context 'the `failure_wait_time` is set to 0' do
- before do
- stub_application_setting(circuitbreaker_failure_wait_time: 0)
- end
+ it 'tracks that the storage was accessible' do
+ set_in_redis(:failure_count, 10)
+ set_in_redis(:last_failure, Time.now.to_f)
- it 'is working even when there is a recent failure' do
- Timecop.freeze do
- set_in_redis(:last_failure, 0.seconds.ago.to_f)
- set_in_redis(:failure_count, 1)
+ circuit_breaker.perform { '' }
- expect(circuit_breaker.circuit_broken?).to be_falsey
- end
- end
+ expect(value_from_redis(:failure_count).to_i).to eq(0)
+ expect(value_from_redis(:last_failure)).to be_empty
+ expect(circuit_breaker.failure_count).to eq(0)
+ expect(circuit_breaker.last_failure).to be_nil
end
- end
- describe "storage_available?" do
- context 'the storage is available' do
- it 'tracks that the storage was accessible an raises the error' do
- expect(circuit_breaker).to receive(:track_storage_accessible)
-
- circuit_breaker.storage_available?
- end
+ it 'only accessibility check once' do
+ expect(Gitlab::Git::Storage::ForkedStorageCheck)
+ .to receive(:storage_available?).once.and_call_original
- it 'only performs the check once' do
- expect(Gitlab::Git::Storage::ForkedStorageCheck)
- .to receive(:storage_available?).once.and_call_original
-
- 2.times { circuit_breaker.storage_available? }
- end
+ 2.times { circuit_breaker.perform { '' } }
end
- context 'storage is not available' do
- let(:storage_name) { 'broken' }
-
- it 'tracks that the storage was inaccessible' do
- expect(circuit_breaker).to receive(:track_storage_inaccessible)
+ context 'with the feature disabled' do
+ it 'returns the block without checking accessibility' do
+ stub_feature_flags(git_storage_circuit_breaker: false)
- circuit_breaker.storage_available?
- end
- end
- end
+ expect(circuit_breaker).not_to receive(:circuit_broken?)
- describe '#check_storage_accessible!' do
- it 'raises an exception with retry time when the circuit is open' do
- allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
+ result = circuit_breaker.perform { 'hello' }
- expect { circuit_breaker.check_storage_accessible! }
- .to raise_error do |exception|
- expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
- expect(exception.retry_after).to eq(30)
+ expect(result).to eq('hello')
end
end
context 'the storage is not available' do
let(:storage_name) { 'broken' }
- it 'raises an error' do
+ it 'raises the correct exception' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
- expect { circuit_breaker.check_storage_accessible! }
+ expect { circuit_breaker.perform { '' } }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
expect(exception.retry_after).to eq(30)
end
end
- end
- end
-
- describe '#track_storage_inaccessible' do
- around do |example|
- Timecop.freeze { example.run }
- end
-
- it 'records the failure time in redis' do
- circuit_breaker.track_storage_inaccessible
-
- failure_time = value_from_redis(:last_failure)
- expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
- end
-
- it 'sets the failure time on the breaker without reloading' do
- circuit_breaker.track_storage_inaccessible
-
- expect(circuit_breaker).not_to receive(:get_failure_info)
- expect(circuit_breaker.last_failure).to eq(Time.now)
- end
-
- it 'increments the failure count in redis' do
- set_in_redis(:failure_count, 10)
-
- circuit_breaker.track_storage_inaccessible
-
- expect(value_from_redis(:failure_count).to_i).to be(11)
- end
-
- it 'increments the failure count on the breaker without reloading' do
- set_in_redis(:failure_count, 10)
-
- circuit_breaker.track_storage_inaccessible
+ it 'tracks that the storage was inaccessible' do
+ Timecop.freeze do
+ expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
- expect(circuit_breaker).not_to receive(:get_failure_info)
- expect(circuit_breaker.failure_count).to eq(11)
+ expect(value_from_redis(:failure_count).to_i).to eq(1)
+ expect(value_from_redis(:last_failure)).not_to be_empty
+ expect(circuit_breaker.failure_count).to eq(1)
+ expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
+ end
+ end
end
end
- describe '#track_storage_accessible' do
- it 'sets the failure count to zero in redis' do
- set_in_redis(:failure_count, 10)
-
- circuit_breaker.track_storage_accessible
-
- expect(value_from_redis(:failure_count).to_i).to be(0)
- end
-
- it 'sets the failure count to zero on the breaker without reloading' do
- set_in_redis(:failure_count, 10)
-
- circuit_breaker.track_storage_accessible
+ describe '#circuit_broken?' do
+ it 'is working when there is no last failure' do
+ set_in_redis(:last_failure, nil)
+ set_in_redis(:failure_count, 0)
- expect(circuit_breaker).not_to receive(:get_failure_info)
- expect(circuit_breaker.failure_count).to eq(0)
+ expect(circuit_breaker.circuit_broken?).to be_falsey
end
- it 'removes the last failure time from redis' do
- set_in_redis(:last_failure, Time.now.to_i)
-
- circuit_breaker.track_storage_accessible
+ it 'is broken when there are too many failures' do
+ set_in_redis(:last_failure, 1.day.ago.to_f)
+ set_in_redis(:failure_count, 200)
- expect(circuit_breaker).not_to receive(:get_failure_info)
- expect(circuit_breaker.last_failure).to be_nil
+ expect(circuit_breaker.circuit_broken?).to be_truthy
end
+ end
- it 'removes the last failure time from the breaker without reloading' do
- set_in_redis(:last_failure, Time.now.to_i)
-
- circuit_breaker.track_storage_accessible
+ describe '#backing_off?' do
+ it 'is true when there was a recent failure' do
+ Timecop.freeze do
+ set_in_redis(:last_failure, 1.second.ago.to_f)
+ set_in_redis(:failure_count, 90)
- expect(value_from_redis(:last_failure)).to be_empty
+ expect(circuit_breaker.backing_off?).to be_truthy
+ end
end
- it 'wont connect to redis when there are no failures' do
- expect(Gitlab::Git::Storage.redis).to receive(:with).once
- .and_call_original
- expect(circuit_breaker).to receive(:track_storage_accessible)
- .and_call_original
-
- circuit_breaker.track_storage_accessible
- end
- end
+ context 'the `failure_wait_time` is set to 0' do
+ before do
+ stub_application_setting(circuitbreaker_failure_wait_time: 0)
+ end
- describe '#no_failures?' do
- it 'is false when a failure was tracked' do
- set_in_redis(:last_failure, Time.now.to_i)
- set_in_redis(:failure_count, 1)
+ it 'is working even when there are failures' do
+ Timecop.freeze do
+ set_in_redis(:last_failure, 0.seconds.ago.to_f)
+ set_in_redis(:failure_count, 90)
- expect(circuit_breaker.no_failures?).to be_falsey
+ expect(circuit_breaker.backing_off?).to be_falsey
+ end
+ end
end
end
@@ -351,10 +284,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.failure_count).to eq(7)
end
end
-
- describe '#cache_key' do
- it 'includes storage and host' do
- expect(circuit_breaker.cache_key).to eq(cache_key)
- end
- end
end
diff --git a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb
index 7ee6d2f3709..5db37f55e03 100644
--- a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb
+++ b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb
@@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
ours = described_class.public_instance_methods
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
- # These methods are not part of the public API, but are public to allow the
- # CircuitBreaker specs to operate. They should be made private over time.
- exceptions = %i[
- cache_key
- check_storage_accessible!
- no_failures?
- storage_available?
- track_storage_accessible
- track_storage_inaccessible
- ]
-
- expect(theirs - ours).to contain_exactly(*exceptions)
+ expect(theirs - ours).to be_empty
end
end