summaryrefslogtreecommitdiff
path: root/lib/gitlab/health_checks
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/health_checks')
-rw-r--r--lib/gitlab/health_checks/base_abstract_check.rb6
-rw-r--r--lib/gitlab/health_checks/fs_shards_check.rb97
-rw-r--r--lib/gitlab/health_checks/redis/cache_check.rb31
-rw-r--r--lib/gitlab/health_checks/redis/queues_check.rb31
-rw-r--r--lib/gitlab/health_checks/redis/redis_check.rb27
-rw-r--r--lib/gitlab/health_checks/redis/shared_state_check.rb31
-rw-r--r--lib/gitlab/health_checks/redis_check.rb25
-rw-r--r--lib/gitlab/health_checks/simple_abstract_check.rb15
8 files changed, 194 insertions, 69 deletions
diff --git a/lib/gitlab/health_checks/base_abstract_check.rb b/lib/gitlab/health_checks/base_abstract_check.rb
index 7de6d4d9367..8b365dab185 100644
--- a/lib/gitlab/health_checks/base_abstract_check.rb
+++ b/lib/gitlab/health_checks/base_abstract_check.rb
@@ -27,10 +27,10 @@ module Gitlab
Metric.new(name, value, labels)
end
- def with_timing(proc)
+ def with_timing
start = Time.now
- result = proc.call
- yield result, Time.now.to_f - start.to_f
+ result = yield
+ [result, Time.now.to_f - start.to_f]
end
def catch_timeout(seconds, &block)
diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb
index e78b7f22e03..9e91c135956 100644
--- a/lib/gitlab/health_checks/fs_shards_check.rb
+++ b/lib/gitlab/health_checks/fs_shards_check.rb
@@ -10,49 +10,47 @@ module Gitlab
def readiness
repository_storages.map do |storage_name|
begin
- tmp_file_path = tmp_file_path(storage_name)
-
if !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
- elsif !storage_write_test(tmp_file_path)
- HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
- elsif !storage_read_test(tmp_file_path)
- HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
else
- HealthChecks::Result.new(true, nil, shard: storage_name)
+ with_temp_file(storage_name) do |tmp_file_path|
+ if !storage_write_test(tmp_file_path)
+ HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
+ elsif !storage_read_test(tmp_file_path)
+ HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
+ else
+ HealthChecks::Result.new(true, nil, shard: storage_name)
+ end
+ end
end
rescue RuntimeError => ex
message = "unexpected error #{ex} when checking storage #{storage_name}"
Rails.logger.error(message)
HealthChecks::Result.new(false, message, shard: storage_name)
- ensure
- delete_test_file(tmp_file_path)
end
end
end
def metrics
repository_storages.flat_map do |storage_name|
- tmp_file_path = tmp_file_path(storage_name)
[
- operation_metrics(:filesystem_accessible, :filesystem_access_latency, -> { storage_stat_test(storage_name) }, shard: storage_name),
- operation_metrics(:filesystem_writable, :filesystem_write_latency, -> { storage_write_test(tmp_file_path) }, shard: storage_name),
- operation_metrics(:filesystem_readable, :filesystem_read_latency, -> { storage_read_test(tmp_file_path) }, shard: storage_name)
+ storage_stat_metrics(storage_name),
+ storage_write_metrics(storage_name),
+ storage_read_metrics(storage_name)
].flatten
end
end
private
- def operation_metrics(ok_metric, latency_metric, operation, **labels)
- with_timing operation do |result, elapsed|
- [
- metric(latency_metric, elapsed, **labels),
- metric(ok_metric, result ? 1 : 0, **labels)
- ]
- end
+ def operation_metrics(ok_metric, latency_metric, **labels)
+ result, elapsed = yield
+ [
+ metric(latency_metric, elapsed, **labels),
+ metric(ok_metric, result ? 1 : 0, **labels)
+ ]
rescue RuntimeError => ex
- Rails.logger("unexpected error #{ex} when checking #{ok_metric}")
+ Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
[metric(ok_metric, 0, **labels)]
end
@@ -68,19 +66,36 @@ module Gitlab
Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
end
- def tmp_file_path(storage_name)
- Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
+ def with_temp_file(storage_name)
+ temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
+ yield temp_file_path
+ ensure
+ delete_test_file(temp_file_path)
end
- def path(storage_name)
+ def storage_path(storage_name)
storages_paths&.dig(storage_name, 'path')
end
+ # All below test methods use shell commands to perform actions on storage volumes.
+ # In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely,
+ # we can rely on shell commands to be terminated once `timeout` kills them.
+ #
+ # However we also fallback to pure Ruby file operations in case a specific shell command is missing
+ # so we are still able to perform healthchecks and gather metrics from such system.
+
+ def delete_test_file(tmp_path)
+ _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
+ status.zero?
+ rescue Errno::ENOENT
+ File.delete(tmp_path) rescue Errno::ENOENT
+ end
+
def storage_stat_test(storage_name)
- stat_path = File.join(path(storage_name), '.')
+ stat_path = File.join(storage_path(storage_name), '.')
begin
_, status = exec_with_timeout(%W{ stat #{stat_path} })
- status == 0
+ status.zero?
rescue Errno::ENOENT
File.exist?(stat_path) && File::Stat.new(stat_path).readable?
end
@@ -90,7 +105,7 @@ module Gitlab
_, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
stdin.write(RANDOM_STRING)
end
- status == 0
+ status.zero?
rescue Errno::ENOENT
written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
written_bytes == RANDOM_STRING.length
@@ -100,17 +115,33 @@ module Gitlab
_, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
stdin.write(RANDOM_STRING)
end
- status == 0
+ status.zero?
rescue Errno::ENOENT
file_contents = File.read(tmp_path) rescue Errno::ENOENT
file_contents == RANDOM_STRING
end
- def delete_test_file(tmp_path)
- _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
- status == 0
- rescue Errno::ENOENT
- File.delete(tmp_path) rescue Errno::ENOENT
+ def storage_stat_metrics(storage_name)
+ operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
+ with_timing { storage_stat_test(storage_name) }
+ end
+ end
+
+ def storage_write_metrics(storage_name)
+ operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
+ with_temp_file(storage_name) do |tmp_file_path|
+ with_timing { storage_write_test(tmp_file_path) }
+ end
+ end
+ end
+
+ def storage_read_metrics(storage_name)
+ operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
+ with_temp_file(storage_name) do |tmp_file_path|
+ storage_write_test(tmp_file_path) # writes data used by read test
+ with_timing { storage_read_test(tmp_file_path) }
+ end
+ end
end
end
end
diff --git a/lib/gitlab/health_checks/redis/cache_check.rb b/lib/gitlab/health_checks/redis/cache_check.rb
new file mode 100644
index 00000000000..a28658d42d4
--- /dev/null
+++ b/lib/gitlab/health_checks/redis/cache_check.rb
@@ -0,0 +1,31 @@
+module Gitlab
+ module HealthChecks
+ module Redis
+ class CacheCheck
+ extend SimpleAbstractCheck
+
+ class << self
+ def check_up
+ check
+ end
+
+ private
+
+ def metric_prefix
+ 'redis_cache_ping'
+ end
+
+ def is_successful?(result)
+ result == 'PONG'
+ end
+
+ def check
+ catch_timeout 10.seconds do
+ Gitlab::Redis::Cache.with(&:ping)
+ end
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/health_checks/redis/queues_check.rb b/lib/gitlab/health_checks/redis/queues_check.rb
new file mode 100644
index 00000000000..f97d50d3947
--- /dev/null
+++ b/lib/gitlab/health_checks/redis/queues_check.rb
@@ -0,0 +1,31 @@
+module Gitlab
+ module HealthChecks
+ module Redis
+ class QueuesCheck
+ extend SimpleAbstractCheck
+
+ class << self
+ def check_up
+ check
+ end
+
+ private
+
+ def metric_prefix
+ 'redis_queues_ping'
+ end
+
+ def is_successful?(result)
+ result == 'PONG'
+ end
+
+ def check
+ catch_timeout 10.seconds do
+ Gitlab::Redis::Queues.with(&:ping)
+ end
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/health_checks/redis/redis_check.rb b/lib/gitlab/health_checks/redis/redis_check.rb
new file mode 100644
index 00000000000..fe4e3c4a3ab
--- /dev/null
+++ b/lib/gitlab/health_checks/redis/redis_check.rb
@@ -0,0 +1,27 @@
+module Gitlab
+ module HealthChecks
+ module Redis
+ class RedisCheck
+ extend SimpleAbstractCheck
+
+ class << self
+ private
+
+ def metric_prefix
+ 'redis_ping'
+ end
+
+ def is_successful?(result)
+ result == 'PONG'
+ end
+
+ def check
+ ::Gitlab::HealthChecks::Redis::CacheCheck.check_up &&
+ ::Gitlab::HealthChecks::Redis::QueuesCheck.check_up &&
+ ::Gitlab::HealthChecks::Redis::SharedStateCheck.check_up
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/health_checks/redis/shared_state_check.rb b/lib/gitlab/health_checks/redis/shared_state_check.rb
new file mode 100644
index 00000000000..e3244392902
--- /dev/null
+++ b/lib/gitlab/health_checks/redis/shared_state_check.rb
@@ -0,0 +1,31 @@
+module Gitlab
+ module HealthChecks
+ module Redis
+ class SharedStateCheck
+ extend SimpleAbstractCheck
+
+ class << self
+ def check_up
+ check
+ end
+
+ private
+
+ def metric_prefix
+ 'redis_shared_state_ping'
+ end
+
+ def is_successful?(result)
+ result == 'PONG'
+ end
+
+ def check
+ catch_timeout 10.seconds do
+ Gitlab::Redis::SharedState.with(&:ping)
+ end
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/health_checks/redis_check.rb b/lib/gitlab/health_checks/redis_check.rb
deleted file mode 100644
index 57bbe5b3ad0..00000000000
--- a/lib/gitlab/health_checks/redis_check.rb
+++ /dev/null
@@ -1,25 +0,0 @@
-module Gitlab
- module HealthChecks
- class RedisCheck
- extend SimpleAbstractCheck
-
- class << self
- private
-
- def metric_prefix
- 'redis_ping'
- end
-
- def is_successful?(result)
- result == 'PONG'
- end
-
- def check
- catch_timeout 10.seconds do
- Gitlab::Redis.with(&:ping)
- end
- end
- end
- end
- end
-end
diff --git a/lib/gitlab/health_checks/simple_abstract_check.rb b/lib/gitlab/health_checks/simple_abstract_check.rb
index fbe1645c1b1..f5026171ba4 100644
--- a/lib/gitlab/health_checks/simple_abstract_check.rb
+++ b/lib/gitlab/health_checks/simple_abstract_check.rb
@@ -15,14 +15,13 @@ module Gitlab
end
def metrics
- with_timing method(:check) do |result, elapsed|
- Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result)
- [
- metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0),
- metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0),
- metric("#{metric_prefix}_latency", elapsed)
- ]
- end
+ result, elapsed = with_timing(&method(:check))
+ Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result)
+ [
+ metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0),
+ metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0),
+ metric("#{metric_prefix}_latency_seconds", elapsed)
+ ]
end
private