diff options
author | Paweł Chojnacki <pawel@chojnacki.ws> | 2017-04-07 10:27:15 +0000 |
---|---|---|
committer | Douwe Maan <douwe@gitlab.com> | 2017-04-07 10:27:15 +0000 |
commit | c3e43c9ba38f8cc0f2fd4a918d052c3977a93421 (patch) | |
tree | 3c5e09e7955663f69573ee9f47e9344011993532 /lib | |
parent | 15e87cea3a374e8d929c28f6843170b182fe159a (diff) | |
download | gitlab-ce-c3e43c9ba38f8cc0f2fd4a918d052c3977a93421.tar.gz |
Add /-/readiness /-/liveness and /-/health_metrics endpoints to track application readiness
Diffstat (limited to 'lib')
-rw-r--r-- | lib/gitlab/health_checks/base_abstract_check.rb | 45 | ||||
-rw-r--r-- | lib/gitlab/health_checks/db_check.rb | 29 | ||||
-rw-r--r-- | lib/gitlab/health_checks/fs_shards_check.rb | 117 | ||||
-rw-r--r-- | lib/gitlab/health_checks/metric.rb | 3 | ||||
-rw-r--r-- | lib/gitlab/health_checks/redis_check.rb | 25 | ||||
-rw-r--r-- | lib/gitlab/health_checks/result.rb | 3 | ||||
-rw-r--r-- | lib/gitlab/health_checks/simple_abstract_check.rb | 43 |
7 files changed, 265 insertions, 0 deletions
diff --git a/lib/gitlab/health_checks/base_abstract_check.rb b/lib/gitlab/health_checks/base_abstract_check.rb new file mode 100644 index 00000000000..7de6d4d9367 --- /dev/null +++ b/lib/gitlab/health_checks/base_abstract_check.rb @@ -0,0 +1,45 @@ +module Gitlab + module HealthChecks + module BaseAbstractCheck + def name + super.demodulize.underscore + end + + def human_name + name.sub(/_check$/, '').capitalize + end + + def readiness + raise NotImplementedError + end + + def liveness + HealthChecks::Result.new(true) + end + + def metrics + [] + end + + protected + + def metric(name, value, **labels) + Metric.new(name, value, labels) + end + + def with_timing(proc) + start = Time.now + result = proc.call + yield result, Time.now.to_f - start.to_f + end + + def catch_timeout(seconds, &block) + begin + Timeout.timeout(seconds.to_i, &block) + rescue Timeout::Error => ex + ex + end + end + end + end +end diff --git a/lib/gitlab/health_checks/db_check.rb b/lib/gitlab/health_checks/db_check.rb new file mode 100644 index 00000000000..fd94984f8a2 --- /dev/null +++ b/lib/gitlab/health_checks/db_check.rb @@ -0,0 +1,29 @@ +module Gitlab + module HealthChecks + class DbCheck + extend SimpleAbstractCheck + + class << self + private + + def metric_prefix + 'db_ping' + end + + def is_successful?(result) + result == '1' + end + + def check + catch_timeout 10.seconds do + if Gitlab::Database.postgresql? + ActiveRecord::Base.connection.execute('SELECT 1 as ping')&.first&.[]('ping') + else + ActiveRecord::Base.connection.execute('SELECT 1 as ping')&.first&.first&.to_s + end + end + end + end + end + end +end diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb new file mode 100644 index 00000000000..df962d203b7 --- /dev/null +++ b/lib/gitlab/health_checks/fs_shards_check.rb @@ -0,0 +1,117 @@ +module Gitlab + module HealthChecks + class FsShardsCheck + extend BaseAbstractCheck + + class << self + def readiness + repository_storages.map do |storage_name| + begin + tmp_file_path = tmp_file_path(storage_name) + + if !storage_stat_test(storage_name) + HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) + elsif !storage_write_test(tmp_file_path) + HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name) + elsif !storage_read_test(tmp_file_path) + HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name) + else + HealthChecks::Result.new(true, nil, shard: storage_name) + end + rescue RuntimeError => ex + message = "unexpected error #{ex} when checking storage #{storage_name}" + Rails.logger.error(message) + HealthChecks::Result.new(false, message, shard: storage_name) + ensure + delete_test_file(tmp_file_path) + end + end + end + + def metrics + repository_storages.flat_map do |storage_name| + tmp_file_path = tmp_file_path(storage_name) + [ + operation_metrics(:filesystem_accessible, :filesystem_access_latency, -> { storage_stat_test(storage_name) }, shard: storage_name), + operation_metrics(:filesystem_writable, :filesystem_write_latency, -> { storage_write_test(tmp_file_path) }, shard: storage_name), + operation_metrics(:filesystem_readable, :filesystem_read_latency, -> { storage_read_test(tmp_file_path) }, shard: storage_name) + ].flatten + end + end + + private + + RANDOM_STRING = SecureRandom.hex(1000).freeze + + def operation_metrics(ok_metric, latency_metric, operation, **labels) + with_timing operation do |result, elapsed| + [ + metric(latency_metric, elapsed, **labels), + metric(ok_metric, result ? 1 : 0, **labels) + ] + end + rescue RuntimeError => ex + Rails.logger("unexpected error #{ex} when checking #{ok_metric}") + [metric(ok_metric, 0, **labels)] + end + + def repository_storages + @repository_storage ||= Gitlab::CurrentSettings.current_application_settings.repository_storages + end + + def storages_paths + @storage_paths ||= Gitlab.config.repositories.storages + end + + def with_timeout(args) + %w{timeout 1}.concat(args) + end + + def tmp_file_path(storage_name) + Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path } + end + + def path(storage_name) + storages_paths&.dig(storage_name, 'path') + end + + def storage_stat_test(storage_name) + stat_path = File.join(path(storage_name), '.') + begin + _, status = Gitlab::Popen.popen(with_timeout(%W{ stat #{stat_path} })) + status == 0 + rescue Errno::ENOENT + File.exist?(stat_path) && File::Stat.new(stat_path).readable? + end + end + + def storage_write_test(tmp_path) + _, status = Gitlab::Popen.popen(with_timeout(%W{ tee #{tmp_path} })) do |stdin| + stdin.write(RANDOM_STRING) + end + status == 0 + rescue Errno::ENOENT + written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT + written_bytes == RANDOM_STRING.length + end + + def storage_read_test(tmp_path) + _, status = Gitlab::Popen.popen(with_timeout(%W{ diff #{tmp_path} - })) do |stdin| + stdin.write(RANDOM_STRING) + end + status == 0 + rescue Errno::ENOENT + file_contents = File.read(tmp_path) rescue Errno::ENOENT + file_contents == RANDOM_STRING + end + + def delete_test_file(tmp_path) + _, status = Gitlab::Popen.popen(with_timeout(%W{ rm -f #{tmp_path} })) + status == 0 + rescue Errno::ENOENT + File.delete(tmp_path) rescue Errno::ENOENT + end + end + end + end +end diff --git a/lib/gitlab/health_checks/metric.rb b/lib/gitlab/health_checks/metric.rb new file mode 100644 index 00000000000..1a2eab0b005 --- /dev/null +++ b/lib/gitlab/health_checks/metric.rb @@ -0,0 +1,3 @@ +module Gitlab::HealthChecks + Metric = Struct.new(:name, :value, :labels) +end diff --git a/lib/gitlab/health_checks/redis_check.rb b/lib/gitlab/health_checks/redis_check.rb new file mode 100644 index 00000000000..57bbe5b3ad0 --- /dev/null +++ b/lib/gitlab/health_checks/redis_check.rb @@ -0,0 +1,25 @@ +module Gitlab + module HealthChecks + class RedisCheck + extend SimpleAbstractCheck + + class << self + private + + def metric_prefix + 'redis_ping' + end + + def is_successful?(result) + result == 'PONG' + end + + def check + catch_timeout 10.seconds do + Gitlab::Redis.with(&:ping) + end + end + end + end + end +end diff --git a/lib/gitlab/health_checks/result.rb b/lib/gitlab/health_checks/result.rb new file mode 100644 index 00000000000..8086760023e --- /dev/null +++ b/lib/gitlab/health_checks/result.rb @@ -0,0 +1,3 @@ +module Gitlab::HealthChecks + Result = Struct.new(:success, :message, :labels) +end diff --git a/lib/gitlab/health_checks/simple_abstract_check.rb b/lib/gitlab/health_checks/simple_abstract_check.rb new file mode 100644 index 00000000000..fbe1645c1b1 --- /dev/null +++ b/lib/gitlab/health_checks/simple_abstract_check.rb @@ -0,0 +1,43 @@ +module Gitlab + module HealthChecks + module SimpleAbstractCheck + include BaseAbstractCheck + + def readiness + check_result = check + if is_successful?(check_result) + HealthChecks::Result.new(true) + elsif check_result.is_a?(Timeout::Error) + HealthChecks::Result.new(false, "#{human_name} check timed out") + else + HealthChecks::Result.new(false, "unexpected #{human_name} check result: #{check_result}") + end + end + + def metrics + with_timing method(:check) do |result, elapsed| + Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result) + [ + metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0), + metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0), + metric("#{metric_prefix}_latency", elapsed) + ] + end + end + + private + + def metric_prefix + raise NotImplementedError + end + + def is_successful?(result) + raise NotImplementedError + end + + def check + raise NotImplementedError + end + end + end +end |