diff options
Diffstat (limited to 'lib/gitlab/health_checks/fs_shards_check.rb')
-rw-r--r-- | lib/gitlab/health_checks/fs_shards_check.rb | 95 |
1 files changed, 63 insertions, 32 deletions
diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb index bebde857b16..9e91c135956 100644 --- a/lib/gitlab/health_checks/fs_shards_check.rb +++ b/lib/gitlab/health_checks/fs_shards_check.rb @@ -10,47 +10,45 @@ module Gitlab def readiness repository_storages.map do |storage_name| begin - tmp_file_path = tmp_file_path(storage_name) - if !storage_stat_test(storage_name) HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) - elsif !storage_write_test(tmp_file_path) - HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name) - elsif !storage_read_test(tmp_file_path) - HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name) else - HealthChecks::Result.new(true, nil, shard: storage_name) + with_temp_file(storage_name) do |tmp_file_path| + if !storage_write_test(tmp_file_path) + HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name) + elsif !storage_read_test(tmp_file_path) + HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name) + else + HealthChecks::Result.new(true, nil, shard: storage_name) + end + end end rescue RuntimeError => ex message = "unexpected error #{ex} when checking storage #{storage_name}" Rails.logger.error(message) HealthChecks::Result.new(false, message, shard: storage_name) - ensure - delete_test_file(tmp_file_path) end end end def metrics repository_storages.flat_map do |storage_name| - tmp_file_path = tmp_file_path(storage_name) [ - operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, -> { storage_stat_test(storage_name) }, shard: storage_name), - operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, -> { storage_write_test(tmp_file_path) }, shard: storage_name), - operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, -> { storage_read_test(tmp_file_path) }, shard: storage_name) + storage_stat_metrics(storage_name), + storage_write_metrics(storage_name), + storage_read_metrics(storage_name) ].flatten end end private - def operation_metrics(ok_metric, latency_metric, operation, **labels) - with_timing operation do |result, elapsed| - [ - metric(latency_metric, elapsed, **labels), - metric(ok_metric, result ? 1 : 0, **labels) - ] - end + def operation_metrics(ok_metric, latency_metric, **labels) + result, elapsed = yield + [ + metric(latency_metric, elapsed, **labels), + metric(ok_metric, result ? 1 : 0, **labels) + ] rescue RuntimeError => ex Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}") [metric(ok_metric, 0, **labels)] @@ -68,19 +66,36 @@ module Gitlab Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block) end - def tmp_file_path(storage_name) - Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path } + def with_temp_file(storage_name) + temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path } + yield temp_file_path + ensure + delete_test_file(temp_file_path) end - def path(storage_name) + def storage_path(storage_name) storages_paths&.dig(storage_name, 'path') end + # All below test methods use shell commands to perform actions on storage volumes. + # In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely, + # we can rely on shell commands to be terminated once `timeout` kills them. + # + # However we also fallback to pure Ruby file operations in case a specific shell command is missing + # so we are still able to perform healthchecks and gather metrics from such system. + + def delete_test_file(tmp_path) + _, status = exec_with_timeout(%W{ rm -f #{tmp_path} }) + status.zero? + rescue Errno::ENOENT + File.delete(tmp_path) rescue Errno::ENOENT + end + def storage_stat_test(storage_name) - stat_path = File.join(path(storage_name), '.') + stat_path = File.join(storage_path(storage_name), '.') begin _, status = exec_with_timeout(%W{ stat #{stat_path} }) - status == 0 + status.zero? rescue Errno::ENOENT File.exist?(stat_path) && File::Stat.new(stat_path).readable? end @@ -90,7 +105,7 @@ module Gitlab _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin| stdin.write(RANDOM_STRING) end - status == 0 + status.zero? rescue Errno::ENOENT written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT written_bytes == RANDOM_STRING.length @@ -100,17 +115,33 @@ module Gitlab _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin| stdin.write(RANDOM_STRING) end - status == 0 + status.zero? rescue Errno::ENOENT file_contents = File.read(tmp_path) rescue Errno::ENOENT file_contents == RANDOM_STRING end - def delete_test_file(tmp_path) - _, status = exec_with_timeout(%W{ rm -f #{tmp_path} }) - status == 0 - rescue Errno::ENOENT - File.delete(tmp_path) rescue Errno::ENOENT + def storage_stat_metrics(storage_name) + operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do + with_timing { storage_stat_test(storage_name) } + end + end + + def storage_write_metrics(storage_name) + operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do + with_temp_file(storage_name) do |tmp_file_path| + with_timing { storage_write_test(tmp_file_path) } + end + end + end + + def storage_read_metrics(storage_name) + operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do + with_temp_file(storage_name) do |tmp_file_path| + storage_write_test(tmp_file_path) # writes data used by read test + with_timing { storage_read_test(tmp_file_path) } + end + end end end end |