summaryrefslogtreecommitdiff
path: root/lib/gitlab/health_checks/fs_shards_check.rb
blob: ab18701b3bf1162e11ea2c067131bfb5056182ff (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
module Gitlab
  module HealthChecks
    class FsShardsCheck
      extend BaseAbstractCheck
      RANDOM_STRING = SecureRandom.hex(1000).freeze
      COMMAND_TIMEOUT = '1'.freeze
      TIMEOUT_EXECUTABLE = 'timeout'.freeze

      class << self
        def readiness
          repository_storages.map do |storage_name|
            begin
              if !storage_stat_test(storage_name)
                HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
              else
                with_temp_file(storage_name) do |tmp_file_path|
                  if !storage_write_test(tmp_file_path)
                    HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
                  elsif !storage_read_test(tmp_file_path)
                    HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
                  else
                    HealthChecks::Result.new(true, nil, shard: storage_name)
                  end
                end
              end
            rescue RuntimeError => ex
              message = "unexpected error #{ex} when checking storage #{storage_name}"
              Rails.logger.error(message)
              HealthChecks::Result.new(false, message, shard: storage_name)
            end
          end
        end

        def metrics
          res = []
          repository_storages.each do |storage_name|
            res << operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
              with_timing { storage_stat_test(storage_name) }
            end

            res << operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
              with_temp_file(storage_name) do |tmp_file_path|
                with_timing { storage_write_test(tmp_file_path) }
              end
            end

            res << operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
              with_temp_file(storage_name) do |tmp_file_path|
                storage_write_test(tmp_file_path) # writes data used by read test
                with_timing { storage_read_test(tmp_file_path) }
              end
            end
          end
          res.flatten
        end

        private

        def operation_metrics(ok_metric, latency_metric, **labels)
          result, elapsed = yield
          [
            metric(latency_metric, elapsed, **labels),
            metric(ok_metric, result ? 1 : 0, **labels)
          ]
        rescue RuntimeError => ex
          Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
          [metric(ok_metric, 0, **labels)]
        end

        def repository_storages
          @repository_storage ||= Gitlab::CurrentSettings.current_application_settings.repository_storages
        end

        def storages_paths
          @storage_paths ||= Gitlab.config.repositories.storages
        end

        def exec_with_timeout(cmd_args, *args, &block)
          Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
        end

        def with_temp_file(storage_name)
          begin
            temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
            yield temp_file_path
          ensure
            delete_test_file(temp_file_path)
          end
        end

        def path(storage_name)
          storages_paths&.dig(storage_name, 'path')
        end

        def storage_stat_test(storage_name)
          stat_path = File.join(path(storage_name), '.')
          begin
            _, status = exec_with_timeout(%W{ stat #{stat_path} })
            status == 0
          rescue Errno::ENOENT
            File.exist?(stat_path) && File::Stat.new(stat_path).readable?
          end
        end

        def storage_write_test(tmp_path)
          _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
          written_bytes == RANDOM_STRING.length
        end

        def storage_read_test(tmp_path)
          _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          file_contents = File.read(tmp_path) rescue Errno::ENOENT
          file_contents == RANDOM_STRING
        end

        def delete_test_file(tmp_path)
          _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
          status == 0
        rescue Errno::ENOENT
          File.delete(tmp_path) rescue Errno::ENOENT
        end
      end
    end
  end
end