summaryrefslogtreecommitdiff
path: root/lib/gitlab/health_checks/fs_shards_check.rb
blob: a533d4364ef54a2dcf1f84b95415add9546500d3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
module Gitlab
  module HealthChecks
    # Health check that probes every configured repository storage shard.
    #
    # Two public entry points are exposed as singleton methods:
    #
    # * `readiness` - one `HealthChecks::Result` per shard, telling whether the
    #   shard passes the circuit breaker, can be stat'ed, written to and read from.
    # * `metrics`   - a flat list of success/latency metrics for the same probes.
    #
    # `metric` and `with_timing` are not defined here; they are presumably
    # provided by `BaseAbstractCheck`.
    class FsShardsCheck
      extend BaseAbstractCheck
      # Payload written to and read back from every shard.
      RANDOM_STRING = SecureRandom.hex(1000).freeze
      # Duration argument handed to the `timeout` executable.
      COMMAND_TIMEOUT = '1'.freeze
      TIMEOUT_EXECUTABLE = 'timeout'.freeze

      class << self
        # Runs the readiness probes against every configured storage.
        #
        # @return [Array<HealthChecks::Result>] one result per storage name
        def readiness
          repository_storages.map { |storage_name| shard_readiness(storage_name) }
        end

        # Collects stat/write/read/circuit breaker metrics for every configured storage.
        #
        # @return [Array] flat list of the values built by `metric`
        def metrics
          repository_storages.flat_map do |storage_name|
            [
              storage_stat_metrics(storage_name),
              storage_write_metrics(storage_name),
              storage_read_metrics(storage_name),
              storage_circuitbreaker_metrics(storage_name)
            ].flatten
          end
        end

        private

        # Probes a single storage; the first failing probe decides the message.
        #
        # Any unexpected error is logged and converted into a failed result, so
        # that one broken shard cannot prevent the remaining shards from being
        # reported. `StandardError` is rescued because system call errors
        # (`Errno::*`) are not `RuntimeError`s.
        #
        # @param storage_name [String] key of the storage in the configuration
        # @return [HealthChecks::Result]
        def shard_readiness(storage_name)
          if !storage_circuitbreaker_test(storage_name)
            failed_shard(storage_name, 'circuitbreaker tripped')
          elsif !storage_stat_test(storage_name)
            failed_shard(storage_name, 'cannot stat storage')
          else
            with_temp_file(storage_name) do |tmp_file_path|
              if !storage_write_test(tmp_file_path)
                failed_shard(storage_name, 'cannot write to storage')
              elsif !storage_read_test(tmp_file_path)
                failed_shard(storage_name, 'cannot read from storage')
              else
                HealthChecks::Result.new(true, nil, shard: storage_name)
              end
            end
          end
        rescue StandardError => ex
          message = "unexpected error #{ex} when checking storage #{storage_name}"
          Rails.logger.error(message)
          failed_shard(storage_name, message)
        end

        # Builds the failed result for a storage.
        def failed_shard(storage_name, message)
          HealthChecks::Result.new(false, message, shard: storage_name)
        end

        # Turns the outcome of a timed probe into metrics.
        #
        # The block must return a `[result, elapsed]` pair. When the probe
        # raises, the error is logged and only the "ok" metric is emitted, with
        # value 0.
        #
        # @param ok_metric [Symbol] name of the success metric (1 or 0)
        # @param latency_metric [Symbol] name of the latency metric
        # @param labels [Hash] labels attached to both metrics
        # @return [Array] the metrics built by `metric`
        def operation_metrics(ok_metric, latency_metric, **labels)
          result, elapsed = yield
          [
            metric(latency_metric, elapsed, **labels),
            metric(ok_metric, result ? 1 : 0, **labels)
          ]
        rescue StandardError => ex
          Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
          [metric(ok_metric, 0, **labels)]
        end

        # Names of the configured storages (memoized).
        def repository_storages
          @repository_storage ||= storages_paths.keys
        end

        # Storage configuration, keyed by storage name (memoized).
        def storages_paths
          @storage_paths ||= Gitlab.config.repositories.storages
        end

        # Runs `cmd_args` through the `timeout` executable so that a hanging
        # storage cannot block the check; extra arguments and the block are
        # forwarded to `Gitlab::Popen.popen` untouched.
        def exec_with_timeout(cmd_args, *args, &block)
          Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT, *cmd_args], *args, &block)
        end

        # Yields a unique file name inside the storage directory and removes
        # that file afterwards. Only the name is generated here; creating the
        # file is left to the block.
        #
        # @raise [RuntimeError] when the storage has no configured path
        def with_temp_file(storage_name)
          storage_dir = storage_path(storage_name)
          # Dir::Tmpname.create substitutes the system tmpdir for a nil directory,
          # which would make the probes exercise the wrong filesystem.
          raise "no path configured for storage #{storage_name}" unless storage_dir

          temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_dir) { |path| path }
          yield temp_file_path
        ensure
          # Nothing to clean up when no name could be generated.
          delete_test_file(temp_file_path) if temp_file_path
        end

        # Configured path of the storage, or nil when it cannot be found.
        def storage_path(storage_name)
          storages_paths&.dig(storage_name, 'path')
        end

        # All test methods below use shell commands to perform actions on storage volumes.
        # In case a storage volume has connectivity problems causing a pure Ruby IO operation
        # to wait indefinitely, we can rely on shell commands being terminated once `timeout`
        # kills them.
        #
        # However, we also fall back to pure Ruby file operations in case a specific shell
        # command is missing, so we are still able to perform health checks and gather metrics
        # on such a system.

        # Removes the probe file, best effort.
        #
        # @return [Boolean] whether the file is known to be gone
        def delete_test_file(tmp_path)
          _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
          status.zero?
        rescue Errno::ENOENT
          # The executable is missing: fall back to Ruby.
          begin
            File.delete(tmp_path)
            true
          rescue Errno::ENOENT
            true # already gone, which `rm -f` treats as success as well
          rescue StandardError
            false # cleanup must never mask the outcome of the probe itself
          end
        end

        # Checks that the storage directory can be stat'ed.
        #
        # @return [Boolean] false when the storage has no configured path
        def storage_stat_test(storage_name)
          storage_dir = storage_path(storage_name)
          return false unless storage_dir

          stat_path = File.join(storage_dir, '.')
          begin
            _, status = exec_with_timeout(%W{ stat #{stat_path} })
            status.zero?
          rescue Errno::ENOENT
            # The executable is missing: fall back to Ruby.
            File.exist?(stat_path) && File::Stat.new(stat_path).readable?
          end
        end

        # Writes RANDOM_STRING to `tmp_path`.
        #
        # @return [Boolean] whether the write succeeded
        def storage_write_test(tmp_path)
          _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status.zero?
        rescue Errno::ENOENT
          # The executable is missing: fall back to Ruby. Any failure to write
          # is the negative answer this probe exists to report.
          begin
            File.write(tmp_path, RANDOM_STRING) == RANDOM_STRING.bytesize
          rescue StandardError
            false
          end
        end

        # Compares the contents of `tmp_path` with RANDOM_STRING.
        #
        # @return [Boolean] whether the file could be read back unchanged
        def storage_read_test(tmp_path)
          _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status.zero?
        rescue Errno::ENOENT
          # The executable is missing: fall back to Ruby. Any failure to read
          # is the negative answer this probe exists to report.
          begin
            File.read(tmp_path) == RANDOM_STRING
          rescue StandardError
            false
          end
        end

        # Runs a no-op through the storage circuit breaker.
        #
        # @return [String, nil] "OK", or nil when the storage is inaccessible
        def storage_circuitbreaker_test(storage_name)
          Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" }
        rescue Gitlab::Git::Storage::Inaccessible
          nil
        end

        # Metrics for the stat probe.
        def storage_stat_metrics(storage_name)
          operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
            with_timing { storage_stat_test(storage_name) }
          end
        end

        # Metrics for the write probe.
        def storage_write_metrics(storage_name)
          operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
            with_temp_file(storage_name) do |tmp_file_path|
              with_timing { storage_write_test(tmp_file_path) }
            end
          end
        end

        # Metrics for the read probe; only the read itself is timed.
        def storage_read_metrics(storage_name)
          operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
            with_temp_file(storage_name) do |tmp_file_path|
              storage_write_test(tmp_file_path) # writes data used by read test
              with_timing { storage_read_test(tmp_file_path) }
            end
          end
        end

        # Metrics for the circuit breaker probe.
        def storage_circuitbreaker_metrics(storage_name)
          operation_metrics(:filesystem_circuitbreaker,
                            :filesystem_circuitbreaker_latency_seconds,
                            shard: storage_name) do
            with_timing { storage_circuitbreaker_test(storage_name) }
          end
        end
      end
    end
  end
end