summaryrefslogtreecommitdiff
path: root/lib/gitlab/background_migration/prepare_untracked_uploads.rb
blob: 81ca2b0a9b7acdcd32ffa2ea9a9287d6566a7bc0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# frozen_string_literal: true

module Gitlab
  module BackgroundMigration
    # Finds all non-hashed uploaded file paths on disk and saves them to the
    # `untracked_files_for_uploads` table. When at least one path was stored,
    # FOLLOW_UP_MIGRATION jobs are queued over the stored rows; otherwise the
    # temporary table is dropped again.
    class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength
      # For bulk_queue_background_migration_jobs_by_range
      include Database::MigrationHelpers
      include ::Gitlab::Utils::StrongMemoize

      # Number of file paths handed to each bulk INSERT.
      FIND_BATCH_SIZE = 500
      RELATIVE_UPLOAD_DIR = "uploads".freeze
      # Directory that is searched for uploaded files.
      ABSOLUTE_UPLOAD_DIR = File.join(
        Gitlab.config.uploads.storage_path,
        RELATIVE_UPLOAD_DIR
      )
      # Name of the background migration queued once paths have been stored.
      FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze
      # Matches the storage root (plus the following slash) at the start of an
      # absolute path so that it can be stripped before the path is stored.
      # The configured path is escaped so that any regex metacharacters it
      # contains (`+`, `(`, `.` ...) are matched literally.
      START_WITH_ROOT_REGEX =
        %r{\A#{Regexp.escape(Gitlab.config.uploads.storage_path.to_s)}/}
      # `find -path` patterns for files that must not be recorded.
      EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze
      EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze

      # This class is used to iterate over batches of
      # `untracked_files_for_uploads` rows.
      class UntrackedFile < ActiveRecord::Base
        include EachBatch

        self.table_name = 'untracked_files_for_uploads'
      end

      # Entry point of the migration: makes sure the tracking table exists,
      # fills it with the file paths found on disk and then either schedules
      # the follow-up jobs or, when nothing was found, drops the table.
      def perform
        ensure_temporary_tracking_table_exists

        # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since
        # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be
        # slow, start with an empty table for Postgres < 9.5.
        # That way we can do bulk inserts at ~30x the speed of individual
        # inserts (~20 minutes worth of inserts at GitLab.com scale instead of
        # ~10 hours).
        # In all other cases, installations will get both bulk inserts and the
        # ability for these jobs to retry without having to clear and reinsert.
        clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?

        store_untracked_file_paths

        # `exists?` only has to find a single row, whereas `all.empty?` may
        # count the whole (potentially very large) table.
        if UntrackedFile.exists?
          schedule_populate_untracked_uploads_jobs
        else
          drop_temp_table
        end
      end

      private

      # Creates the `untracked_files_for_uploads` table (a unique, non-null
      # `path` column) unless it is already present.
      def ensure_temporary_tracking_table_exists
        table_name = :untracked_files_for_uploads
        conn = UntrackedFile.connection

        return if conn.data_source_exists?(table_name)

        conn.create_table table_name do |t|
          t.string :path, limit: 600, null: false
          t.index :path, unique: true
        end
      end

      # Removes every row from the tracking table.
      def clear_untracked_file_paths
        UntrackedFile.delete_all
      end

      # Inserts the paths of all files found below ABSOLUTE_UPLOAD_DIR, in
      # batches of FIND_BATCH_SIZE. Does nothing when the directory is missing.
      def store_untracked_file_paths
        return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)

        each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) do |file_paths|
          insert_file_paths(file_paths)
        end
      end

      # Runs the find command for `search_dir` and yields the paths it prints
      # to the given block in arrays of at most `batch_size` entries.
      #
      # Raises a RuntimeError when the command exits unsuccessfully. The exit
      # status is only checked after all output has been consumed.
      def each_file_batch(search_dir, batch_size, &block)
        cmd = build_find_command(search_dir)

        Open3.popen2(*cmd) do |_stdin, stdout, status_thread|
          yield_paths_in_batches(stdout, batch_size, &block)

          raise "Find command failed" unless status_thread.value.success?
        end
      end

      # Reads NUL-separated paths from `stdout`, strips the storage root from
      # the front of each one and yields them in arrays of at most
      # `batch_size` entries. Nothing is yielded when there is no output.
      def yield_paths_in_batches(stdout, batch_size, &block)
        stdout.each_line("\0").each_slice(batch_size) do |lines|
          yield(lines.map { |line| line.chomp("\0").sub(START_WITH_ROOT_REGEX, '') })
        end
      end

      # Builds the argument array of the find command that prints, separated
      # by NUL bytes, every regular file below `search_dir` (following
      # symlinks) that does not match one of the excluded path patterns.
      # The command is prefixed with `ionice -c Idle` when ionice was found,
      # and is written to the Rails log before being returned.
      #
      # NOTE(review): because `-type f` is tested first, the `-prune` actions
      # look like they only ever see regular files, so the excluded
      # directories are presumably still traversed (their files are merely
      # filtered out) - confirm before relying on the prune for speed.
      def build_find_command(search_dir)
        cmd = %W[find -L #{search_dir}
                 -type f
                 ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune )
                 ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune )
                 -print0]

        ionice = which_ionice
        cmd = %W[#{ionice} -c Idle] + cmd if ionice

        log_msg = "PrepareUntrackedUploads find command: \"#{cmd.join(' ')}\""
        Rails.logger.info log_msg

        cmd
      end

      # Returns whatever Gitlab::Utils.which reports for `ionice`, or false
      # when the lookup raises. A falsy result means ionice is not used.
      def which_ionice
        Gitlab::Utils.which('ionice')
      rescue StandardError
        # In this case, returning false is relatively safe,
        # even though it isn't very nice
        false
      end

      # Bulk-inserts `file_paths` into the tracking table with one statement.
      # An empty batch is skipped, as it would produce invalid SQL.
      def insert_file_paths(file_paths)
        return if file_paths.empty?

        ActiveRecord::Base.connection.execute(insert_sql(file_paths))
      end

      # Returns the INSERT statement for `file_paths`, using the
      # ignore-duplicates syntax of the current database where one exists.
      def insert_sql(file_paths)
        rows = table_columns_and_values_for_insert(file_paths)

        if postgresql_pre_9_5?
          "INSERT INTO #{rows};"
        elsif postgresql?
          "INSERT INTO #{rows} ON CONFLICT DO NOTHING;"
        else # MySQL
          "INSERT IGNORE INTO #{rows};"
        end
      end

      # Returns the `<table> (path) VALUES (...), (...)` fragment shared by
      # all INSERT variants; every path is quoted through ActiveRecord.
      def table_columns_and_values_for_insert(file_paths)
        values = file_paths.map do |file_path|
          ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend
        end.join(', ')

        "#{UntrackedFile.table_name} (path) VALUES #{values}"
      end

      # Memoized Gitlab::Database.postgresql?.
      def postgresql?
        strong_memoize(:postgresql) do
          Gitlab::Database.postgresql?
        end
      end

      # True for every database except PostgreSQL older than 9.5.
      def can_bulk_insert_and_ignore_duplicates?
        !postgresql_pre_9_5?
      end

      # Memoized check for a PostgreSQL database whose version is below 9.5.
      def postgresql_pre_9_5?
        strong_memoize(:postgresql_pre_9_5) do
          postgresql? && Gitlab::Database.version.to_f < 9.5
        end
      end

      # Queues FOLLOW_UP_MIGRATION jobs over the stored UntrackedFile rows.
      def schedule_populate_untracked_uploads_jobs
        bulk_queue_background_migration_jobs_by_range(
          UntrackedFile, FOLLOW_UP_MIGRATION)
      end

      # Drops the tracking table, except in the test environment.
      def drop_temp_table
        # Dropping a table intermittently breaks test cleanup
        return if Rails.env.test?

        UntrackedFile.connection.drop_table(:untracked_files_for_uploads,
                                            if_exists: true)
      end
    end
  end
end