summaryrefslogtreecommitdiff
path: root/app/models/merge_request_diff.rb
blob: 66b27aeac91006b6505de4487aa2e081c08942ba (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
# frozen_string_literal: true

class MergeRequestDiff < ApplicationRecord
  include Sortable
  include Importable
  include ManualInverseAssociation
  include EachBatch
  include Gitlab::Utils::StrongMemoize
  include ObjectStorage::BackgroundMove
  include BulkInsertableAssociations

  # Don't display more than 100 commits at once
  COMMITS_SAFE_SIZE = 100
  BATCH_SIZE = 1000

  # Applies to closed or merged MRs when determining whether to migrate their
  # diffs to external storage
  EXTERNAL_DIFF_CUTOFF = 7.days.freeze

  belongs_to :merge_request

  manual_inverse_association :merge_request, :merge_request_diff

  has_many :merge_request_diff_files,
    -> { order(:merge_request_diff_id, :relative_order) },
    inverse_of: :merge_request_diff

  has_many :merge_request_diff_commits, -> { order(:merge_request_diff_id, :relative_order) }

  validates :base_commit_sha, :head_commit_sha, :start_commit_sha, sha: true

  state_machine :state, initial: :empty do
    event :clean do
      transition any => :without_files
    end

    state :collected
    state :overflow
    # Diff files have been deleted by the system
    state :without_files
    # Deprecated states: these are no longer used but these values may still occur
    # in the database.
    state :timeout
    state :overflow_commits_safe_size
    state :overflow_diff_files_limit
    state :overflow_diff_lines_limit
  end

  scope :with_files, -> { without_states(:without_files, :empty) }
  scope :viewable, -> { without_state(:empty) }
  scope :by_commit_sha, ->(sha) do
    joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil)
  end
  scope :has_diff_files, -> { where(id: MergeRequestDiffFile.select(:merge_request_diff_id)) }

  scope :by_project_id, -> (project_id) do
    joins(:merge_request).where(merge_requests: { target_project_id: project_id })
  end

  scope :recent, -> { order(id: :desc).limit(100) }
  scope :files_in_database, -> { has_diff_files.where(stored_externally: [false, nil]) }

  scope :not_latest_diffs, -> do
    merge_requests = MergeRequest.arel_table
    mr_diffs = arel_table

    join_condition = merge_requests[:id].eq(mr_diffs[:merge_request_id])
      .and(mr_diffs[:id].not_eq(merge_requests[:latest_merge_request_diff_id]))

    arel_join = mr_diffs.join(merge_requests).on(join_condition)
    joins(arel_join.join_sources)
  end

  scope :old_merged_diffs, -> (before) do
    merge_requests = MergeRequest.arel_table
    mr_metrics = MergeRequest::Metrics.arel_table
    mr_diffs = arel_table

    mr_join = mr_diffs
      .join(merge_requests)
      .on(mr_diffs[:merge_request_id].eq(merge_requests[:id]))

    metrics_join_condition = mr_diffs[:merge_request_id]
      .eq(mr_metrics[:merge_request_id])
      .and(mr_metrics[:merged_at].not_eq(nil))

    metrics_join = mr_diffs.join(mr_metrics).on(metrics_join_condition)

    condition = MergeRequest.arel_table[:state_id].eq(MergeRequest.available_states[:merged])
      .and(MergeRequest::Metrics.arel_table[:merged_at].lteq(before))
      .and(MergeRequest::Metrics.arel_table[:merged_at].not_eq(nil))

    joins(metrics_join.join_sources, mr_join.join_sources).where(condition)
  end

  scope :old_closed_diffs, -> (before) do
    condition = MergeRequest.arel_table[:state_id].eq(MergeRequest.available_states[:closed])
      .and(MergeRequest::Metrics.arel_table[:latest_closed_at].lteq(before))

    joins(merge_request: :metrics).where(condition)
  end

  def self.ids_for_external_storage_migration(limit:)
    # No point doing any work unless the feature is enabled
    return [] unless Gitlab.config.external_diffs.enabled

    case Gitlab.config.external_diffs.when
    when 'always'
      files_in_database.limit(limit).pluck(:id)
    when 'outdated'
      # Outdated is too complex to be a single SQL query, so split into three
      before = EXTERNAL_DIFF_CUTOFF.ago

      ids = files_in_database
        .old_merged_diffs(before)
        .limit(limit)
        .pluck(:id)

      return ids if ids.size >= limit

      ids += files_in_database
        .old_closed_diffs(before)
        .limit(limit - ids.size)
        .pluck(:id)

      return ids if ids.size >= limit

      ids + files_in_database
        .not_latest_diffs
        .limit(limit - ids.size)
        .pluck(:id)
    else
      []
    end
  end

  mount_uploader :external_diff, ExternalDiffUploader

  # All diff information is collected from repository after object is created.
  # It allows you to override variables like head_commit_sha before getting diff.
  after_create :save_git_content, unless: :importing?
  after_create_commit :set_as_latest_diff, unless: :importing?

  after_save :update_external_diff_store

  def self.find_by_diff_refs(diff_refs)
    find_by(start_commit_sha: diff_refs.start_sha, head_commit_sha: diff_refs.head_sha, base_commit_sha: diff_refs.base_sha)
  end

  def viewable?
    collected? || without_files? || overflow?
  end

  # Collect information about commits and diff from repository
  # and save it to the database as serialized data
  def save_git_content
    ensure_commit_shas
    save_commits
    save_diffs

    # Another set of `after_save` hooks will be called here when we update the record
    save
    # We need to reset so that dirty tracking is reset when running the original set
    # of `after_save` hooks that come after this `after_create` hook. Otherwise, the
    # hooks that run when an attribute was changed are run twice.
    reset

    keep_around_commits unless importing?
  end

  def set_as_latest_diff
    MergeRequest
      .where('id = ? AND COALESCE(latest_merge_request_diff_id, 0) < ?', self.merge_request_id, self.id)
      .update_all(latest_merge_request_diff_id: self.id)
  end

  def ensure_commit_shas
    self.start_commit_sha ||= merge_request.target_branch_sha
    self.head_commit_sha  ||= merge_request.source_branch_sha
    self.base_commit_sha  ||= find_base_sha
  end

  # Override head_commit_sha to keep compatibility with merge request diff
  # created before version 8.4 that does not store head_commit_sha in separate db field.
  def head_commit_sha
    if persisted? && super.nil?
      last_commit_sha
    else
      super
    end
  end

  # This method will rely on repository branch sha
  # in case start_commit_sha is nil. Its necesarry for old merge request diff
  # created before version 8.4 to work
  def safe_start_commit_sha
    start_commit_sha || merge_request.target_branch_sha
  end

  def size
    real_size.presence || raw_diffs.size
  end

  def lines_count
    strong_memoize(:lines_count) do
      raw_diffs(limits: false).line_count
    end
  end

  def raw_diffs(options = {})
    if options[:ignore_whitespace_change]
      @diffs_no_whitespace ||= compare.diffs(options)
    else
      @raw_diffs ||= {}
      @raw_diffs[options] ||= load_diffs(options)
    end
  end

  def commits(limit: nil)
    strong_memoize(:"commits_#{limit || 'all'}") do
      load_commits(limit: limit)
    end
  end

  def last_commit_sha
    commit_shas(limit: 1).first
  end

  def first_commit
    commits.last
  end

  def last_commit
    commits.first
  end

  def base_commit
    return unless base_commit_sha

    project.commit_by(oid: base_commit_sha)
  end

  def start_commit
    return unless start_commit_sha

    project.commit_by(oid: start_commit_sha)
  end

  def head_commit
    return unless head_commit_sha

    project.commit_by(oid: head_commit_sha)
  end

  def commit_shas(limit: nil)
    merge_request_diff_commits.limit(limit).pluck(:sha)
  end

  def includes_any_commits?(shas)
    return false if shas.blank?

    # when the number of shas is huge (1000+) we don't want
    # to pass them all as an SQL param, let's pass them in batches
    shas.each_slice(BATCH_SIZE).any? do |batched_shas|
      merge_request_diff_commits.where(sha: batched_shas).exists?
    end
  end

  def diff_refs=(new_diff_refs)
    self.base_commit_sha = new_diff_refs&.base_sha
    self.start_commit_sha = new_diff_refs&.start_sha
    self.head_commit_sha = new_diff_refs&.head_sha
  end

  def diff_refs
    return unless start_commit_sha || base_commit_sha

    Gitlab::Diff::DiffRefs.new(
      base_sha:  base_commit_sha,
      start_sha: start_commit_sha,
      head_sha:  head_commit_sha
    )
  end

  # MRs created before 8.4 don't store their true diff refs (start and base),
  # but we need to get a commit SHA for the "View file @ ..." link by a file,
  # so we use an approximation of the diff refs if we can't get the actual one.
  #
  # These will not be the actual diff refs if the target branch was merged into
  # the source branch after the merge request was created, but it is good enough
  # for the specific purpose of linking to a commit.
  #
  # It is not good enough for highlighting diffs, so we can't simply pass
  # these as `diff_refs.`
  def fallback_diff_refs
    real_refs = diff_refs
    return real_refs if real_refs

    likely_base_commit_sha = (first_commit&.parent || first_commit)&.sha

    Gitlab::Diff::DiffRefs.new(
      base_sha:  likely_base_commit_sha,
      start_sha: safe_start_commit_sha,
      head_sha:  head_commit_sha
    )
  end

  def diff_refs_by_sha?
    base_commit_sha? && head_commit_sha? && start_commit_sha?
  end

  def diffs_in_batch(batch_page, batch_size, diff_options:)
    fetching_repository_diffs(diff_options) do |comparison|
      if comparison
        comparison.diffs_in_batch(batch_page, batch_size, diff_options: diff_options)
      else
        diffs_in_batch_collection(batch_page, batch_size, diff_options: diff_options)
      end
    end
  end

  def diffs(diff_options = nil)
    fetching_repository_diffs(diff_options) do |comparison|
      # It should fetch the repository when diffs are cleaned by the system.
      # We don't keep these for storage overload purposes.
      # See https://gitlab.com/gitlab-org/gitlab-foss/issues/37639
      if comparison
        comparison.diffs(diff_options)
      else
        diffs_collection(diff_options)
      end
    end
  end

  # Should always return the DB persisted diffs collection
  # (e.g. Gitlab::Diff::FileCollection::MergeRequestDiff.
  # It's useful when trying to invalidate old caches through
  # FileCollection::MergeRequestDiff#clear_cache!
  def diffs_collection(diff_options = nil)
    Gitlab::Diff::FileCollection::MergeRequestDiff.new(self, diff_options: diff_options)
  end

  def project
    merge_request.target_project
  end

  def compare
    @compare ||=
      Gitlab::Git::Compare.new(
        repository.raw_repository,
        safe_start_commit_sha,
        head_commit_sha
      )
  end

  def latest?
    self.id == merge_request.latest_merge_request_diff_id
  end

  # rubocop: disable CodeReuse/ServiceClass
  def compare_with(sha)
    # When compare merge request versions we want diff A..B instead of A...B
    # so we handle cases when user does squash and rebase of the commits between versions.
    # For this reason we set straight to true by default.
    CompareService.new(project, head_commit_sha).execute(project, sha, straight: true)
  end
  # rubocop: enable CodeReuse/ServiceClass

  def modified_paths(fallback_on_overflow: false)
    if fallback_on_overflow && overflow?
      # This is an extremely slow means to find the modified paths for a given
      #   MergeRequestDiff. This should be avoided, except where the limit of
      #   1_000 (as of %12.10) entries returned by the default behavior is an
      #   issue.
      strong_memoize(:overflowed_modified_paths) do
        project.repository.diff_stats(
          base_commit_sha,
          head_commit_sha
        ).paths
      end
    else
      strong_memoize(:modified_paths) do
        merge_request_diff_files.pluck(:new_path, :old_path).flatten.uniq
      end
    end
  end

  def update_external_diff_store
    return unless saved_change_to_external_diff? || saved_change_to_stored_externally?

    update_column(:external_diff_store, external_diff.object_store)
  end

  # If enabled, yields the external file containing the diff. Otherwise, yields
  # nil. This method is not thread-safe, but it *is* re-entrant, which allows
  # multiple merge_request_diff_files to load their data efficiently
  def opening_external_diff
    return yield(nil) unless stored_externally?
    return yield(@external_diff_file) if @external_diff_file

    external_diff.open do |file|
      @external_diff_file = file

      yield(@external_diff_file)
    ensure
      @external_diff_file = nil
    end
  end

  # Transactionally migrate the current merge_request_diff_files entries to
  # external storage. If external storage isn't an option for this diff, the
  # method is a no-op.
  def migrate_files_to_external_storage!
    return if stored_externally? || !use_external_diff? || merge_request_diff_files.count == 0

    rows = build_merge_request_diff_files(merge_request_diff_files)

    transaction do
      MergeRequestDiffFile.where(merge_request_diff_id: id).delete_all
      create_merge_request_diff_files(rows)
      save!
    end

    merge_request_diff_files.reset
  end

  # Transactionally migrate the current merge_request_diff_files entries from
  # external storage, back to the database. This is the rollback operation for
  # +migrate_files_to_external_storage!+
  #
  # If this diff isn't in external storage, the method is a no-op.
  def migrate_files_to_database!
    return unless stored_externally?
    return if merge_request_diff_files.count == 0

    rows = convert_external_diffs_to_database

    transaction do
      MergeRequestDiffFile.where(merge_request_diff_id: id).delete_all
      Gitlab::Database.bulk_insert('merge_request_diff_files', rows) # rubocop:disable Gitlab/BulkInsert
      update!(stored_externally: false)
    end

    # Only delete the external diff file after the contents have been saved to
    # the database
    remove_external_diff!
    merge_request_diff_files.reset
  end

  private

  def convert_external_diffs_to_database
    opening_external_diff do |external_file|
      merge_request_diff_files.map do |diff_file|
        row = diff_file.attributes.except('diff')

        raise "Diff file lacks external diff offset or size: #{row.inspect}" unless
          row['external_diff_offset'] && row['external_diff_size']

        # The diff in the external file is already base64-encoded if necessary,
        # matching the 'binary' attribute of the row. Reading it directly allows
        # a cycle of decode-encode to be skipped
        external_file.seek(row.delete('external_diff_offset'))
        row['diff'] = external_file.read(row.delete('external_diff_size'))

        row
      end
    end
  end

  def diffs_in_batch_collection(batch_page, batch_size, diff_options:)
    Gitlab::Diff::FileCollection::MergeRequestDiffBatch.new(self,
                                                            batch_page,
                                                            batch_size,
                                                            diff_options: diff_options)
  end

  def encode_in_base64?(diff_text)
    (diff_text.encoding == Encoding::BINARY && !diff_text.ascii_only?) ||
      diff_text.include?("\0")
  end

  def build_external_merge_request_diff_files(rows)
    tempfile = build_external_diff_tempfile(rows)

    self.external_diff = tempfile
    self.stored_externally = true

    rows
  ensure
    tempfile&.unlink
  end

  def create_merge_request_diff_files(rows)
    rows = build_external_merge_request_diff_files(rows) if use_external_diff?

    # Faster inserts
    Gitlab::Database.bulk_insert('merge_request_diff_files', rows) # rubocop:disable Gitlab/BulkInsert
  end

  def build_external_diff_tempfile(rows)
    Tempfile.open(external_diff.filename) do |file|
      rows.each do |row|
        data = row.delete(:diff)
        row[:external_diff_offset] = file.pos
        row[:external_diff_size] = data.bytesize

        file.write(data)
      end

      file
    end
  end

  def build_merge_request_diff_files(diffs)
    diffs.map.with_index do |diff, index|
      diff_hash = diff.to_hash.merge(
        binary: false,
        merge_request_diff_id: self.id,
        relative_order: index
      )

      # Compatibility with old diffs created with Psych.
      diff_hash.tap do |hash|
        diff_text = hash[:diff]

        if encode_in_base64?(diff_text)
          hash[:binary] = true
          hash[:diff] = [diff_text].pack('m0')
        end
      end
    end
  end

  # Yields the block with the repository Compare object if it should
  # fetch diffs from the repository instead DB.
  def fetching_repository_diffs(diff_options)
    return unless block_given?

    diff_options ||= {}

    # Can be read as: fetch the persisted diffs if yielded without the
    # Compare object.
    return yield unless without_files? || diff_options[:ignore_whitespace_change]
    return yield unless diff_refs&.complete?

    comparison = diff_refs.compare_in(repository.project)

    return yield unless comparison

    yield(comparison)
  end

  def use_external_diff?
    return false unless Gitlab.config.external_diffs.enabled

    case Gitlab.config.external_diffs.when
    when 'always'
      true
    when 'outdated'
      outdated_by_merge? || outdated_by_closure? || old_version?
    else
      false # Disable external diffs if misconfigured
    end
  end

  def outdated_by_merge?
    return false unless merge_request&.metrics&.merged_at

    merge_request.merged? && merge_request.metrics.merged_at < EXTERNAL_DIFF_CUTOFF.ago
  end

  def outdated_by_closure?
    return false unless merge_request&.metrics&.latest_closed_at

    merge_request.closed? && merge_request.metrics.latest_closed_at < EXTERNAL_DIFF_CUTOFF.ago
  end

  def old_version?
    latest_id = MergeRequest
      .where(id: merge_request_id)
      .limit(1)
      .pluck(:latest_merge_request_diff_id)
      .first

    latest_id && self.id < latest_id
  end

  def load_diffs(options)
    # Ensure all diff files operate on the same external diff file instance if
    # present. This reduces file open/close overhead.
    opening_external_diff do
      collection = merge_request_diff_files

      if options[:include_context_commits]
        collection += merge_request.merge_request_context_commit_diff_files
      end

      if paths = options[:paths]
        collection = collection.where('old_path IN (?) OR new_path IN (?)', paths, paths)
      end

      Gitlab::Git::DiffCollection.new(collection.map(&:to_hash), options)
    end
  end

  def load_commits(limit: nil)
    commits = merge_request_diff_commits.limit(limit)
      .map { |commit| Commit.from_hash(commit.to_hash, project) }

    CommitCollection
      .new(merge_request.source_project, commits, merge_request.source_branch)
  end

  def save_diffs
    new_attributes = {}

    if compare.commits.size.zero?
      new_attributes[:state] = :empty
    else
      diff_collection = compare.diffs(Commit.max_diff_options)
      new_attributes[:real_size] = diff_collection.real_size

      if diff_collection.any?
        new_attributes[:state] = :collected

        rows = build_merge_request_diff_files(diff_collection)
        create_merge_request_diff_files(rows)
      end

      # Set our state to 'overflow' to make the #empty? and #collected?
      # methods (generated by StateMachine) return false.
      #
      # This attribution has to come at the end of the method so 'overflow'
      # state does not get overridden by 'collected'.
      new_attributes[:state] = :overflow if diff_collection.overflow?
    end

    assign_attributes(new_attributes)
  end

  def save_commits
    MergeRequestDiffCommit.create_bulk(self.id, compare.commits.reverse)

    # merge_request_diff_commits.reset is preferred way to reload associated
    # objects but it returns cached result for some reason in this case
    # we can circumvent that by specifying that we need an uncached reload
    commits = self.class.uncached { merge_request_diff_commits.reset }
    self.commits_count = commits.size
  end

  def repository
    project.repository
  end

  def find_base_sha
    return unless head_commit_sha && start_commit_sha

    project.merge_base_commit(head_commit_sha, start_commit_sha).try(:sha)
  end

  def keep_around_commits
    [repository, merge_request.source_project.repository].uniq.each do |repo|
      repo.keep_around(start_commit_sha, head_commit_sha, base_commit_sha)
    end
  end
end

MergeRequestDiff.prepend_if_ee('EE::MergeRequestDiff')