summaryrefslogtreecommitdiff
path: root/lib/gitlab/untrusted_regexp.rb
blob: fe3377dae68c45a11b407c6574be6bd861dd1774 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# frozen_string_literal: true

module Gitlab
  # An untrusted regular expression is any regexp containing patterns sourced
  # from user input.
  #
  # Ruby's built-in regular expression library allows patterns which complete in
  # exponential time, permitting denial-of-service attacks.
  #
  # Not all regular expression features are available in untrusted regexes, and
  # there is a strict limit on total execution time. See the RE2 documentation
  # at https://github.com/google/re2/wiki/Syntax for more details.
  #
  # This class never changes its instance variables after initialization, which
  # allows instances to be frozen and stored in constants.
  class UntrustedRegexp
    require_dependency 're2'

    delegate :===, :source, to: :regexp

    # Compiles +pattern+ with RE2.
    #
    # @param pattern [String] the untrusted pattern
    # @param multiline [Boolean] when true, the pattern is prefixed with the
    #   `(?m)` flag before compilation
    # @raise [RegexpError] if RE2 reports that the pattern is not ok
    def initialize(pattern, multiline: false)
      pattern = "(?m)#{pattern}" if multiline

      @regexp = RE2::Regexp.new(pattern, log_errors: false)

      # Validate before deriving anything else from the compiled pattern.
      raise RegexpError, regexp.error unless regexp.ok?

      @scan_regexp = initialize_scan_regexp
    end

    # Delegates to RE2.GlobalReplace with this regexp.
    #
    # @param text [String] the text to operate on
    # @param rewrite [String] the replacement passed through to RE2
    def replace_all(text, rewrite)
      RE2.GlobalReplace(text, regexp, rewrite)
    end

    # There is no built-in replace with block support (like `gsub`).  We can accomplish
    # the same thing by parsing and rebuilding the string with the substitutions.
    #
    # Each match is searched for in the text remaining after the previous
    # match, and the block's return value is appended in place of the match.
    #
    # @param text [String] the text to operate on
    # @yieldparam matched [Object] whatever #match returned for the current match
    # @yieldreturn [String] the replacement for that match
    # @return [String] a new string with every match replaced
    def replace_gsub(text)
      new_text = +''
      remainder = text

      matched = match(remainder)

      until matched.nil? || matched.to_a.compact.empty?
        # Split on the offsets reported for the match rather than searching for
        # the matched text: the same text may occur earlier in `remainder` at a
        # position where the pattern does not match (e.g. `b$` against "bab").
        match_start = matched.begin(0)
        match_end = matched.end(0)

        new_text << remainder[0, match_start]
        new_text << yield(matched)
        remainder = remainder[match_end..-1]

        if match_start == match_end
          # A zero-length match consumes nothing, so matching again from the
          # same position would never terminate. Stop at the end of the text,
          # otherwise copy one character through unchanged and carry on.
          break if remainder.empty?

          new_text << remainder[0]
          remainder = remainder[1..-1]
        end

        matched = match(remainder)
      end

      new_text << remainder
    end

    # Collects every match of the regexp in +text+.
    #
    # @param text [String] the text to scan
    # @return [Array] one entry per match. When the pattern has no capture
    #   groups, the single wrapping group added for scanning is unwrapped so
    #   each entry is the first (only) element of the scan result.
    def scan(text)
      matches = scan_regexp.scan(text).to_a
      matches.map!(&:first) if regexp.number_of_capturing_groups == 0
      matches
    end

    # Runs the scan regexp's #match against +text+ and returns its result.
    def match(text)
      scan_regexp.match(text)
    end

    # Whether +text+ is present and contains at least one match.
    #
    # Blank text (per `present?`) short-circuits to a falsey result without
    # consulting the regexp.
    def match?(text)
      text.present? && scan(text).present?
    end

    # Delegates to RE2.Replace with this regexp.
    #
    # @param text [String] the text to operate on
    # @param rewrite [String] the replacement passed through to RE2
    def replace(text, rewrite)
      RE2.Replace(text, regexp, rewrite)
    end

    # #scan returns an array of the groups captured, rather than MatchData.
    # Use this to give the capture group name and grab the proper value
    #
    # @param name [String, Symbol] the capture group name (converted with `to_s`)
    # @param match [Array, nil] a single entry from the #scan result
    # @return the value at the named group's position, or nil when +match+ is falsey
    # @raise [RegexpError] if the regexp has no capture group called +name+
    def extract_named_group(name, match)
      return unless match

      match_position = regexp.named_capturing_groups[name.to_s]
      raise RegexpError, "Invalid named capture group: #{name}" unless match_position

      # The looked-up position is shifted down by one to index into +match+.
      match[match_position - 1]
    end

    # Two regexps are considered equal when their source patterns are equal.
    # Objects that do not expose #source are simply unequal, so comparing
    # against nil or an unrelated object does not raise.
    def ==(other)
      other.respond_to?(:source) && source == other.source
    end

    # Handles regular expressions with the preferred RE2 library where possible
    # via UntrustedRegexp. Falls back to Ruby's built-in regular expression library
    # when the syntax would be invalid in RE2.
    #
    # One difference between these is `(?m)` multi-line mode. Ruby regex enables
    # this by default, but also handles `^` and `$` differently.
    # See: https://www.regular-expressions.info/modifiers.html
    #
    # The `multiline` option is only applied to the RE2 regexp; the Ruby
    # fallback is built from the bare pattern.
    #
    # @raise [RegexpError] when RE2 rejects the pattern and the
    #   `disable_unsafe_regexp` feature flag is enabled
    def self.with_fallback(pattern, multiline: false)
      UntrustedRegexp.new(pattern, multiline: multiline)
    rescue RegexpError
      raise if Feature.enabled?(:disable_unsafe_regexp)

      if Feature.enabled?(:ci_unsafe_regexp_logger, type: :ops)
        Gitlab::AppJsonLogger.info(
          class: name,
          regexp: pattern.to_s,
          fabricated: 'unsafe ruby regexp'
        )
      end

      Regexp.new(pattern)
    end

    private

    attr_reader :regexp, :scan_regexp

    # RE2 scan operates differently to Ruby scan when there are no capture
    # groups, so work around it by wrapping the whole pattern in one group.
    # Patterns that already have capture groups are used as they are.
    def initialize_scan_regexp
      return regexp unless regexp.number_of_capturing_groups == 0

      # Same error-logging setting as the main regexp.
      RE2::Regexp.new("(#{regexp.source})", log_errors: false)
    end
  end
end