# path: root/lib/gitlab/url_sanitizer.rb
# blob: 79e124a58f55c6a840e19bda6064dcd5cc5b205b (plain)
# frozen_string_literal: true

module Gitlab
  class UrlSanitizer
    include Gitlab::Utils::StrongMemoize

    # Schemes accepted by .valid? when the caller does not pass its own list;
    # also the schemes URI_REGEXP looks for.
    ALLOWED_SCHEMES = %w[http https ssh git].freeze
    # Schemes accepted by .valid_web?.
    ALLOWED_WEB_SCHEMES = %w[http https].freeze
    # Placeholder scheme that #schemify prepends to scheme-less input and that
    # #reverse_schemify strips again.
    SCHEMIFIED_SCHEME = 'glschemelessuri'
    SCHEMIFY_PLACEHOLDER = "#{SCHEMIFIED_SCHEME}://".freeze
    # Pattern used by .sanitize to find URLs inside arbitrary text.
    #
    # URI::DEFAULT_PARSER.make_regexp will only match URLs with schemes or
    # relative URLs. The second alternative below matches schemeless URIs with
    # userinfo (e.g. user:pass@gitlab.com) but will not match scp-style URIs
    # (e.g. user@server:path/to/file).
    #
    # The userinfo part is very loose compared to URI's implementation, so we
    # also match non-escaped userinfo, e.g. foo:b?r@gitlab.com, which should be
    # encoded as foo:b%3Fr@gitlab.com.
    URI_REGEXP = %r{
    (?:
       #{URI::DEFAULT_PARSER.make_regexp(ALLOWED_SCHEMES)}
     |
       (?:(?:(?!@)[%#{URI::REGEXP::PATTERN::UNRESERVED}#{URI::REGEXP::PATTERN::RESERVED}])+(?:@))
       (?# negative lookahead ensures this isn't an SCP-style URL: [host]:[rel_path|abs_path] server:path/to/file)
       (?!#{URI::REGEXP::PATTERN::HOST}:(?:#{URI::REGEXP::PATTERN::REL_PATH}|#{URI::REGEXP::PATTERN::ABS_PATH}))
       #{URI::REGEXP::PATTERN::HOSTPORT}
    )
    }x

    # Masks the credentials of every URL found in +content+.
    #
    # Each substring matched by URI_REGEXP is replaced with its #masked_url.
    # A match that Addressable rejects as an invalid URI is replaced with an
    # empty string instead.
    def self.sanitize(content)
      content.gsub(URI_REGEXP) do |found|
        begin
          new(found).masked_url
        rescue Addressable::URI::InvalidURIError
          ''
        end
      end
    end

    # Reports whether +url+ is a non-blank String whose scheme is listed in
    # +allowed_schemes+ (ALLOWED_SCHEMES unless the caller says otherwise).
    #
    # Surrounding whitespace is ignored. Returns false, rather than raising,
    # when Addressable considers the URL invalid.
    def self.valid?(url, allowed_schemes: ALLOWED_SCHEMES)
      return false unless url.present? && url.is_a?(String)

      scheme = Addressable::URI.parse(url.strip).scheme
      allowed_schemes.include?(scheme)
    rescue Addressable::URI::InvalidURIError
      false
    end

    # Same check as .valid?, but only the schemes in ALLOWED_WEB_SCHEMES are
    # accepted.
    def self.valid_web?(url)
      valid?(url, allowed_schemes: ALLOWED_WEB_SCHEMES)
    end

    # url         - the URL to wrap; it is stringified and parsed by #parse_url.
    # credentials - optional Hash that may carry :user and/or :password. Blank
    #               values are normalised to nil, and #parse_url fills blank
    #               entries from userinfo embedded in +url+.
    #
    # The Hash passed in is copied first, so the caller's object is never
    # modified and a frozen Hash is accepted. Errors raised while parsing the
    # URL are not rescued here.
    def initialize(url, credentials: nil)
      # Both the normalisation below and #parse_url write into this hash;
      # keep those writes away from the object the caller still owns.
      credentials = credentials&.dup

      %i[user password].each do |symbol|
        credentials[symbol] = credentials[symbol].presence if credentials&.key?(symbol)
      end

      @credentials = credentials
      @url = parse_url(url)
    end

    # Credentials in effect for this URL.
    #
    # When none were supplied to the constructor or extracted from the URL,
    # falls back to a { user:, password: } Hash built from the parsed URL
    # (blank parts become nil). The result is cached in @credentials.
    def credentials
      return @credentials if @credentials

      @credentials = { user: @url.user.presence, password: @url.password.presence }
    end

    # Returns the :user entry of #credentials.
    def user
      credentials[:user]
    end

    # The URL as a String with user and password removed and without the
    # placeholder scheme. Memoized via strong_memoize_attr.
    def sanitized_url
      credential_free = @url.dup
      # Clear the password before the user, on a copy so @url stays intact.
      credential_free.password = nil
      credential_free.user = nil

      reverse_schemify(credential_free.to_s)
    end
    strong_memoize_attr :sanitized_url

    # The URL as a String with any user and password each replaced by
    # "*****", and without the placeholder scheme. Parts that are blank are
    # left as they are. Memoized via strong_memoize_attr.
    def masked_url
      mask = "*****"
      masked = @url.dup

      masked.password = mask if masked.password.present?
      masked.user = mask if masked.user.present?

      reverse_schemify(masked.to_s)
    end
    strong_memoize_attr :masked_url

    # The URL as a String including the effective credentials, without the
    # placeholder scheme. Memoized via strong_memoize_attr.
    #
    # When #valid_credentials? is false the parsed URL is returned as is.
    # Otherwise the present password and user are percent-encoded with
    # #encode_percent and written onto a copy of the URL.
    def full_url
      if valid_credentials?
        authenticated = @url.dup

        secret = credentials[:password]
        authenticated.password = encode_percent(secret) if secret.present?

        login = credentials[:user]
        authenticated.user = encode_percent(login) if login.present?

        reverse_schemify(authenticated.to_s)
      else
        reverse_schemify(@url.to_s)
      end
    end
    strong_memoize_attr :full_url

    private

    # Turns +url+ into an Addressable::URI, pulling any embedded credentials
    # out of the string first.
    #
    # The input is stringified, stripped and run through #schemify. Text found
    # between "//" and an "@" is treated as raw "user:password" userinfo: it is
    # cut out of the string before parsing, used to fill in whichever of
    # @credentials[:user] / @credentials[:password] is still blank, and then
    # assigned to the parsed URI.
    #
    # Returns the parsed URI. Errors raised by Addressable while parsing are
    # not rescued here.
    def parse_url(url)
      url = schemify(url.to_s.strip)
      # Capture group 1 is the userinfo. Its `.+` is greedy, so when the string
      # holds several "@" the capture runs to a later one rather than stopping
      # at the first.
      match = url.match(%r{\A(?:(?:#{SCHEMIFIED_SCHEME}|git|ssh|http(?:s?)):)?//(?:(.+)(?:@))?(.+)}o)
      raw_credentials = match[1] if match

      if raw_credentials.present?
        # Remove the first "<userinfo>@" occurrence from the string that is
        # parsed below.
        url.sub!("#{raw_credentials}@", '')

        # Everything before the first ":" is the user, everything after it is
        # the password.
        user, _, password = raw_credentials.partition(':')

        # Entries already present in @credentials win; only blank ones are
        # filled from the URL.
        @credentials ||= {}
        @credentials[:user] = user.presence if @credentials[:user].blank?
        @credentials[:password] = password.presence if @credentials[:password].blank?
      end

      url = Addressable::URI.parse(url)
      # user and password are nil here when the branch above did not run.
      url.password = password if password.present?
      url.user = user if user.present?
      url
    end

    # Gives scheme-less input the placeholder scheme so it can be handled like
    # any other URL. Input that already starts with "<scheme>://" or with "//"
    # is returned unchanged.
    #
    # +url+ is modified in place; the same String object is returned.
    def schemify(url)
      return url if url.starts_with?(%r{(?:#{URI::REGEXP::PATTERN::SCHEME}:)?//}o)

      url.prepend(SCHEMIFY_PLACEHOLDER)
    end

    # Removes the placeholder scheme added by #schemify when +url+ starts with
    # it; any other input comes back with the same content.
    #
    # Always returns a new String and leaves +url+ untouched. Callers pass in
    # the #to_s of a URI object; stripping the prefix in place would also alter
    # that string for anyone else holding a reference to it, and would raise on
    # a frozen String.
    def reverse_schemify(url)
      url.delete_prefix(SCHEMIFY_PLACEHOLDER)
    end

    # True when #credentials is a Hash holding at least one truthy value.
    def valid_credentials?
      return false unless credentials.is_a?(Hash)

      credentials.each_value.any?
    end

    # Percent-encodes +string+ for use in the userinfo part of a URL.
    def encode_percent(string)
      form_encoded = CGI.escape(string)

      # CGI.escape turns spaces into "+", which git clone does not accept;
      # use "%20" instead.
      form_encoded.gsub('+', '%20')
    end
  end
end