summaryrefslogtreecommitdiff
path: root/lib/banzai/filter/external_link_filter.rb
blob: dc65e2abb46070488d825cacc9a4dc4cf8628dd9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# frozen_string_literal: true

module Banzai
  module Filter
    # HTML Filter to modify the attributes of external links.
    #
    # For every non-empty `<a href>` in the document that is not considered
    # internal, the filter escapes RTLO characters in the link text, adds a
    # warning tooltip for IDN / RTLO urls, and sets `rel`/`target` attributes.
    class ExternalLinkFilter < HTML::Pipeline::Filter
      # Schemes that receive the `rel`/`target` treatment. `nil` matches links
      # without a scheme as well as links whose url could not be parsed.
      SCHEMES      = ['http', 'https', nil].freeze
      # Unicode RIGHT-TO-LEFT OVERRIDE and its percent-encoded UTF-8 form.
      RTLO         = "\u202E"
      ENCODED_RTLO = '%E2%80%AE'

      # Processes every candidate link in `doc` and returns `doc`.
      def call
        links.each do |node|
          # URI.parse does stricter checking on the url than Addressable,
          # such as on `mailto:` links. Since we've been using it, do an
          # initial parse for validity and then use Addressable
          # for IDN support, etc
          strict_uri = uri_strict(node_src(node))
          link_uri = nil

          if strict_uri
            # Store the strictly parsed form back on the node, then re-read it
            # so Addressable sees exactly what ended up in the attribute.
            node.set_attribute(node_src_attribute(node), strict_uri.to_s)
            link_uri = addressable_uri(node_src(node))
          end

          # A nil `link_uri` (unparseable url) is deliberately treated as
          # external: `internal_url?` returns false for it.
          next if internal_url?(link_uri)

          punycode_autolink_node!(link_uri, node)
          sanitize_link_text!(node)
          add_malicious_tooltip!(link_uri, node)
          add_nofollow!(link_uri, node)
        end

        doc
      end

      private

      # if this is a link to a proxied image, then `src` is already the correct
      # proxied url, so work with the `data-canonical-src`
      def node_src_attribute(node)
        node['data-canonical-src'] ? 'data-canonical-src' : 'href'
      end

      # Value of whichever attribute `node_src_attribute` selects for `node`.
      def node_src(node)
        node[node_src_attribute(node)]
      end

      # Parses `href` with the standard library `URI`; returns nil instead of
      # raising when `URI` reports an error.
      def uri_strict(href)
        URI.parse(href)
      rescue URI::Error
        nil
      end

      # Parses `href` with Addressable; returns nil instead of raising when
      # Addressable considers the uri invalid.
      def addressable_uri(href)
        Addressable::URI.parse(href)
      rescue Addressable::URI::InvalidURIError
        nil
      end

      # All `<a>` elements that carry a non-empty `href`.
      def links
        query = 'descendant-or-self::a[@href and not(@href = "")]'
        doc.xpath(query)
      end

      # Whether `uri` points at this instance. A nil `uri` is never internal;
      # a uri without a hostname (relative url) always is.
      def internal_url?(uri)
        return false if uri.nil?
        # Relative URLs miss a hostname
        return true unless uri.hostname

        internal_host = internal_url.hostname
        return false unless internal_host

        # Host names are case-insensitive, so `GitLab.Example.com` must match
        # `gitlab.example.com`. `casecmp` folds ASCII letters only and yields
        # nil for incomparable strings, which the `== 0` check turns into false.
        uri.hostname.casecmp(internal_host) == 0
      end

      # Memoized, parsed form of the configured instance url.
      def internal_url
        @internal_url ||= URI.parse(Gitlab.config.gitlab.url)
      end

      # Only replace an autolink with an IDN with its punycode
      # version if we need emailable links.  Otherwise let it
      # be shown normally and the tooltips will show the
      # punycode version.
      def punycode_autolink_node!(uri, node)
        return unless uri
        return unless context[:emailable_links]

        unencoded_uri_str = Addressable::URI.unencode(node_src(node))

        # Only touch the text when it is the url itself (an autolink).
        return unless unencoded_uri_str == node.content && idn?(uri)

        node.content = uri.normalize.to_s
      end

      # escape any right-to-left (RTLO) characters in link text
      def sanitize_link_text!(node)
        html = node.inner_html
        # Skip the serialize/re-parse round trip when there is nothing to escape.
        return unless html.include?(RTLO)

        node.inner_html = html.gsub(RTLO, ENCODED_RTLO)
      end

      # If the domain is an international domain name (IDN),
      # let's expose with a tooltip in case it's intended
      # to be malicious. This is particularly useful for links
      # where the link text is not the same as the actual link.
      # We will continue to show the unicode version of the domain
      # in autolinked link text, which could contain emojis, etc.
      #
      # Also show the tooltip if the url contains the RTLO character,
      # as this is an indicator of a malicious link
      def add_malicious_tooltip!(uri, node)
        return unless idn?(uri) || has_encoded_rtlo?(uri)

        node.add_class('has-tooltip')
        node.set_attribute('title', uri.normalize.to_s)
      end

      # Sets `rel="nofollow noreferrer noopener"` and `target="_blank"` on links
      # whose scheme is listed in SCHEMES. A `rel` that was exactly `license`
      # is re-appended after the overwrite.
      def add_nofollow!(uri, node)
        return unless SCHEMES.include?(uri&.scheme)

        license = node.attribute('rel')&.value == 'license'
        node.set_attribute('rel', 'nofollow noreferrer noopener')
        node.kwattr_append('rel', 'license') if license
        node.set_attribute('target', '_blank')
      end

      # Truthy when the normalized host starts with the punycode prefix `xn--`.
      # Returns nil for a nil `uri` or a uri without a host.
      def idn?(uri)
        uri&.normalized_host&.start_with?('xn--')
      end

      # Whether the string form of `uri` contains a percent-encoded RTLO.
      # A nil `uri` stringifies to "" and therefore yields false.
      def has_encoded_rtlo?(uri)
        uri.to_s.include?(ENCODED_RTLO)
      end
    end
  end
end