summaryrefslogtreecommitdiff
path: root/lib/banzai/filter/external_link_filter.rb
blob: 67019454e44bdf8e2b6374180b547f1299b0775d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# frozen_string_literal: true

module Banzai
  module Filter
    # HTML Filter to modify the attributes of external links.
    #
    # For every `<a>` element with a non-empty `href`, the link target is
    # parsed and compared against the hostname of this GitLab instance.
    # Links that are not internal (including links whose URL cannot be
    # parsed at all) get:
    #
    # * their link text replaced by the punycode URL, for IDN autolinks
    #   when `context[:emailable_links]` is set,
    # * right-to-left override characters in the link text percent-encoded,
    # * a tooltip exposing the normalized URL when the host is an IDN or
    #   the URL contains a percent-encoded RTLO character,
    # * `rel="nofollow noreferrer noopener"` and `target="_blank"` for
    #   http(s), scheme-less and unparseable URLs.
    #
    # NOTE(review): `doc` and `context` are not defined in this class; they
    # are presumably provided by HTML::Pipeline::Filter - confirm.
    class ExternalLinkFilter < HTML::Pipeline::Filter
      # Schemes that receive the nofollow treatment. `nil` covers both
      # URLs without a scheme and URLs that could not be parsed.
      SCHEMES      = ['http', 'https', nil].freeze
      # Unicode RIGHT-TO-LEFT OVERRIDE character.
      RTLO         = "\u202E"
      # Percent-encoded UTF-8 form of RTLO (upper-case hex digits).
      ENCODED_RTLO = '%E2%80%AE'

      # Processes every link in the document in place.
      #
      # @return the (mutated) document, `doc`
      def call
        links.each do |node|
          # URI.parse does stricter checking on the url than Addressable,
          # such as on `mailto:` links. Since we've been using it, do an
          # initial parse for validity and then use Addressable
          # for IDN support, etc
          strict_uri = uri_strict(node_src(node))
          parsed_uri = nil

          if strict_uri
            # Write the strictly-parsed form back first so that the
            # Addressable parse below sees exactly what ends up in the HTML.
            node.set_attribute(node_src_attribute(node), strict_uri.to_s)
            parsed_uri = addressable_uri(node_src(node))
          end

          # A nil `parsed_uri` (unparseable URL) is never internal, so such
          # links still receive the external-link treatment below.
          next if internal_url?(parsed_uri)

          punycode_autolink_node!(parsed_uri, node)
          sanitize_link_text!(node)
          add_malicious_tooltip!(parsed_uri, node)
          add_nofollow!(parsed_uri, node)
        end

        doc
      end

      private

      # if this is a link to a proxied image, then `src` is already the correct
      # proxied url, so work with the `data-canonical-src`
      #
      # @return [String] name of the attribute holding the link target
      def node_src_attribute(node)
        node['data-canonical-src'] ? 'data-canonical-src' : 'href'
      end

      # @return the value of the attribute chosen by `node_src_attribute`
      def node_src(node)
        node[node_src_attribute(node)]
      end

      # Strict parse using Ruby's stdlib URI.
      #
      # @return [URI::Generic, nil] nil when the URL is rejected
      def uri_strict(href)
        URI.parse(href)
      rescue URI::Error
        nil
      end

      # Lenient parse using Addressable.
      #
      # @return [Addressable::URI, nil] nil when the URL is rejected
      def addressable_uri(href)
        Addressable::URI.parse(href)
      rescue Addressable::URI::InvalidURIError
        nil
      end

      # All anchor elements in the document that carry a non-empty `href`.
      def links
        query = 'descendant-or-self::a[@href and not(@href = "")]'
        doc.xpath(query)
      end

      # Whether `uri` points at this GitLab instance.
      #
      # Hostnames are compared case-insensitively, since the host component
      # of a URI is case-insensitive: `http://GITLAB.example.com` and
      # `http://gitlab.example.com` address the same server.
      #
      # @param uri [Addressable::URI, nil]
      # @return [Boolean] false for nil (unparseable) URIs
      def internal_url?(uri)
        return false if uri.nil?
        # Relative URLs miss a hostname
        return true unless uri.hostname

        uri.hostname.downcase == internal_url.hostname&.downcase
      end

      # The configured URL of this instance, parsed once and memoized.
      def internal_url
        @internal_url ||= URI.parse(Gitlab.config.gitlab.url)
      end

      # Only replace an autolink with an IDN with its punycode
      # version if we need emailable links.  Otherwise let it
      # be shown normally and the tooltips will show the
      # punycode version.
      #
      # A link counts as an autolink here when its text equals the
      # unencoded link target.
      def punycode_autolink_node!(uri, node)
        return unless uri
        return unless context[:emailable_links]

        unencoded_uri_str = Addressable::URI.unencode(node_src(node))

        if unencoded_uri_str == node.content && idn?(uri)
          node.content = uri.normalize
        end
      end

      # escape any right-to-left (RTLO) characters in link text
      def sanitize_link_text!(node)
        node.inner_html = node.inner_html.gsub(RTLO, ENCODED_RTLO)
      end

      # If the domain is an international domain name (IDN),
      # let's expose with a tooltip in case it's intended
      # to be malicious. This is particularly useful for links
      # where the link text is not the same as the actual link.
      # We will continue to show the unicode version of the domain
      # in autolinked link text, which could contain emojis, etc.
      #
      # Also show the tooltip if the url contains the RTLO character,
      # as this is an indicator of a malicious link
      def add_malicious_tooltip!(uri, node)
        # Both predicates are falsy for a nil uri, so `uri.normalize`
        # below is only reached with a parsed URI.
        if idn?(uri) || has_encoded_rtlo?(uri)
          node.add_class('has-tooltip')
          node.set_attribute('title', uri.normalize)
        end
      end

      # Marks the link as nofollow and makes it open in a new tab when its
      # scheme is listed in SCHEMES (a nil uri yields a nil scheme, which
      # is listed).
      def add_nofollow!(uri, node)
        if SCHEMES.include?(uri&.scheme)
          node.set_attribute('rel', 'nofollow noreferrer noopener')
          node.set_attribute('target', '_blank')
        end
      end

      # Whether the normalized host is punycode-encoded (`xn--` prefix).
      # Returns nil when `uri` or its normalized host is nil.
      def idn?(uri)
        uri&.normalized_host&.start_with?('xn--')
      end

      # Whether the URL contains a percent-encoded RTLO character.
      #
      # The hex digits of a percent-encoded octet are case-insensitive, so
      # `%e2%80%ae` is the same character as `%E2%80%AE`. The URL is
      # upcased before matching against the (upper-case) ENCODED_RTLO so
      # that a lower- or mixed-case encoding cannot slip past the check.
      #
      # @param uri [Addressable::URI, nil]
      # @return [Boolean] false for a nil uri
      def has_encoded_rtlo?(uri)
        return false unless uri

        uri.to_s.upcase.include?(ENCODED_RTLO)
      end
    end
  end
end